-
Notifications
You must be signed in to change notification settings - Fork 0
/
input_csv.py
193 lines (175 loc) · 8.57 KB
/
input_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_input_csv.ipynb.
# %% auto 0
__all__ = ['import_csv', 'get_all_verbs', 'get_all_actors', 'get_all_objects', 'remove_whitespaces', 'to_lowercase',
'remove_actors', 'remove_verbs', 'count_interactions', 'create_barplot', 'subset_actor_verb', 'split_column',
'average_interactions']
# %% ../nbs/01_input_csv.ipynb 4
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from typing import Set, List, Union
from pathlib import Path
from datetime import datetime
from fastcore.test import *
# %% ../nbs/01_input_csv.ipynb 10
import ast
def import_csv(csv_file: Union[str, Path], # Filename of the csv with the data
               index_col: int = 0, # The index column
               delimiter: str = ',', # the column delimiter
               quotechar: str = '"' # Quoting char. Ignore delimiter between this character
              ) -> pd.DataFrame: # The imported dataframe with all the xAPI statements
    """
    Reads a csv file and performs some processing to make the data easier to read as well as
    easier to process afterwards. Returns a pandas DataFrame.

    If the file does not exist, a message is printed and an empty DataFrame is returned.
    """
    def to_dict(x):
        # Parse a Python-literal string (e.g. "{'en-US': 'Completed'}") into a dict.
        # Anything that is not a valid literal, or not a dict, maps to None.
        try:
            y = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return None
        return y if isinstance(y, dict) else None
    def to_datetime(x):
        # Parse an ISO-8601 timestamp with microseconds and timezone offset.
        # BUG FIX: the original called strptime on the undefined name `timestamp_str`;
        # the resulting NameError was swallowed by a bare `except`, so timestamps
        # were silently left as raw strings.
        try:
            return datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f%z")
        except (ValueError, TypeError):
            return x  # keep the raw value when it does not match the expected format
    def extract_value(x):
        # Pull the English display value out of an xAPI language map, if present.
        if x is None:
            return x
        if 'en-us' in x:
            return x['en-us']
        elif 'en-US' in x:
            return x['en-US']
        else:
            return x
    if not Path(csv_file).exists():
        print("The specified file does not exist. Creating an empty DataFrame...")
        return pd.DataFrame()
    df = pd.read_csv(csv_file, index_col=index_col, delimiter=delimiter, quotechar=quotechar,
                     converters={'timestamp': to_datetime, 'verb display': to_dict, 'object name': to_dict})
    df = df.reset_index(drop=True)
    # Normalise column names coming from different LRS exports.
    if 'actor' not in df.columns and 'actor name' in df.columns:
        df.rename(columns={"actor name": "actor"}, inplace=True)
    if 'verb' not in df.columns and 'verb display' in df.columns:
        df['verb'] = df['verb display'].map(extract_value)
        df.drop(columns=['verb display'], inplace=True)
    if 'object' not in df.columns and 'object name' in df.columns:
        df['object'] = df['object name'].map(extract_value)
        df.drop(columns=['object name'], inplace=True)
    # Identifier columns are not needed for the analysis; ignore them when absent.
    df.drop(columns=['lrs_id', 'verb id', 'object id'], inplace=True, errors='ignore')
    return df
# %% ../nbs/01_input_csv.ipynb 20
def get_all_verbs(df: pd.DataFrame # The dataset containing the xAPI statements (one statement per row)
                 ) -> Set: # Set containing all the verbs occurring in the dataset
    """
    Collects every distinct verb appearing in the dataset into a set.
    """
    return {verb for verb in df["verb"]}
# %% ../nbs/01_input_csv.ipynb 23
def get_all_actors(df: pd.DataFrame # The dataset containing the xAPI statements (one statement per row)
                  ) -> Set: # Set containing all the actors occurring in the dataset
    """
    Collects every distinct actor appearing in the dataset into a set.
    """
    return set(df["actor"].tolist())
# %% ../nbs/01_input_csv.ipynb 25
def get_all_objects(df: pd.DataFrame # The dataset containing the xAPI statements (one statement per row)
                   ) -> Set: # Set containing all the objects occurring in the dataset
    """
    Collects every distinct object appearing in the dataset into a set.
    """
    distinct = df["object"].unique()
    return set(distinct)
# %% ../nbs/01_input_csv.ipynb 27
def remove_whitespaces(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                       cols: List # the columns on which whitespaces should be removed
                      ) -> pd.DataFrame: # The dataframe after applying the function
    """
    Strips every space character from the string values of the given columns.
    Note: the columns are overwritten in the dataframe that is passed in.
    """
    cleaned = df[cols].apply(lambda column: column.str.replace(" ", ""))
    df[cols] = cleaned
    return df
# %% ../nbs/01_input_csv.ipynb 28
def to_lowercase(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                 cols: List # the columns whose content should be made lowercase
                ) -> pd.DataFrame: # The dataframe after applying the function
    """
    Converts to lowercase the elements in the specified columns.
    The function only applies to columns whose values are of type *str*;
    non-string values are left untouched.
    """
    # DataFrame.applymap is deprecated since pandas 2.1; applying Series.map
    # column by column is equivalent and works on every pandas version.
    df[cols] = df[cols].apply(lambda col: col.map(lambda s: s.lower() if isinstance(s, str) else s))
    return df
# %% ../nbs/01_input_csv.ipynb 31
def remove_actors(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                  cols: List # the list of actors to remove
                 ) -> pd.DataFrame: # The dataframe with the specified actors removed
    """
    Drops every row whose actor appears in the given list and returns the result.
    """
    keep_mask = ~df['actor'].isin(cols)
    return df[keep_mask]
# %% ../nbs/01_input_csv.ipynb 33
def remove_verbs(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                 cols: List # the list of verbs to remove
                ) -> pd.DataFrame: # The dataframe with the specified verbs removed
    """
    Removes from the dataframe all the rows whose verb is in the specified list.
    (Docstring fixed: it previously said "actor", copy-pasted from `remove_actors`.)
    """
    return df[~df['verb'].isin(cols)]
# %% ../nbs/01_input_csv.ipynb 38
def count_interactions(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                      ) -> pd.DataFrame: # A dataframe with the number of interactions of each actor
    """
    Builds a dataframe with the total number of statements recorded for each
    actor, sorted in ascending order of that count.
    """
    counts = (
        df.groupby(['actor'])["verb"]
          .agg(['count'])
          .sort_values("count")
          .reset_index()
    )
    return counts
# %% ../nbs/01_input_csv.ipynb 41
def create_barplot(df: pd.DataFrame, # The input dataset
                   x: str, # the column with the numerical variable to be plotted
                   y: str, # the column with the name associated to each value
                   cmap: str = 'flare' # the color palette to be used
                  ):
    """
    Draws a horizontal barplot of the dataframe content with seaborn,
    using `x` as the numeric axis and `y` as the category labels.
    """
    sns.barplot(data=df, x=x, y=y, palette=cmap)
# %% ../nbs/01_input_csv.ipynb 44
def subset_actor_verb(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                      actor: str, # The actor we are interested in
                      verb: str # The verb we are interested in
                     ) -> pd.DataFrame: # A dataframe containing only the statements with a specific actor and verb
    """
    Selects only the statements that match both the given actor and the given verb.
    """
    actor_match = df["actor"] == actor
    verb_match = df["verb"] == verb
    return df[actor_match & verb_match]
# %% ../nbs/01_input_csv.ipynb 47
def split_column(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                 col: str, # The column in the dataset that should be split into multiple columns
                 col_names: List, # The names of the columns created after split
                 sep: str = ';', # The separator between fields inside the column we want to split
                ) -> pd.DataFrame: # A dataframe with the content of *col* split into several columns
    """
    Splits the column of the DataFrame into multiple columns and returns a new DataFrame.

    If the number of names in `col_names` does not match the number of columns
    produced by the split, a message is printed and an empty DataFrame is returned.
    (Typos fixed in the docments: "fiels" -> "fields", "cplit" -> "split".)
    """
    new_df = df[col].str.split(sep, expand=True)
    # Guard clause: refuse to rename when the name list does not line up.
    if len(new_df.columns) != len(col_names):
        print("The length of col_names should match the number of generated columns")
        return pd.DataFrame()
    new_df.columns = col_names
    return new_df
# %% ../nbs/01_input_csv.ipynb 49
def average_interactions(df: pd.DataFrame, # The dataset containing the xAPI statements (one statement per row)
                         avg_col: str, # The column on which to compute average
                         user_col: str = 'actor' # The column to groupby (usually **actor**)
                        ) -> pd.DataFrame: # A new dataframe with the average of the interaction per specific value
    """
    Companion to `count_interactions`: groups the statements by `user_col` and
    averages `avg_col` within each group, sorted by the resulting mean.
    """
    per_user = df.groupby(user_col, as_index=False)[avg_col]
    averaged = per_user.mean()
    return averaged.sort_values(avg_col)