In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances
%matplotlib inline

## Loading data

In [2]:
# Load the raw answer log and the lookup tables, then derive per-row fields.
data = pd.read_csv('20171121_data.csv', index_col=0)
exercise = pd.read_csv('exerciseId.tsv', sep='\t', index_col=0)
chapter = pd.read_csv('20171122_chapter.tsv', sep='\t')

# Series.map(Series) looks every value up in the *index* of the mapper.
# `chapter` is read without index_col, so its index is only the row position;
# key the lookup by the chapter id so chapter_id -> subject_id is a real join.
# NOTE(review): assumes the chapter table exposes its key as an 'id' column
# (the subject table is read with one) -- confirm. When that column is absent
# the previous positional lookup is kept unchanged.
if 'id' in chapter.columns:
    chapter_to_subject = chapter.drop_duplicates('id').set_index('id')['subject_id']
else:
    chapter_to_subject = chapter['subject_id']
data['subject_id'] = data['chapter_id'].map(chapter_to_subject)

# Parse the timestamps and compute the time spent on each answer, in seconds.
data['beginningDate'] = pd.to_datetime(data['beginningDate'])
data['endDate'] = pd.to_datetime(data['endDate'])
data['spentTime'] = (data['endDate'] - data['beginningDate']).dt.total_seconds()

  mask |= (ar1 == a)


In [3]:
# Subject lookup table, reduced to its identifier and full title columns.
subject = pd.read_csv('20171122_subject.tsv', sep='\t').loc[:, ['id', 'fullTitle']]

In [4]:
# Attach exercise_id to every answer row (left join on the shared index).
merged = data.merge(exercise, how='left', left_index=True, right_index=True)

# Keep rows where both identifiers are present, then drop fully identical rows.
# WARNING: some rows have a missing user_id and are discarded here.
has_both_ids = np.isfinite(merged['user_id']) & np.isfinite(merged['exercise_id'])
df = merged[has_both_ids].drop_duplicates()

# Several rows can still share the same 'id' index value; keep the first of each.
dff = df.reset_index()
dff = dff.drop_duplicates('id')
dff = dff.set_index('id')

# Integer-encode the question type.
le = preprocessing.LabelEncoder()
dff['type_ae'] = le.fit_transform(dff['type'])

In [5]:
# General information: distinct users, distinct exercises, and the size of
# the full user x exercise grid.
n_users = dff['user_id'].nunique(dropna=False)
n_exercises = dff['exercise_id'].nunique(dropna=False)
print('nb user %s' % "{:,}".format(n_users))
print('nb exercise %s' % "{:,}".format(n_exercises))
print('nb users x nb exercises %s' % "{:,}".format(n_users * n_exercises))

nb user 24,966
nb exercise 15,372
nb users x nb exercises 383,777,352


## Adjacency matrix

### Graph nodes

In [6]:
## HYPOTHESIS: we keep only a student's first answer to each question
## (when the student has answered it several times)

In [7]:
# One row per (student, exercise): only the first row in `dff` order survives.
# NOTE(review): keep='first' follows row order, not beginningDate -- confirm
# that dff is in chronological order if "first answer" must mean earliest.
first_answers = dff.drop_duplicates(['user_id', 'exercise_id'], keep='first')

# Per-exercise node features: number of answers plus the mean of each attribute.
node_aggregations = {
    'user_id': 'count',
    'difficulty': 'mean',
    'valid': 'mean',
    'spentTime': 'mean',
    'skip': 'mean',
    'topic_id': 'mean',
    'type_ae': 'mean',
}
graph = first_answers.groupby(['exercise_id']).agg(node_aggregations)

# Drop exercises with a missing feature; the missing values are on difficulty
# (15264 non-null out of 15372 exercise_id).
g = graph.dropna()

# Exercise ids that remain as graph nodes.
saved_ex_id = list(g.index)

In [8]:
# Summary statistics of the per-exercise node features (sanity check).
g.describe()

Unnamed: 0,skip,user_id,valid,topic_id,difficulty,type_ae,spentTime
count,15264.0,15264.0,15264.0,15264.0,15264.0,15264.0,15264.0
mean,0.057552,331.52103,0.461161,523.538915,0.491981,2.322655,154.8057
std,0.096089,469.535926,0.261242,545.583299,0.227834,1.138739,12820.09
min,0.0,1.0,0.0,1.0,0.2,0.0,-45.0
25%,0.008929,34.0,0.252393,139.0,0.2,2.0,20.07669
50%,0.025571,166.0,0.461538,298.0,0.5,3.0,31.58426
75%,0.066896,430.0,0.666667,747.0,0.8,3.0,52.28206
max,1.0,4555.0,1.0,1861.0,0.8,4.0,1582158.0


### History matrix

In [9]:
# Question history: the `valid` value of the first row kept for every
# (exercise, student) pair.
first_attempts = dff.drop_duplicates(['user_id', 'exercise_id'], keep='first')
valid_by_pair = first_attempts.groupby(['exercise_id', 'user_id'])['valid'].mean()
hist = valid_by_pair.reset_index()

# Restrict the history to the exercises kept as graph nodes
# (those with a non-null difficulty).
hist = hist.loc[hist['exercise_id'].isin(saved_ex_id)]

In [None]:
# Exercise x student matrix of answered questions (valid / not valid).
# Pairs with no answer are marked with the sentinel -100 before the cast to int.
h = (
    hist.pivot(index='exercise_id', columns='user_id', values='valid')
        .fillna(-100)
        .astype(int)
)

### Graph edges

In [None]:
# Distance on spentTime alone: Canberra, i.e. |a - b| / (|a| + |b|) for this
# single column.
g_time = g[['spentTime']]
dist_time = pairwise_distances(g_time, metric='canberra')

# Distance on the other five features: min-max scale each column, then take
# the L1 norm of the difference.
g_ = g[['skip', 'valid', 'type_ae', 'difficulty', 'topic_id']]
min_max_scaler = preprocessing.MinMaxScaler()
x = g_.values  # plain numpy array for the scaler
x_scaled = min_max_scaler.fit_transform(x)
g2 = pd.DataFrame(x_scaled)
dist_notime = pairwise_distances(g2, metric='l1')

# Total distance over all six features.
dist = dist_time + dist_notime

In [None]:
# "Credibility" of an exercise: how many times the question has been answered,
# bucketed by the quartiles of that count.
answer_counts = g['user_id']
q25 = answer_counts.quantile(q=0.25)
q75 = answer_counts.quantile(q=0.75)

is_low = (answer_counts < q25).values
is_mid = ((answer_counts >= q25) & (answer_counts < q75)).values
is_high = (answer_counts >= q75).values
ex_slice1 = list(g.index[is_low])
ex_slice2 = list(g.index[is_mid])
ex_slice3 = list(g.index[is_high])

# Three tiers: 1 = below q25, 2 = between q25 and q75, 3 = q75 and above.
tiers = np.select([is_low, is_mid, is_high], [1, 2, 3], default=answer_counts.values)
g_user = pd.DataFrame({'user_id': tiers}, index=g.index)

# Tier distance between two exercises is 0, 1 or 2, which gives a credibility
# weight of 1, 0.9 or 0.8.
k = pairwise_distances(g_user, metric='l1')
credi = pd.DataFrame(1 - k/10, index=g.index, columns=g.index)

In [None]:
# Similarity = 1 - normalized distance (the summed distance is divided by 6,
# one unit per feature).
sim = 1 - dist/6

## SIMILARITY MATRIX: labelled by exercise id and weighted element-wise by the
## credibility matrix.
W = pd.DataFrame(sim, index=g.index, columns=g.index).mul(credi)

In [None]:
# Preview a small random sub-block of the similarity matrix.
# np.random.choice samples WITH replacement by default, so the same exercise id
# could be drawn twice; draw distinct ids from a seeded generator so that the
# preview has no repeated rows/columns and is reproducible on re-run.
sample_size = min(5, len(W.columns))
idx = np.random.RandomState(0).choice(W.columns, sample_size, replace=False)
W.loc[idx, idx]

### Focusing on only one subject

In [None]:
# Exercises that appear in subject 1.
list_ex_hist_3e = dff[dff['subject_id']==1]['exercise_id'].unique()

# Select the SAME ordered id set on both axes. Filtering rows with isin keeps
# W's order while indexing columns by the raw list follows order of appearance
# in dff, which would leave row i and column i pointing at different exercises.
# The raw list can also contain ids that are not nodes of W (exercises dropped
# for missing features), which cannot be selected as columns.
ex_small = W.index[W.index.isin(list_ex_hist_3e)]
W_small = W.loc[ex_small, ex_small]

In [None]:
# History rows for the exercises of the selected subject.
in_subject = h.index.isin(list_ex_hist_3e)
h_small = h.loc[in_subject]

# Keep only the students who answered at least one of those exercises
# (-100 marks an unanswered question).
has_answer = (h_small != -100).any(axis=0)
h_small = h_small.loc[:, has_answer]
print(W_small.shape, h_small.shape)

### Saving matrices

In [None]:
# Persist the full similarity (adjacency) matrix as a pickle and the full
# history matrix as an HDF file under the key 'hist'.
W.to_pickle('adjacency.pkl')
h.to_hdf('history.hdf', key = 'hist')

In [None]:
# Persist the single-subject matrices the same way: adjacency as a pickle,
# history as an HDF file under the key 'hist'.
W_small.to_pickle('adjacency_small.pkl')
h_small.to_hdf('history_small.hdf', key = 'hist')

### For kegra

In [22]:
# Build the two arrays for kegra, mirroring the cora.content / cora.cites pair.
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0; `.values`
# returns the same numpy array and is already used elsewhere in this notebook.

# Node feature table, one row per exercise (cora.content).
node = g[['topic_id','spentTime','skip','user_id','difficulty','type_ae']].values

# (exercise_id, user_id) pairs taken from the answer history (cora.cites).
link = hist[['exercise_id','user_id']].values

node.shape, link.shape

((15264, 6), (5060337, 2))

In [None]:
#student = 142954.0