In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_distances, euclidean_distances
%matplotlib inline

## Loading data

In [2]:
# Read the three input tables. The answer log and the exercise lookup use
# their first column as index; the chapter table keeps the default row index.
data = pd.read_csv('20171121_data.csv', index_col=0)
exercise = pd.read_csv('exerciseId.tsv', sep='\t', index_col=0)
chapter = pd.read_csv('20171122_chapter.tsv', sep='\t')

# Attach a subject to every answer by looking its chapter_id up in `chapter`.
# NOTE(review): `chapter` is read without index_col, so this lookup matches
# chapter_id against the default 0..n-1 row index -- confirm this is intended.
subject_by_chapter = chapter['subject_id']
data['subject_id'] = data['chapter_id'].map(subject_by_chapter)

# Parse both timestamps, then derive the time spent on each answer in seconds.
data['beginningDate'] = pd.to_datetime(data['beginningDate'])
data['endDate'] = pd.to_datetime(data['endDate'])
data['spentTime'] = (data['endDate'] - data['beginningDate']).dt.total_seconds()

  mask |= (ar1 == a)


In [3]:
# Attach the exercise lookup to every answer row (rows are matched on the index).
df = data.merge(exercise, how='left', left_index=True, right_index=True)

# Keep only rows whose user_id and exercise_id are both finite
# (WARNING: some user_id values are missing), then drop fully duplicated rows.
has_user = np.isfinite(df['user_id'])
has_exercise = np.isfinite(df['exercise_id'])
df = df[has_user & has_exercise].drop_duplicates()

# Keep one row per `id`: rows sharing an index value are reduced to the first
# one (the original note on this step read "problem removed").
dff = (
    df.reset_index()
      .drop_duplicates('id')
      .set_index('id')
)

# Encode the question type as an integer code in a new `type_ae` column.
le = preprocessing.LabelEncoder()
dff['type_ae'] = le.fit_transform(dff['type'])

In [4]:
# General information: size of the user x exercise space.
# nunique(dropna=False) counts exactly the values returned by unique().
n_users = dff.user_id.nunique(dropna=False)
n_exercises = dff.exercise_id.nunique(dropna=False)
print('nb user %s' % "{:,}".format(n_users))
print('nb exercise %s' % "{:,}".format(n_exercises))
print('nb users x nb exercises %s' % "{:,}".format(n_users * n_exercises))

nb user 24,966
nb exercise 15,372
nb users x nb exercises 383,777,352


## Adjacency matrix

### Graph nodes

In [5]:
## HYPOTHESIS: we keep only a student's first answer to each question
## (if they answered it several times)

In [6]:
# Node features: one row per exercise, aggregated over a single answer per
# (user, exercise) pair.
# NOTE(review): keep='first' keeps the first row in the current row order; no
# chronological sort is applied here -- confirm that order is by answer date.
first_answers = dff.drop_duplicates(['user_id', 'exercise_id'], keep='first')

# user_id is counted (number of users who answered); every other feature is averaged.
node_aggregations = {
    'user_id': 'count',
    'difficulty': 'mean',
    'valid': 'mean',
    'spentTime': 'mean',
    'skip': 'mean',
    'topic_id': 'mean',
    'type_ae': 'mean',
}
graph = first_answers.groupby(['exercise_id']).agg(node_aggregations)

# Drop exercises with a missing value in any feature
# (NA on difficulty: 15264 non-null for 15372 exercise_id).
g = graph.dropna()

# Exercise ids that survive the NA filter; reused to filter the answer history.
saved_ex_id = list(g.index)

### History matrix

In [7]:
# Question history in long format: one row per (exercise_id, user_id) with the
# `valid` value of the single answer kept for that pair.
hist = (
    dff.drop_duplicates(['user_id', 'exercise_id'], keep='first')
       .groupby(['exercise_id', 'user_id'])['valid']
       .mean()
       .to_frame()
       .reset_index()
)

# Restrict to the exercises kept as graph nodes (no missing feature).
hist = hist[hist.exercise_id.isin(saved_ex_id)]

# History matrix: exercises as rows, users as columns.
# Answered questions hold the `valid` value cast to int; unanswered ones hold -100.
h = (
    hist.pivot(index='exercise_id', columns='user_id', values='valid')
        .fillna(-100)
        .astype(int)
)

In [8]:
# Preview a random 5-row slice of the history matrix over up to 10 random users.
# replace=False keeps the sampled columns distinct (np.random.choice samples
# with replacement by default, which could show the same user twice); the size
# is capped so the draw stays valid when fewer than 10 users are present.
h.sample(5)[np.random.choice(h.columns, min(10, len(h.columns)), replace=False)]

user_id,105776.0,148457.0,156926.0,141461.0,160793.0,148214.0,200029.0,158283.0,183213.0,173410.0
exercise_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8472,-100,-100,-100,-100,-100,-100,-100,-100,-100,-100
41664,-100,-100,-100,-100,-100,-100,-100,-100,-100,-100
9385,-100,-100,-100,-100,-100,-100,-100,-100,-100,-100
8048,-100,-100,-100,-100,-100,-100,-100,-100,-100,-100
40061,-100,-100,-100,-100,-100,-100,1,-100,-100,-100


### Graph edges

In [9]:
# Pairwise distance between exercises, built from two parts that are summed.

# Part 1 -- time: Canberra distance on the mean spent time,
# i.e. sum over features of |a - b| / (|a| + |b|), here with a single feature.
g_time = g[['spentTime']]
dist_time = pairwise_distances(X=g_time, metric='canberra')

# Part 2 -- the five non-time features, each rescaled to [0, 1] with min-max
# scaling and compared with the L1 norm.
# NOTE(review): type_ae is a label-encoded code and topic_id is presumably an
# identifier; scaling + L1 treats both as ordered numeric values -- confirm.
g_ = g.loc[:, ['skip','valid','type_ae','difficulty','topic_id']]
x = g_.values  # plain numpy array, as expected by the scaler
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
g2 = pd.DataFrame(x_scaled)
dist_notime = pairwise_distances(X=g2, metric='l1')

# Total distance over the 6 features.
dist = dist_time + dist_notime

In [10]:
# "Credibility" of an exercise: how many users answered it (g.user_id is a
# count), split into three tiers by the 25% and 75% quantiles.
q25 = g.user_id.quantile(q=0.25)
q75 = g.user_id.quantile(q=0.75)
is_low = g.user_id < q25
is_mid = (g.user_id >= q25) & (g.user_id < q75)
is_high = g.user_id >= q75
ex_slice1 = list(g.index[is_low.values])
ex_slice2 = list(g.index[is_mid.values])
ex_slice3 = list(g.index[is_high.values])

# Replace each answer count by its tier number (1, 2 or 3).
g_user = g[['user_id']].copy()
g_user.loc[is_low.values, 'user_id'] = 1
g_user.loc[is_mid.values, 'user_id'] = 2
g_user.loc[is_high.values, 'user_id'] = 3

# Tier gap between two exercises (0, 1 or 2) turned into a weight:
# same tier -> 1, one tier apart -> 0.9, two tiers apart -> 0.8.
k = pairwise_distances(g_user, metric='l1')
credi = pd.DataFrame(1 - k/10, index=g.index, columns=g.index)

In [11]:
# Similarity = 1 - normalized distance (the distance is divided by 6).
sim = 1 - dist/6

# Label rows and columns with the exercise ids of the graph nodes.
W = pd.DataFrame(sim, index=g.index, columns=g.index)

## SIMILARITY MATRIX
# Element-wise weighting of the similarity by the credibility matrix.
W = W * credi

In [12]:
# Preview the similarity matrix on up to 5 randomly chosen exercises.
# replace=False keeps the labels distinct: np.random.choice samples with
# replacement by default, and a repeated label would duplicate the matching
# rows and columns in the preview. The size is capped for small matrices.
idx = np.random.choice(W.columns, min(5, len(W.columns)), replace=False)
W.loc[idx, idx]

exercise_id,6611,1344,10400,13228,7566
exercise_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6611,1.0,0.679534,0.651781,0.716909,0.88595
1344,0.679534,1.0,0.770108,0.515745,0.58267
10400,0.651781,0.770108,1.0,0.706212,0.593423
13228,0.716909,0.515745,0.706212,1.0,0.671095
7566,0.88595,0.58267,0.593423,0.671095,1.0


### Saving matrices

In [13]:
# Persist the weighted similarity (adjacency) matrix.
pd.to_pickle(W, 'adjacency.pkl')
# Persist the answer history.
# NOTE(review): `hist` is the long-format (exercise_id, user_id, valid) table,
# not the pivoted matrix `h` -- confirm which one downstream code expects.
pd.to_pickle(hist, 'hist.pkl')