In [71]:
import pandas as pd
import numpy as np

## Pre-processing

In [72]:
student_data = pd.read_csv("astudentData.csv")
student_data.head(n=5)

Unnamed: 0,question_id,user_id,correct
0,13,63994,0
1,13,7633,0
2,13,33056,0
3,13,3714,1
4,13,17535,1


In [73]:
student_data = student_data.reset_index()
student_data = student_data.drop('index', 1)
student_data = student_data.drop_duplicates()

In [74]:
student_data_matrix = student_data.pivot_table(index = "user_id", columns = "question_id",values="correct")
student_data_matrix.head(n=5)

question_id,13,21,26,27,28,37,44,45,53,54,...,15065,15071,15142,15147,15403,15412,15413,15415,15865,15872
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34,,,,,,,,,,1.0,...,,,,,,,,,,
324,,,,1.0,1.0,,,,1.0,,...,,,,,,,,,,
346,,,,,,,,,,,...,,,,,,,,,,
350,,,1.0,0.0,,,,,1.0,1.0,...,,,,,,,,,,
363,,,,,,0.0,,,,,...,,,,,,,,,,


### Relative Ability estimate

#### User Mean calculation

In [75]:
User_total_score = student_data_matrix.sum(axis=1,skipna=True)
User_nresp = student_data_matrix.count(axis=1)
User_mean = User_total_score/User_nresp
User_mean.head(n=5)

user_id
34     0.811321
324    0.736842
346    0.222222
350    0.489583
363    0.409091
dtype: float64

#### Item Mean Calculation

In [76]:
Item_nresp = student_data_matrix.count(axis=0)
Item_score = student_data_matrix.sum(axis=0,skipna=True)
Item_mean = Item_score / Item_nresp
Item_mean.head(n=5)

question_id
13    0.673749
21    0.693931
26    0.692102
27    0.600000
28    0.749750
dtype: float64

#### User Relative Ability

In [77]:
def colRelative(x):
    ser = x.notnull()
    col = student_data_matrix.columns
    Item_names = col[ser]
    Obs_mean = sum(Item_mean[Item_names])/len(Item_mean[Item_names])
    return Obs_mean

In [78]:
relative_perf = student_data_matrix.apply(colRelative,axis=1)
relative_perf.head(n=5)

user_id
34     0.660103
324    0.650735
346    0.687002
350    0.656134
363    0.661626
dtype: float64

In [79]:
User_weight=User_mean/relative_perf
User_weight.head(n=5)

user_id
34     1.229083
324    1.132323
346    0.323466
350    0.746164
363    0.618312
dtype: float64

In [80]:
student_data_sample = student_data_matrix.copy()
student_data_sample.head(n=5)

question_id,13,21,26,27,28,37,44,45,53,54,...,15065,15071,15142,15147,15403,15412,15413,15415,15865,15872
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34,,,,,,,,,,1.0,...,,,,,,,,,,
324,,,,1.0,1.0,,,,1.0,,...,,,,,,,,,,
346,,,,,,,,,,,...,,,,,,,,,,
350,,,1.0,0.0,,,,,1.0,1.0,...,,,,,,,,,,
363,,,,,,0.0,,,,,...,,,,,,,,,,


In [81]:
misclass = 0
total_val = 0
for i in student_data_sample.columns:
    output_data = User_weight*Item_mean[i]
    output_data[output_data>=0.56]=1
    output_data[output_data<0.56]=0
    student_data_sample.loc[student_data_sample[i].isnull(),i]=output_data[student_data_sample[i].isnull()]
    predicted_labels = output_data[student_data_matrix.loc[:,i].notnull()]
    actual_labels = student_data_matrix.loc[student_data_matrix.loc[:,i].notnull(),i]
    diff=predicted_labels-actual_labels
    misclass = misclass + len(diff[diff!=0])
    total_val = total_val + len(actual_labels)

##### Accuracy on how well the information is captured through Relative Ranking estimate

In [82]:
accuracy_rel_ability = 1-(float(misclass)/float(total_val))
accuracy_rel_ability

0.7516631986166877

## Imputation (Collaborative Filtering + Relative Ability estimate)

### Collaborative Filtering

In [83]:
user_item_matrix = student_data.pivot_table(index = "question_id", columns = "user_id",values="correct")
user_item_matrix.head(n=5)

user_id,34,324,346,350,363,364,372,377,378,379,...,118131,118133,118135,118139,118140,118144,118156,118165,118255,118275
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,0.0,,,,,,,,,
26,,,,1.0,,,,,,,...,,,,,,,,,,
27,,1.0,,0.0,,,,,0.0,,...,,,,,,,,,,
28,,1.0,,,,,,,,1.0,...,,,,,,,,,,


In [84]:
corr_matrix=user_item_matrix.corr(method="pearson",min_periods=13)

In [85]:
for col in user_item_matrix.columns:
    sim_user = corr_matrix.loc[:,col]
    sim_user = sim_user[sim_user.notnull()].sort_values(ascending=False)
    sim_user = sim_user[sim_user>0.8].index
    peer_response = user_item_matrix.loc[user_item_matrix.loc[:,col].isnull(),sim_user]
    peer_response = peer_response.dropna(axis=0,how="all")
    predicted_labels = peer_response.sum(axis=1)/(sim_user.shape[0]-1)
    predicted_labels[predicted_labels>=0.5] = 1
    predicted_labels[predicted_labels<0.5] = 0
    user_item_matrix.loc[predicted_labels.index,col] = predicted_labels

In [86]:
student_data_matrix = user_item_matrix.transpose()
student_data_matrix.head(n=5)

question_id,13,21,26,27,28,37,44,45,53,54,...,15065,15071,15142,15147,15403,15412,15413,15415,15865,15872
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,1.0,,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
346,,,,,,,,,,,...,,,,,,,,,,
350,,,1.0,0.0,,,,,1.0,1.0,...,,,,,,,,,,
363,,,,,0.0,0.0,,0.0,,,...,,,,,,,,,,


### Relative Ability estimate followed by Collabirative Filtering

#### User Mean calculation

In [87]:
User_total_score = student_data_matrix.sum(axis=1,skipna=True)
User_nresp = student_data_matrix.count(axis=1)
User_mean = User_total_score/User_nresp
User_mean.head(n=5)

user_id
34     0.152542
324    0.451613
346    0.222222
350    0.518987
363    0.421687
dtype: float64

#### Item Mean Calculation

In [88]:
Item_nresp = student_data_matrix.count(axis=0)
Item_score = student_data_matrix.sum(axis=0,skipna=True)
Item_mean = Item_score / Item_nresp
Item_mean.head(n=5)

question_id
13    0.251149
21    0.122569
26    0.214137
27    0.280702
28    0.303502
dtype: float64

#### User Relative Ability

In [89]:
def colRelative(x):
    ser = x.notnull()
    col = student_data_matrix.columns
    Item_names = col[ser]
    Obs_mean = sum(Item_mean[Item_names])/len(Item_mean[Item_names])
    return Obs_mean

In [90]:
relative_perf = student_data_matrix.apply(colRelative,axis=1)
relative_perf.head(n=5)

user_id
34     0.270593
324    0.267377
346    0.274546
350    0.299330
363    0.283552
dtype: float64

In [91]:
User_weight=User_mean/relative_perf
User_weight.head(n=5)

user_id
34     0.563734
324    1.689049
346    0.809417
350    1.733828
363    1.487161
dtype: float64

#### Imputation Calculation

In [92]:
student_data_sample = student_data_matrix.copy()
student_data_sample.head(n=5)

question_id,13,21,26,27,28,37,44,45,53,54,...,15065,15071,15142,15147,15403,15412,15413,15415,15865,15872
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324,1.0,,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
346,,,,,,,,,,,...,,,,,,,,,,
350,,,1.0,0.0,,,,,1.0,1.0,...,,,,,,,,,,
363,,,,,0.0,0.0,,0.0,,,...,,,,,,,,,,


In [93]:
misclass = 0
total_val = 0
for i in student_data_sample.columns:
    output_data = User_weight*Item_mean[i]
    output_data[output_data>=0.56]=1
    output_data[output_data<0.56]=0
    student_data_sample.loc[student_data_sample[i].isnull(),i]=output_data[student_data_sample[i].isnull()]
    predicted_labels = output_data[student_data_matrix.loc[:,i].notnull()]
    actual_labels = student_data_matrix.loc[student_data_matrix.loc[:,i].notnull(),i]
    diff=predicted_labels-actual_labels
    misclass = misclass + len(diff[diff!=0])
    total_val = total_val + len(actual_labels)
    

##### Accuracy on how well the information is captured through  Collaborative Filtering and Relative Ranking estimate combined

In [94]:
accuracy_collab_relative = 1-(float(misclass)/float(total_val))
accuracy_collab_relative

0.7867545977781609

### Accuracy Comparison between the two methods of Imputation

In [100]:
print "Accuracy(RelAbility Estimate):" + str(accuracy_rel_ability)
print "Accuracy(Collaborative + RelAbility Estimate):" + str(accuracy_collab_relative)

Accuracy(RelAbility Estimate):0.751663198617
Accuracy(Collaborative + RelAbility Estimate):0.786754597778


### Item Discrimination Estimate

In [101]:
predicted_test_result = student_data_sample.sum(axis=1)
predicted_test_result.head(n=5)

user_id
34      45.0
324    120.0
346      4.0
350    111.0
363     66.0
dtype: float64

In [102]:
def findCorr(x):
    return x.corr(predicted_test_result)

In [103]:
Item_discrm = student_data_sample.apply(findCorr,axis=0)
Item_sort_list = Item_discrm.sort_values()
Item_sort_list.head(n=5)

question_id
1232    -0.114625
5566    -0.001442
12717    0.000063
692      0.001211
12665    0.003389
dtype: float64

In [104]:
Item_discrm.to_csv("Discrm_Coll_RelAbility.csv")