In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the MOOCCube user records (one JSON object per line) and keep only the
# users whose course_order holds at least MIN_COURSES entries.
MIN_COURSES = 5
TRAIN_FRACTION = 0.75

user_df = pd.read_json("../data/MOOCCube/entities/user.json", lines=True)
has_enough_courses = user_df['course_order'].apply(lambda order: len(order) >= MIN_COURSES)
user_df = user_df[has_enough_courses]

# Per-user split: the leading TRAIN_FRACTION of each course_order list (in its
# stored order) becomes training rows, the remainder becomes test rows.
# Every row is an implicit-feedback record with view = 1.
data_train = []
data_test = []
for user_id, courses in zip(user_df["id"], user_df["course_order"]):
    split_at = int(len(courses) * TRAIN_FRACTION)
    data_train.extend(
        {"user_id": user_id, "course_id": course, "view": 1}
        for course in courses[:split_at]
    )
    data_test.extend(
        {"user_id": user_id, "course_id": course, "view": 1}
        for course in courses[split_at:]
    )

In [3]:
# (rows, columns) of user_df after the minimum-course filter applied above.
user_df.shape

(34917, 4)

In [4]:
# Turn the two lists of {"user_id", "course_id", "view"} records into
# long-format DataFrames (one row per user/course interaction).
df_train, df_test = (pd.DataFrame(records) for records in (data_train, data_test))

In [5]:
# (rows, columns) of the training interactions table.
df_train.shape

(187699, 3)

In [6]:
# Reshape the training interactions into a user x course table: one row per
# user_id, one column per course_id, cell value taken from 'view'.
# User/course pairs that never occur in df_train come out as NaN.
matrix = pd.pivot_table(
    df_train,
    index='user_id',
    columns='course_id',
    values='view',
)
matrix.head()

course_id,C_course-v1:ACCA+FA1_X+2019_T1,C_course-v1:ACCA+FA1_X+sp,C_course-v1:ACCA+FA1_X_en+2019_T1,C_course-v1:ACCA+FA2_X+2019_T1,C_course-v1:ACCA+FA2_X+sp,C_course-v1:ACCA+FA2_X_en+2019_T1,C_course-v1:ACCA+MA1_X+2019_T1,C_course-v1:ACCA+MA1_X+sp,C_course-v1:ACCA+MA1_X_en+2019_T1,C_course-v1:ACCA+MA2_X+2019_T1,...,C_course-v1:nxu+2018122711+2019_T1,C_course-v1:nxu+2018122712+2019_T1,C_course-v1:nxu+2018122713+2019_T1,C_course-v1:qdu+2018122608X+2018_T2,C_course-v1:qhnu+20181212x+2018_T2,C_course-v1:qhnu+20181212x+2019_T1,C_course-v1:rcoe+mooc103+2019_T1,C_course-v1:shsmu+shsmu001+2019_T1,C_course-v1:snnu+20180920X+2019_T1,C_course-v1:xuetangX+MOOC102+2019_T1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10000144,,,,,,,,,,,...,,,,,,,,,,
U_10000168,,,,,,,,,,,...,,,,,,,,,,
U_10000185,,,,,,,,,,,...,,,,,,,,,,
U_10000205,,,,,,,,,,,...,,,,,,,,,,
U_10000338,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Replace the NaN cells (user/course pairs absent from df_train) with 0 so
# that every user row is a complete numeric vector.
matrix = matrix.fillna(0)
matrix.head()

course_id,C_course-v1:ACCA+FA1_X+2019_T1,C_course-v1:ACCA+FA1_X+sp,C_course-v1:ACCA+FA1_X_en+2019_T1,C_course-v1:ACCA+FA2_X+2019_T1,C_course-v1:ACCA+FA2_X+sp,C_course-v1:ACCA+FA2_X_en+2019_T1,C_course-v1:ACCA+MA1_X+2019_T1,C_course-v1:ACCA+MA1_X+sp,C_course-v1:ACCA+MA1_X_en+2019_T1,C_course-v1:ACCA+MA2_X+2019_T1,...,C_course-v1:nxu+2018122711+2019_T1,C_course-v1:nxu+2018122712+2019_T1,C_course-v1:nxu+2018122713+2019_T1,C_course-v1:qdu+2018122608X+2018_T2,C_course-v1:qhnu+20181212x+2018_T2,C_course-v1:qhnu+20181212x+2019_T1,C_course-v1:rcoe+mooc103+2019_T1,C_course-v1:shsmu+shsmu001+2019_T1,C_course-v1:snnu+20180920X+2019_T1,C_course-v1:xuetangX+MOOC102+2019_T1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10000144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Pearson correlation between every pair of users: matrix.T puts users in the
# columns and DataFrame.corr correlates columns pairwise, so the result is
# labelled by user_id on both axes.
# NOTE(review): this is a dense users x users table; with the number of users
# loaded above it is presumably very large in memory — confirm before re-running.
user_correlation = matrix.T.corr(method='pearson', min_periods=1)

In [9]:
# The ten users most correlated with the example user.
# The user is removed by label instead of by slicing off position 0: another
# user with an identical training vector also has correlation 1.0, and the sort
# gives no guarantee about which of the tied entries comes first.
target_user = "U_10000205"
similar_users = (
    user_correlation[target_user]
    .drop(labels=target_user, errors="ignore")
    .sort_values(ascending=False)
    .head(10)
)
# Single-column frame; the column keeps the Series name, i.e. the target user id.
df = similar_users.to_frame()

In [10]:
# Print, for each neighbour of U_10000205, the correlation value followed by
# the neighbour's user id.
# (The former leading `df.head()` was not the last expression of the cell, so
# it was never rendered; it has been dropped.)
for neighbour_id, correlation in df["U_10000205"].items():
    print(correlation)
    print(neighbour_id)


0.5441323864469196
U_1966725
0.5441323864469194
U_2013352
0.4956268221574349
U_8475699
0.49562682215743475
U_8863612
0.4956268221574345
U_471472
0.4683200786081943
U_9060969
0.4683200786081943
U_6713644
0.4683200786081943
U_7661694
0.4683200786081943
U_5824376
0.4683200786081943
U_9958819


In [545]:
# Collapse each long-format interactions table into one row per user, with
# 'course_id' holding the list of that user's courses in the split.
# Only the last expression of a cell is rendered, so just the test preview is
# kept as the cell output.
groupedTrain, groupedTest = (
    interactions.groupby('user_id')['course_id'].apply(list).reset_index()
    for interactions in (df_train, df_test)
)
groupedTest.head()

Unnamed: 0,user_id,course_id
0,U_10000144,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-..."
1,U_10000168,"[C_course-v1:TsinghuaX+10430494X_2015_2+sp, C_..."
2,U_10000185,"[C_course-v1:TsinghuaX+30700313X+sp, C_course-..."
3,U_10000205,"[C_course-v1:TsinghuaX+30240184+sp, C_course-v..."
4,U_10000338,"[C_course-v1:TsinghuaX+30640014X+sp, C_course-..."


In [546]:
def _first_value(frame, key_column, key, value_column):
    """Return ``value_column`` of the first row of ``frame`` where ``key_column == key``.

    Raises IndexError when no row matches.
    """
    selected = frame.loc[frame[key_column] == key, value_column]
    return selected.values[0]


def getUserCourses(user, df_user):
    """Return the 'course_order' entry of the row of ``df_user`` whose 'id' equals ``user``.

    Raises IndexError if ``user`` is not present in ``df_user``.
    """
    return _first_value(df_user, 'id', user, 'course_order')


def getUserCoursesTest(user, df_user):
    """Return the 'course_id' entry of the row of ``df_user`` whose 'user_id' equals ``user``.

    Intended for the grouped test table. Raises IndexError if ``user`` is absent.
    """
    return _first_value(df_user, 'user_id', user, 'course_id')


def getUserCoursesTrain(user, df_user):
    """Return the 'course_id' entry of the row of ``df_user`` whose 'user_id' equals ``user``.

    Same lookup as getUserCoursesTest; intended for the grouped training table.
    Raises IndexError if ``user`` is absent.
    """
    return _first_value(df_user, 'user_id', user, 'course_id')

In [547]:
# Sanity check for one user: first the full course_order list from user_df,
# then the held-out courses stored for that user in groupedTest.
print(getUserCourses("U_10000205",user_df))
print(getUserCoursesTest("U_10000205",groupedTest))

['C_course-v1:TsinghuaX+70150023X+sp', 'C_course-v1:NCTU+wghx+2017_T1', 'C_course-v1:TsinghuaX+70660542X+2015_T2', 'C_course-v1:GZHU+20180718001+2019_T1', 'C_course-v1:TsinghuaX+30240243X+sp', 'C_course-v1:TsinghuaX+30150153X+sp', 'C_course-v1:TsinghuaX+30240184+sp', 'C_course-v1:TsinghuaX+60240013X+sp', 'C_course-v1:UST+UST001+sp']
['C_course-v1:TsinghuaX+30240184+sp', 'C_course-v1:TsinghuaX+60240013X+sp', 'C_course-v1:UST+UST001+sp']


In [680]:

def recommend(user, correlation_users, num_recomendations=10, num_neighbors=9):
    """Recommend courses to ``user`` based on what their most similar users took.

    Each of the ``num_neighbors`` users most correlated with ``user`` votes for
    the courses in their history with a weight equal to their correlation.
    Courses that ``user`` already has in the training split are excluded.

    Parameters
    ----------
    user : hashable
        Id of the target user; must be a column label of ``correlation_users``
        and present in the module-level ``groupedTrain``.
    correlation_users : pandas.DataFrame
        Square user-by-user similarity table, indexed by user id on both axes.
    num_recomendations : int, default 10
        Maximum number of course ids to return.
    num_neighbors : int, default 9
        How many most-similar users contribute votes.

    Returns
    -------
    list
        Course ids ordered by decreasing score, at most ``num_recomendations``
        long. Empty when the user has no usable neighbour.

    Raises
    ------
    KeyError
        If ``user`` is not a column of ``correlation_users``.
    IndexError
        If ``user`` (or a neighbour) is missing from the lookup tables.
    """
    already_taken = set(getUserCoursesTrain(user, groupedTrain))

    # Remove the target user by label rather than by slicing off position 0:
    # a different user with an identical training vector also correlates at
    # 1.0 and may sort ahead of the target. NaN correlations are discarded so
    # they cannot poison the accumulated scores.
    neighbours = (
        correlation_users[user]
        .drop(labels=user, errors="ignore")
        .dropna()
        .sort_values(ascending=False)
        .head(num_neighbors)
    )
    if neighbours.empty:
        return []

    scores = {}
    for neighbour_id, relevance in neighbours.items():
        # NOTE(review): neighbours contribute their full course_order from
        # user_df, which includes the part held out as their own test split —
        # confirm this is intended.
        neighbour_courses = getUserCourses(neighbour_id, user_df)
        # dict.fromkeys de-duplicates while keeping a deterministic order, so
        # each neighbour votes at most once per course.
        for course in dict.fromkeys(neighbour_courses):
            if course in already_taken:
                continue  # never recommend something the user already took
            scores[course] = scores.get(course, 0) + relevance

    # Average the accumulated relevance over the neighbours that voted.
    neighbour_count = len(neighbours)
    averaged = {course: total / neighbour_count for course, total in scores.items()}

    ranked = sorted(averaged, key=averaged.get, reverse=True)
    return ranked[:num_recomendations]

In [681]:
def evaluate(df_user):
    """Score the recommender for every user listed in ``df_user``.

    For each value in the 'id' column, the user's held-out courses are looked
    up in the module-level ``groupedTest`` and compared with the output of
    ``recommend(user, user_correlation)``. The row's index label is printed
    after each user as a progress marker.

    Returns a list with one dict per user:
      - "recomendations": number of courses recommended,
      - "matches": how many recommended courses are in the held-out set,
      - "posible": number of held-out courses.
    """
    results = []
    for index, user in df_user["id"].items():
        held_out = getUserCoursesTest(user, groupedTest)
        suggested = recommend(user, user_correlation)
        hits = set(held_out) & set(suggested)
        results.append({
            "recomendations": len(suggested),
            "matches": len(hits),
            "posible": len(held_out),
        })
        print(index)
    return results

In [682]:
# Run the evaluation over every user in user_df; evaluate prints each row's
# index label as it processes it, which is the output listed below.
results = evaluate(user_df)

0
3
4
7
13
14
19
31
35
38
48
49
52
55
57
64
66
75
86
90
102
119
126
131
140
141
148
157
185
192
203
207
211
226
233
234
245
258
260
263
272
274
282
284
286
289
297
299
308
315
317
327
328
331
333
357
360
364
367
373
375
385
391
404
405
415
422
425
427
430
433
437
450
458
462
497
499
531
533
534
536
537
540
542
545
555
556
560
568
569
609
610
612
618
622
623
628
642
647
649
652
657
658
659
664
667
668
672
678
685
686
693
697
700
710
714
720
734
735
739
743
745
758
761
767
776
778
790
791
792
797
801
807
809
810
816
817
819
824
830
832
837
838
839
852
854
859
864
870
877
878
880
886
897
899
904
906
908
909
911
912
918
928
931
945
946
951
957
965
968
978
981
993
995
996
999
1003
1023
1028
1037
1042
1045
1048
1049
1051
1057
1060
1063
1065
1073
1081
1089
1090
1119
1120
1122
1134
1152
1156
1157
1158
1159
1162
1168
1182
1186
1193
1194
1196
1202
1203
1205
1209
1212
1220
1224
1230
1232
1235
1238
1242
1248
1250
1252
1262
1265
1269
1287
1290
1294
1295
1302
1320
1321
1326
1327
1332
1336
1344
1346


In [None]:
# Tabulate the per-user evaluation records and write them to cf.csv
# (index=False leaves the row index out of the file).
df_evaluations = pd.DataFrame(results)
df_evaluations.to_csv('cf.csv', index=False)

In [None]:
# Preview the first rows of the evaluation table.
df_evaluations.head()