In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
user_df = pd.read_json("../data/MOOCCube/entities/user.json",lines=True)
data_train = []
data_test = []
for index, row in user_df.iterrows():
    user_id = row["id"]
    courses = row["course_order"]
    if len(courses) > 5:

        index_partition = int(len(courses) * 0.75)

        courses_1 = courses[:index_partition]
        courses_2 = courses[index_partition:]
        
        for course in courses_1:
            data_train.append({
                "user_id":user_id,
                "course_id":course,
                "view":1
            })
            
        for course in courses_2:
            data_test.append({
                "user_id":user_id,
                "course_id":course,
                "view":1
            })

In [3]:
df_train = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)

In [4]:
df_train.shape

(153049, 3)

In [5]:
matrix = df_train.pivot_table(index='user_id', columns='course_id', values='view')
matrix.head()

course_id,C_course-v1:ACCA+FA1_X+2019_T1,C_course-v1:ACCA+FA1_X+sp,C_course-v1:ACCA+FA1_X_en+2019_T1,C_course-v1:ACCA+FA2_X+2019_T1,C_course-v1:ACCA+FA2_X+sp,C_course-v1:ACCA+FA2_X_en+2019_T1,C_course-v1:ACCA+MA1_X+2019_T1,C_course-v1:ACCA+MA1_X+sp,C_course-v1:ACCA+MA1_X_en+2019_T1,C_course-v1:ACCA+MA2_X+2019_T1,...,C_course-v1:nxu+2018122711+2019_T1,C_course-v1:nxu+2018122712+2019_T1,C_course-v1:nxu+2018122713+2019_T1,C_course-v1:qdu+2018122608X+2018_T2,C_course-v1:qhnu+20181212x+2018_T2,C_course-v1:qhnu+20181212x+2019_T1,C_course-v1:rcoe+mooc103+2019_T1,C_course-v1:shsmu+shsmu001+2019_T1,C_course-v1:snnu+20180920X+2019_T1,C_course-v1:xuetangX+MOOC102+2019_T1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10000168,,,,,,,,,,,...,,,,,,,,,,
U_10000205,,,,,,,,,,,...,,,,,,,,,,
U_10000338,,,,,,,,,,,...,,,,,,,,,,
U_10000415,,,,,,,,,,,...,,,,,,,,,,
U_10000491,,,,,,,,,,,...,,,,,,,,,,


In [6]:
matrix = matrix.fillna(0)
matrix.head()

course_id,C_course-v1:ACCA+FA1_X+2019_T1,C_course-v1:ACCA+FA1_X+sp,C_course-v1:ACCA+FA1_X_en+2019_T1,C_course-v1:ACCA+FA2_X+2019_T1,C_course-v1:ACCA+FA2_X+sp,C_course-v1:ACCA+FA2_X_en+2019_T1,C_course-v1:ACCA+MA1_X+2019_T1,C_course-v1:ACCA+MA1_X+sp,C_course-v1:ACCA+MA1_X_en+2019_T1,C_course-v1:ACCA+MA2_X+2019_T1,...,C_course-v1:nxu+2018122711+2019_T1,C_course-v1:nxu+2018122712+2019_T1,C_course-v1:nxu+2018122713+2019_T1,C_course-v1:qdu+2018122608X+2018_T2,C_course-v1:qhnu+20181212x+2018_T2,C_course-v1:qhnu+20181212x+2019_T1,C_course-v1:rcoe+mooc103+2019_T1,C_course-v1:shsmu+shsmu001+2019_T1,C_course-v1:snnu+20180920X+2019_T1,C_course-v1:xuetangX+MOOC102+2019_T1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10000168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
user_correlation = matrix.T.corr(method='pearson', min_periods=1)

In [8]:
similar_users = user_correlation["U_10000205"].sort_values(ascending=False)[1:11]
df = pd.DataFrame(similar_users)

In [9]:
df.head()
for index, row in df.iterrows():
    print(row["U_10000205"])
    print(index)


0.5441271493535857
U_1966725
0.5441271493535855
U_2013352
0.495620437956205
U_8475699
0.4956204379562047
U_8863612
0.49562043795620453
U_471472
0.4273156230155508
U_1018007
0.4040829209128268
U_36008
0.40408292091282677
U_8550243
0.40408292091282677
U_2576601
0.40408292091282677
U_22772


In [10]:
user_df.head(

)

Unnamed: 0,id,name,course_order,enroll_time
0,U_7001215,李喜锋,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[2017-05-01 11:07:53, 2017-05-17 10:07:17, 201..."
1,U_10402446,五元香,"[C_course-v1:TsinghuaX+00510888X+2019_T1, C_co...","[2019-06-14 08:50:04, 2019-01-04 20:36:07]"
2,U_10359065,魏珊,"[C_course-v1:TsinghuaX+00612642X+sp, C_course-...","[2019-01-18 21:19:56, 2019-01-14 21:54:54]"
3,U_7423998,郭海滨,"[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[2017-08-16 10:38:11, 2018-07-01 18:24:24, 201..."
4,U_545306,李其艳,"[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[2018-09-05 15:40:40, 2019-02-28 10:08:49, 201..."


In [62]:
def make_recomendation(user,dfs,matrix,num_rec=10):
    similar_users = matrix[user].sort_values(ascending=False)[1:20]
    df = pd.DataFrame(similar_users)
    # print(df.head())
    similar_users_ids = []
    for index, row in df.iterrows():
        similar_users_ids.append({"user_id":index,"relevance":row[user]})
    courses_taken=dfs.loc[dfs['id'] == user, 'course_order'].iloc[0]
    recomendation = {}
    for user_id in similar_users_ids:
        result = dfs.loc[dfs['id'] == user_id["user_id"], 'course_order'].iloc[0]
        set1 = set(courses_taken)
        set2 = set(result)
        courses_not_taken = set1-set2
        for j in courses_not_taken:
            cant = recomendation.get(j,0)
            recomendation[j]=cant+user_id["relevance"]
    final = []
    for i in recomendation:
        final.append({
            "course_id":i,
            "quantity":recomendation[i]/20
        })
    sorted_data = sorted(final, key=lambda x: x['quantity'])[::-1]
    return sorted_data[:num_rec]
        
make_recomendation("U_10000205",user_df,user_correlation)

[{'course_id': 'C_course-v1:TsinghuaX+60240013X+sp',
  'quantity': 0.41277546037290413},
 {'course_id': 'C_course-v1:GZHU+20180718001+2019_T1',
  'quantity': 0.41277546037290413},
 {'course_id': 'C_course-v1:UST+UST001+sp', 'quantity': 0.41277546037290413},
 {'course_id': 'C_course-v1:NCTU+wghx+2017_T1',
  'quantity': 0.38556910290522495},
 {'course_id': 'C_course-v1:TsinghuaX+30240184+sp',
  'quantity': 0.351001387130844},
 {'course_id': 'C_course-v1:TsinghuaX+70660542X+2015_T2',
  'quantity': 0.31795445334626293},
 {'course_id': 'C_course-v1:TsinghuaX+30240243X+sp',
  'quantity': 0.22924781792409266},
 {'course_id': 'C_course-v1:TsinghuaX+30150153X+sp',
  'quantity': 0.08781879560460328},
 {'course_id': 'C_course-v1:TsinghuaX+70150023X+sp',
  'quantity': 0.04741050351332062}]

In [67]:
grouped = df_test.groupby('user_id')['course_id'].apply(list).reset_index()
user_course_list = grouped.to_dict(orient='records')
# print(user_course_list[12])

def evaluate():
    cont = 0
    results = []
    for row in user_course_list:
        recomendations = make_recomendation(row["user_id"],user_df,user_correlation,num_rec=len(row["course_id"]))
        recomended_courses = [d["course_id"] for d in recomendations]
        set1 = set(row["course_id"])
        set2 = set(recomended_courses)
        results.append({
            "user_id":row["user_id"],
            "num_recomendations":len(row["course_id"]),
            "evaluate":len(row["course_id"]),
            "matches":len(set1.intersection(set2))
        })
        
        cont = 1 + cont
        print(cont)
    return results  

    
evaluate()

{'user_id': 'U_10000168', 'num_recomendations': 2, 'evaluate': 2, 'matches': 2}
1
{'user_id': 'U_10000205', 'num_recomendations': 3, 'evaluate': 3, 'matches': 2}
2
{'user_id': 'U_10000338', 'num_recomendations': 2, 'evaluate': 2, 'matches': 1}
3
{'user_id': 'U_10000415', 'num_recomendations': 2, 'evaluate': 2, 'matches': 2}
4
{'user_id': 'U_10000491', 'num_recomendations': 2, 'evaluate': 2, 'matches': 1}
5
{'user_id': 'U_1000063', 'num_recomendations': 2, 'evaluate': 2, 'matches': 2}
6
{'user_id': 'U_10000658', 'num_recomendations': 2, 'evaluate': 2, 'matches': 2}
7
{'user_id': 'U_10001302', 'num_recomendations': 2, 'evaluate': 2, 'matches': 1}
8
{'user_id': 'U_10001348', 'num_recomendations': 3, 'evaluate': 3, 'matches': 2}
9
{'user_id': 'U_1000149', 'num_recomendations': 3, 'evaluate': 3, 'matches': 2}
10
{'user_id': 'U_10001784', 'num_recomendations': 2, 'evaluate': 2, 'matches': 1}
11
{'user_id': 'U_10002175', 'num_recomendations': 2, 'evaluate': 2, 'matches': 1}
12
{'user_id': 'U_