In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained BERT checkpoint and its matching tokenizer once; both are
# used as module-level globals by get_course_embedding.
# NOTE(review): the course descriptions displayed further down in this notebook
# are Chinese, while this is an English uncased checkpoint — confirm that its
# vocabulary covers the text well enough for the embeddings to be meaningful.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_course_embedding(course_description):
    """Embed course description text as the mean of BERT's final-layer token vectors.

    Args:
        course_description: A string, or a list of strings, passed to the
            module-level ``tokenizer``. Input longer than the tokenizer's
            maximum length is cut off (``truncation=True``).

    Returns:
        torch.Tensor with one row per input description and one column per
        hidden unit of the module-level ``model``.
    """
    inputs = tokenizer(course_description, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    token_vectors = outputs.last_hidden_state
    # Average only over real tokens. With padding=True a batch of descriptions
    # is padded to a common length, and a plain mean over dim=1 would let the
    # padding positions dilute the shorter descriptions. For a single
    # description the mask is all ones, so this reduces to the ordinary mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (token_vectors * mask).sum(dim=1) / token_counts


In [3]:
# Read the course entities; the file is JSON Lines (one JSON object per line).
course_df = pd.read_json("../data/MOOCCube/entities/course.json",lines=True)
# Uncomment the next line to work on a random 100-course sample while iterating:
# course_df = course_df.sample(100)
def remove_html_and_carriage_returns(text):
    """Return ``text`` with HTML markup removed and newlines, carriage returns and tabs deleted.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and only
    the text content is kept; the three whitespace characters are then removed
    outright (not replaced by a space).
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    for unwanted in ("\n", "\r", "\t"):
        plain_text = plain_text.replace(unwanted, "")
    return plain_text

# Replace each course description with its plain-text version (markup and
# newline/carriage-return/tab characters removed), then preview the frame.
course_df["about"] = course_df["about"].apply(remove_html_and_carriage_returns)
course_df.head()

Unnamed: 0,id,name,prerequisites,about,core_id,video_order,display_name,chapter
0,C_course-v1:McGillX+ATOC185x+2015_T1,自然灾害（自主模式）,无,地球上没有一处地方不发生自然灾害。当我们以科学的眼光看待这些自然灾害的原因和本质时，我们可以...,C_course-v1:McGillX+ATOC185x+2015_T1,"[V_f6f710068b994452885b90e11b6ee5c5, V_7339568...","[Video: Overview 1, Video: Overview 2, Video: ...","[01.02.01.02, 01.02.03.02, 01.02.05.02, 01.02...."
1,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,2015年清华大学研究生学位论文答辩（二）,无先修要求,学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平台，进...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_d632034...","[答辩陈述, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及...","[01.01.03.01, 01.02.03.01, 01.02.04.01, 01.03...."
2,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,2014年清华大学研究生学位论文答辩（一）,无先修要求,学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平台，进...,C_course-v1:TsinghuaX+THESIS2014_1X_tv+_2014_,"[V_d530be9cc0584317a16706684577a6dd, V_f329a62...","[论文答辩实况, 问答及答辩结果, 导师评价, 同学眼中的王鑫, 个人学术感言, 吴宇恩答辩...","[01.01.03.01, 01.01.04.01, 01.01.05.01, 01.01...."
3,C_course-v1:TsinghuaX+THESIS2015X_tv+2015_T1,2015年清华大学研究生学位论文答辩（一）,无先修要求,学位论文答辩环节是研究生培养的重要环节，为了充分发挥该环节的育人作用，搭建学术交流的平台，进...,C_course-v1:TsinghuaX+THESIS2015X+2015_T1,"[V_de0371575a9f4b5391c89ad16d68b5c2, V_78a8b41...","[答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, 答辩陈述, 问答及答辩结果, ...","[01.01.03.01, 01.01.04.01, 01.02.03.01, 01.02...."
4,C_course-v1:TsinghuaX+00690242+sp,文物精品与文化中国（自主模式）,无,中国考古学是以往100年中发展最为迅速的领域之一，大批珍贵文物的出土，不断刷新人们对文化中国...,C_course-v1:TsinghuaX+00690242+sp,"[V_d7dbd0fe8f504e7a91d863cd5a19b185, V_4492eca...","[文献所见原始渡河工具, 《禹贡》所见的水路交通, 绰墩山出土的渡河浮木, 舟船考古发掘, ...","[01.01.01.01, 01.01.02.01, 01.02.01.01, 01.03...."


In [4]:
# Embed every course description once, keyed by course id.
# Built as a comprehension over the two columns so that no notebook-level
# variable named `id` is left behind: the previous loop rebound the builtin
# `id` for the rest of the kernel session.
course_embeddings = {
    course_id: get_course_embedding(about)
    for course_id, about in zip(course_df["id"], course_df["about"])
}

In [5]:
def get_course_similarity(course_embedding1, course_embedding2):
    """Cosine similarity between two sets of embedding rows.

    Both arguments are forwarded unchanged to scikit-learn's
    ``cosine_similarity``, and its result is returned as-is: a 2-D array with
    one row per row of ``course_embedding1`` and one column per row of
    ``course_embedding2``.
    """
    pairwise = cosine_similarity(course_embedding1, course_embedding2)
    return pairwise

In [6]:
# Course-to-course cosine similarity, stored as
# course_similarity_matrix[course_a][course_b] -> 1x1 array.
#
# All embeddings are stacked into one (n_courses, dim) matrix so the whole
# table comes from a single cosine_similarity call rather than one call per
# pair of courses. Each stored value is a 1x1 slice of that result, the same
# shape a per-pair call on two single-row embeddings would return.
# Assumes every embedding is a single-row 2-D tensor, which is what
# get_course_embedding yields for one description string.
# NOTE(review): no later cell shown in this notebook reads this table —
# confirm it is still needed.
course_similarity_matrix = {}
if course_embeddings:
    _course_ids = list(course_embeddings)
    _stacked = torch.cat(
        [course_embeddings[course_id] for course_id in _course_ids], dim=0
    ).detach().cpu().numpy()
    _pairwise = cosine_similarity(_stacked)
    course_similarity_matrix = {
        id_a: {
            id_b: _pairwise[i:i + 1, j:j + 1]
            for j, id_b in enumerate(_course_ids)
        }
        for i, id_a in enumerate(_course_ids)
    }

In [7]:
# Hold-out split settings.
MIN_COURSES_PER_USER = 5   # users with fewer courses than this are dropped
TRAIN_FRACTION = 0.75      # leading share of each user's course_order used for training

user_df = pd.read_json("../data/MOOCCube/entities/user.json",lines=True)
user_df = user_df[user_df['course_order'].apply(lambda x: len(x) >= MIN_COURSES_PER_USER)]
user = []  # not referenced by any cell shown here; kept in case other code relies on it
data_train = []
data_test = []
# Split each user's course list by position: the first TRAIN_FRACTION of the
# list goes to the training interactions, the remainder is held out for testing.
# Every interaction is recorded with view=1.
for user_id, courses in zip(user_df["id"], user_df["course_order"]):
    index_partition = int(len(courses) * TRAIN_FRACTION)
    data_train.extend(
        {"user_id": user_id, "course_id": course, "view": 1}
        for course in courses[:index_partition]
    )
    data_test.extend(
        {"user_id": user_id, "course_id": course, "view": 1}
        for course in courses[index_partition:]
    )

In [8]:
# Name the columns explicitly so both frames always expose user_id / course_id /
# view. A DataFrame built from an empty list has no columns at all, which would
# make a later groupby('user_id') or pivot fail with a KeyError.
_INTERACTION_COLUMNS = ["user_id", "course_id", "view"]
df_train = pd.DataFrame(data_train, columns=_INTERACTION_COLUMNS)
df_test = pd.DataFrame(data_test, columns=_INTERACTION_COLUMNS)

In [9]:
# User x course interaction table built from the training interactions:
# one row per user_id, one column per course_id, holding the aggregated `view`
# value where the pair occurs and 0 where it does not.
matrix = (
    df_train
    .pivot_table(index='user_id', columns='course_id', values='view')
    .fillna(0)
)
matrix.head()

course_id,C_course-v1:ACCA+FA1_X+2019_T1,C_course-v1:ACCA+FA1_X+sp,C_course-v1:ACCA+FA1_X_en+2019_T1,C_course-v1:ACCA+FA2_X+2019_T1,C_course-v1:ACCA+FA2_X+sp,C_course-v1:ACCA+FA2_X_en+2019_T1,C_course-v1:ACCA+MA1_X+2019_T1,C_course-v1:ACCA+MA1_X+sp,C_course-v1:ACCA+MA1_X_en+2019_T1,C_course-v1:ACCA+MA2_X+2019_T1,...,C_course-v1:nxu+2018122711+2019_T1,C_course-v1:nxu+2018122712+2019_T1,C_course-v1:nxu+2018122713+2019_T1,C_course-v1:qdu+2018122608X+2018_T2,C_course-v1:qhnu+20181212x+2018_T2,C_course-v1:qhnu+20181212x+2019_T1,C_course-v1:rcoe+mooc103+2019_T1,C_course-v1:shsmu+shsmu001+2019_T1,C_course-v1:snnu+20180920X+2019_T1,C_course-v1:xuetangX+MOOC102+2019_T1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U_10000144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U_10000338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Pairwise cosine similarity between the rows of `matrix` (one row per user_id).
user_similarity = cosine_similarity(matrix)

In [11]:
# Label both axes with the user ids from `matrix` so a user's similarities to
# everyone else can be read with user_similarity_df.loc[user_id].
user_similarity_df = pd.DataFrame(user_similarity, index=matrix.index, columns=matrix.index)

In [12]:
# Collapse the interaction rows to one row per user: columns are `user_id` and
# `course_id`, where `course_id` holds the list of that user's courses in the split.
groupedTrain = df_train.groupby('user_id')['course_id'].apply(list).reset_index()
groupedTest = df_test.groupby('user_id')['course_id'].apply(list).reset_index()
def getUserCourses(user, df_user):
    """Return the ``course_order`` value of the first row of ``df_user`` whose ``id`` equals ``user``.

    Raises:
        IndexError: if no row of ``df_user`` has that id.
    """
    is_requested_user = df_user['id'] == user
    matching_orders = df_user.loc[is_requested_user, 'course_order']
    return matching_orders.values[0]
def getUserCoursesTest(user):
    """Return the list of held-out (test) course ids stored for ``user`` in ``groupedTest``.

    Raises:
        IndexError: if ``user`` has no row in ``groupedTest``.
    """
    is_requested_user = groupedTest['user_id'] == user
    held_out_courses = groupedTest.loc[is_requested_user, 'course_id']
    return held_out_courses.values[0]
def getUserCoursesTrain(user):
    """Return the list of training course ids stored for ``user`` in ``groupedTrain``.

    Raises:
        IndexError: if ``user`` has no row in ``groupedTrain``.
    """
    is_requested_user = groupedTrain['user_id'] == user
    training_courses = groupedTrain.loc[is_requested_user, 'course_id']
    return training_courses.values[0]

In [13]:
# Number of recommend_courses calls so far; printed after each call as a crude
# progress indicator.
cont = 0


def _content_based_scores(taken_courses, num_recommendations):
    """Rank courses the user has not taken by description similarity to those they have.

    Returns a list of up to ``num_recommendations`` ``(course_id, similarity)``
    pairs, best first. Empty when none of the taken courses has an embedding or
    when there is no candidate left to rank.
    """
    taken = set(taken_courses)
    # Only courses present in course_embeddings can contribute to the profile;
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    profile_parts = [
        course_embeddings[course_id]
        for course_id in dict.fromkeys(taken_courses)
        if course_id in course_embeddings
    ]
    if not profile_parts:
        return []
    user_mean_embedding = torch.stack(profile_parts).mean(dim=0)

    candidate_ids = [course_id for course_id in course_embeddings if course_id not in taken]
    if not candidate_ids:
        return []
    # Score every candidate in one call. Assumes each stored embedding is a
    # single-row 2-D tensor, so concatenating along dim 0 gives one row per course.
    candidate_matrix = torch.cat(
        [course_embeddings[course_id] for course_id in candidate_ids], dim=0
    )
    similarities = get_course_similarity(user_mean_embedding, candidate_matrix)[0]
    ranked = sorted(
        zip(candidate_ids, (float(value) for value in similarities)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:num_recommendations]


def _collaborative_scores(user, taken_courses, num_recommendations, num_neighbors):
    """Rank courses taken by the most similar users, weighted by user similarity.

    Returns a list of up to ``num_recommendations`` ``(course_id, score)``
    pairs, best first, where the score is the summed similarity of the
    neighbours who took the course divided by the number of neighbours.
    """
    taken = set(taken_courses)
    # Remove the user explicitly instead of assuming they sort first: another
    # user with an identical row in the interaction matrix ties at the top.
    neighbours = (
        user_similarity_df.loc[user]
        .drop(labels=user, errors="ignore")
        .sort_values(ascending=False)
        .head(num_neighbors)
    )
    totals = {}
    for neighbour_id, relevance in neighbours.items():
        # NOTE(review): this reads the neighbour's complete course_order from
        # user_df, which includes the courses held out for that neighbour's own
        # test split — confirm this is acceptable for the evaluation protocol.
        neighbour_courses = getUserCourses(neighbour_id, user_df)
        for course_id in dict.fromkeys(neighbour_courses):
            if course_id in taken:
                # Never recommend something the user already has.
                continue
            totals[course_id] = totals.get(course_id, 0) + relevance
    averaged = {
        course_id: total / len(neighbours) for course_id, total in totals.items()
    }
    ranked = sorted(averaged.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:num_recommendations]


def recommend_courses(user, num_recommendations=15, alfa=0.1, num_neighbors=9):
    """Recommend courses by blending content-based and collaborative scores.

    Args:
        user: User id; must have a row in ``groupedTrain`` and in
            ``user_similarity_df``.
        num_recommendations: Size of each component's shortlist and of the
            returned list.
        alfa: Weight of the content-based score; the collaborative score is
            weighted ``1 - alfa``.
        num_neighbors: How many most-similar users feed the collaborative part.

    Returns:
        List of at most ``num_recommendations`` course ids, best first.

    Raises:
        IndexError: if ``user`` has no training courses.
        KeyError: if ``user`` is not present in ``user_similarity_df``.
    """
    # Without this declaration `cont += 1` makes `cont` a local name and every
    # call fails with UnboundLocalError.
    global cont

    user_taken_courses = getUserCoursesTrain(user)
    content_scores = _content_based_scores(user_taken_courses, num_recommendations)
    collaborative_scores = _collaborative_scores(
        user, user_taken_courses, num_recommendations, num_neighbors
    )

    # A course appearing in both shortlists receives both weighted contributions.
    final_scores = {}
    for course_id, score in content_scores:
        final_scores[course_id] = final_scores.get(course_id, 0) + score * alfa
    for course_id, score in collaborative_scores:
        final_scores[course_id] = final_scores.get(course_id, 0) + score * (1 - alfa)

    final_recomendations = sorted(final_scores.items(), key=lambda pair: pair[1], reverse=True)
    recomendations_top = [course_id for course_id, _ in final_recomendations[:num_recommendations]]

    cont += 1
    print(cont)
    return recomendations_top

# Smoke-test the recommender on a single user with the default settings.
recommended_courses = recommend_courses("U_10000205")
# print(len(recommended_courses))

In [14]:
def evaluate(df_user, alfa):
    """Compare each user's recommendations with their held-out courses.

    Args:
        df_user: DataFrame with an ``id`` column; every id must be known to
            ``getUserCoursesTest`` and ``recommend_courses``.
        alfa: Content-based weight forwarded to ``recommend_courses``.

    Returns:
        One dict per row of ``df_user`` with the keys ``"recomendations"``
        (number of courses recommended), ``"matches"`` (how many of them are in
        the user's held-out courses) and ``"posible"`` (number of held-out
        course entries).
    """
    results = []
    for user_id in df_user["id"]:
        courses_test = getUserCoursesTest(user_id)
        # Forward alfa: otherwise every evaluation silently runs with
        # recommend_courses' default weight, whatever the caller asked for.
        recommended = recommend_courses(user_id, alfa=alfa)
        matches = set(courses_test).intersection(recommended)
        results.append(
            {
                "recomendations": len(recommended),
                "matches": len(matches),
                "posible": len(courses_test),
            }
        )
    return results

In [16]:
# Run the evaluation over every user in user_df, requesting alfa=0.1, and save
# the per-user counts as CSV.
results_01 = evaluate(user_df,alfa=0.1)
df_evaluations = pd.DataFrame(results_01)
df_evaluations.to_csv('hybryd_01.csv', index=False)
# Marker printed once the results file has been written.
print("1")

1
