In [87]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_course_embedding(course_description):
    inputs = tokenizer(course_description, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)  



In [89]:
course_df = pd.read_json("../data/MOOCCube/entities/course.json",lines=True)
# course_df = course_df.sample(100)
def remove_html_and_carriage_returns(text):
    soup = BeautifulSoup(text, "html.parser")
    text_without_html = soup.get_text().replace("\n","").replace("\r","").replace("\t","")
    
    return text_without_html

course_df["about"] = course_df["about"].apply(remove_html_and_carriage_returns)

In [90]:
course_embeddings = {}
for index, row in course_df.iterrows():
    id = row["id"]
    course_embedding = get_course_embedding(row["about"])
    course_embeddings[id] = course_embedding

In [91]:
def get_course_similarity(course_embedding1, course_embedding2):
    return cosine_similarity(course_embedding1, course_embedding2)

In [92]:
course_similarity_matrix = {}
for course_id1, embedding1 in course_embeddings.items():
    course_similarity_row = {}
    for course_id2, embedding2 in course_embeddings.items():
        similarity = get_course_similarity(embedding1, embedding2)
        course_similarity_row[course_id2] = similarity
    course_similarity_matrix[course_id1] = course_similarity_row


In [93]:
def recommend_courses(user_took_courses, num_recommendations=10):
    user_took_embeddings = [course_embeddings[course_id] for course_id in user_took_courses]
    user_mean_embedding = torch.stack(user_took_embeddings).mean(dim=0)
    
    course_scores = {}
    for course_id, course_embedding in course_embeddings.items():
        if course_id not in user_took_courses:
            similarity = get_course_similarity(user_mean_embedding, course_embedding)
            course_scores[course_id] = similarity
    
    sorted_courses = sorted(course_scores.items(), key=lambda x: x[1], reverse=True)
    recommended_courses = [course_id for course_id, _ in sorted_courses[:num_recommendations]]
    return recommended_courses

user_took_courses = { 'C_course-v1:TsinghuaX+02070251X+2019_T1', 'C_course-v1:TsinghuaX+60240013X+sp'} 
recommended_courses = recommend_courses(user_took_courses)


# print("Recomendaciones:", recommended_courses)


In [94]:
print(recommended_courses)

['C_course-v1:TsinghuaX+02070251X+sp', 'C_course-v1:TsinghuaX+20430054X1+sp', 'C_course-v1:TsinghuaX+20430054X_1+sp', 'C_course-v1:KMUSTX+8209001+sp', 'C_course-v1:TsinghuaX+AP201601X+2019_T1', 'C_course-v1:edX+BlendedX+sp', 'C_course-v1:RiceX+AdvENVSCI_1x+sp', 'C_course-v1:McGillX+ATOC185x+2015_T1', 'C_course-v1:KMUSTX+8209001+2019_T1', 'C_course-v1:TsinghuaX+34000312X+2019_T1']


Evaluation

In [101]:
def get_test_courses(x):
    index_partition = int(len(x) * 0.75)
    return x[index_partition:]
def get_train_courses(x):
    index_partition = int(len(x) * 0.75)
    return x[:index_partition]

In [97]:
user_df = pd.read_json("../data/MOOCCube/entities/user.json",lines=True)
user_df = user_df[user_df['course_order'].apply(lambda x: len(x) >= 5)]
df_results = user_df
df_results = df_results.drop(["name","enroll_time"], axis=1)

df_results["test_courses"] = df_results["course_order"].apply(get_test_courses)
df_results["train_courses"] = df_results["course_order"].apply(get_train_courses)


df_results["recommendations"] = df_results["train_courses"].apply(recommend_courses)
df_results.head()

Unnamed: 0,id,course_order,test_courses,train_courses,recommendations
0,U_7001215,"[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[C_course-v1:TsinghuaX+10421094X_2015_2+sp, C_...","[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[C_course-v1:TsinghuaX+00740043_1x+2019_T1, C_..."
3,U_7423998,"[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[C_course-v1:TsinghuaX+00690092X+sp, C_course-...","[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours...","[C_course-v1:JSUX+2017011101X+sp, C_course-v1:..."
4,U_545306,"[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[C_course-v1:TsinghuaX+02070251X+sp, C_course-...","[C_course-v1:TsinghuaX+20430064_2X+sp, C_cours...","[C_course-v1:TsinghuaX+AP201601X+2019_T1, C_co..."
7,U_7594103,"[C_course-v1:TsinghuaX+30240184+sp, C_course-v...","[C_course-v1:TsinghuaX+00740043_2x_2015_T2+sp,...","[C_course-v1:TsinghuaX+30240184+sp, C_course-v...","[C_course-v1:TsinghuaX+30240184_2X+sp, C_cours..."
13,U_3234246,"[C_course-v1:TsinghuaX+00740123_X+sp, C_course...","[C_course-v1:BIT+PHY1701702+sp, C_course-v1:Ts...","[C_course-v1:TsinghuaX+00740123_X+sp, C_course...","[C_course-v1:SCUT+145055+sp, C_course-v1:SCUT+..."


In [102]:
df_results["test_courses"] = df_results["course_order"].apply(get_test_courses)
df_results["train_courses"] = df_results["course_order"].apply(get_train_courses)


In [109]:
num_users = 0
t_precision = 0
rank = 0
for index, row in df_results.iterrows():
    test_courses = set(row["test_courses"])
    recommendations = set(row["recommendations"][:5])
    # print(len(test_courses))
    precision = len(test_courses.intersection(recommendations))/len(test_courses)
    if len(test_courses) < 5:
        precision = len(test_courses.intersection(recommendations))/len(test_courses)
    else:
        precision = len(test_courses.intersection(recommendations))/5
    num_users += 1
    t_precision += precision
    
    r = 0
    for i in row["recommendations"]:
        r += 1
        if i in row["test_courses"]:
            break
    rank += 1/r

In [110]:
print((1/num_users)*rank)

0.14655114100514957


In [111]:
print(t_precision/num_users)

0.04522820784527112
