In [1]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint name shared by the tokenizer and the encoder so the two stay in sync.
model_name = 'bert-base-uncased'

# Both objects are created from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = BertModel.from_pretrained(pretrained_model_name_or_path=model_name)

def get_course_embedding(course_description):
    """Embed one or more course descriptions with the module-level BERT model.

    The text is tokenized (truncated to the model's maximum length and padded
    to the longest item when several texts are given), run through ``model``
    without gradient tracking, and the final-layer token vectors are averaged
    into a single vector per text.

    The average is weighted by the tokenizer's attention mask so that padding
    positions do not contribute. For a single text there is no padding, so the
    result equals a plain mean over tokens (up to floating-point rounding).

    Args:
        course_description: A string, or a list of strings, to embed.

    Returns:
        A ``torch.Tensor`` with one row per input text.
    """
    inputs = tokenizer(course_description, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts



In [3]:
# Course entities are stored as JSON Lines: one JSON object per line.
course_df = pd.read_json(
    "../data/MOOCCube/entities/course.json",
    lines=True,
)
# course_df = course_df.sample(100)
def remove_html_and_carriage_returns(text):
    """Return ``text`` with HTML markup stripped and line breaks/tabs deleted.

    The markup is parsed with BeautifulSoup's ``html.parser`` and reduced to
    its text content. Every newline, carriage return and tab is then deleted
    outright (not replaced by a space).
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    # Deletion table: drops "\n", "\r" and "\t" in a single pass.
    return plain_text.translate(str.maketrans("", "", "\n\r\t"))

# Clean every course description in place: strip HTML, newlines, carriage returns and tabs.
course_df["about"] = course_df["about"].map(remove_html_and_carriage_returns)

In [4]:
# Map each course id to the embedding of its cleaned "about" text.
# Built with a comprehension over the two columns so that no loop variable
# named `id` is left behind shadowing the builtin `id()` in the kernel.
course_embeddings = {
    course_id: get_course_embedding(about_text)
    for course_id, about_text in zip(course_df["id"], course_df["about"])
}

In [5]:
def get_course_similarity(course_embedding1, course_embedding2):
    """Return the cosine similarity between two course embeddings.

    Both arguments are forwarded unchanged to scikit-learn's
    ``cosine_similarity`` and its result is returned as-is.
    """
    similarity = cosine_similarity(X=course_embedding1, Y=course_embedding2)
    return similarity

In [6]:
# Nested mapping: course id -> {other course id -> similarity}.
# Every ordered pair of courses (including a course with itself) is scored
# with get_course_similarity, whose return value is stored unchanged.
course_similarity_matrix = {
    source_id: {
        target_id: get_course_similarity(source_embedding, target_embedding)
        for target_id, target_embedding in course_embeddings.items()
    }
    for source_id, source_embedding in course_embeddings.items()
}


In [12]:
def recommend_courses(user_took_courses, num_recommendations=10):
    """Recommend the courses most similar to those a user has already taken.

    The embeddings of the taken courses are averaged into a single profile
    vector. Every other course in ``course_embeddings`` is scored by its
    cosine similarity to that profile, and the best-scoring ids are returned.

    Args:
        user_took_courses: Collection (set, list, ...) of course ids the user
            has taken. Each id must be a key of ``course_embeddings``.
        num_recommendations: Maximum number of course ids to return.

    Returns:
        A list of course ids ordered from most to least similar, excluding
        the courses in ``user_took_courses``. Empty when no taken courses
        are given.

    Raises:
        KeyError: If a taken course id has no entry in ``course_embeddings``.
    """
    taken_ids = list(user_took_courses)
    if not taken_ids:
        # Nothing to build a profile from (torch.stack would fail on an empty list).
        return []

    user_took_embeddings = [course_embeddings[course_id] for course_id in taken_ids]
    user_mean_embedding = torch.stack(user_took_embeddings).mean(dim=0)

    # Set lookup keeps the exclusion test O(1) even when a list is passed in.
    taken_lookup = set(taken_ids)

    course_scores = {}
    for course_id, course_embedding in course_embeddings.items():
        if course_id in taken_lookup:
            continue
        similarity = get_course_similarity(user_mean_embedding, course_embedding)
        # The similarity comes back as a single-element array; store a plain
        # float so sorting does not depend on the truthiness of array comparisons.
        course_scores[course_id] = similarity.item()

    ranked_ids = sorted(course_scores, key=course_scores.get, reverse=True)
    return ranked_ids[:num_recommendations]

# Example learner: the two courses already taken.
user_took_courses = {
    'C_course-v1:TsinghuaX+02070251X+2019_T1',
    'C_course-v1:TsinghuaX+60240013X+sp',
}
recommended_courses = recommend_courses(user_took_courses=user_took_courses)


# print("Recomendaciones:", recommended_courses)


In [13]:
# Print the course ids currently stored in recommended_courses.
print(recommended_courses)

['C_course-v1:TsinghuaX+02070251X+sp', 'C_course-v1:TsinghuaX+20430054X1+sp', 'C_course-v1:TsinghuaX+20430054X_1+sp', 'C_course-v1:KMUSTX+8209001+sp', 'C_course-v1:TsinghuaX+AP201601X+2019_T1', 'C_course-v1:edX+BlendedX+sp', 'C_course-v1:RiceX+AdvENVSCI_1x+sp', 'C_course-v1:McGillX+ATOC185x+2015_T1', 'C_course-v1:KMUSTX+8209001+2019_T1', 'C_course-v1:TsinghuaX+34000312X+2019_T1']


Evaluation

In [8]:
# Offline evaluation: for every user with enough history, feed the leading
# part of their course sequence to the recommender and count how many of the
# held-out courses appear among the recommendations.
user_df = pd.read_json("../data/MOOCCube/entities/user.json",lines=True)
# Keep users with at least 5 courses so both sides of the split are non-empty.
user_df = user_df[user_df['course_order'].map(len) >= 5]
per = 0.75  # leading fraction of each history used as recommender input
i = 0       # number of users processed
evaluations=[]
for user_id, course_order in zip(user_df["id"], user_df["course_order"]):
    elements = int(len(course_order) * per)
    train_list = course_order[:elements]
    evaluation_list = course_order[elements:]
    # recommend_courses accepts only the taken courses and the list size;
    # passing the similarity matrix as a second positional argument collides
    # with num_recommendations and raises TypeError.
    recommended_courses = recommend_courses(set(train_list), num_recommendations=15)
    hits = set(recommended_courses).intersection(evaluation_list)
    evaluations.append({
        "user_id": user_id,
        "matches": len(hits),
        "recomendations": len(recommended_courses),
        "train": len(train_list),
        "eval": len(evaluation_list),
    })
    i += 1


        

In [9]:
# Collect the per-user evaluation records into a DataFrame (one row per record).
df_evaluations = pd.DataFrame(evaluations)
# Show 5 randomly chosen rows as a spot check.
df_evaluations.sample(5)

Unnamed: 0,user_id,matches,recomendations,train,eval
30885,U_8176803,1,15,4,2
21334,U_10708352,1,15,5,2
9962,U_7950080,0,15,3,2
334,U_6687763,0,15,3,2
34848,U_245225,0,15,3,2


In [10]:
# Draw another random sample of 5 rows; no random_state is set, so the rows differ between runs.
df_evaluations.sample(5)

Unnamed: 0,user_id,matches,recomendations,train,eval
4624,U_7947229,0,15,21,7
5190,U_887326,0,15,7,3
27787,U_10792368,0,15,4,2
28311,U_11597961,0,15,3,2
12162,U_1046984,1,15,6,2


In [11]:
# (number of rows, number of columns) of the evaluation table.
df_evaluations.shape

(34917, 5)

In [12]:
# Save the evaluation table to CSV without writing the index column.
df_evaluations.to_csv('archivo.csv', index=False)

In [13]:
# Reload the evaluation table from the CSV file.
df_evaluations = pd.read_csv("archivo.csv")

In [14]:
# Show the first 5 rows of the reloaded table.
df_evaluations.head(5)

Unnamed: 0,user_id,matches,recomendations,train,eval
0,U_7001215,0,15,3,2
1,U_7423998,0,15,5,2
2,U_545306,0,15,7,3
3,U_7594103,1,15,4,2
4,U_3234246,0,15,3,2


In [15]:
# Accumulate, over all evaluated users, the ratio of matched courses to the
# size of that user's held-out list ("matches" / "eval"), computed column-wise
# instead of looping over rows. Summation order differs from a sequential
# loop, so the last printed digits may vary slightly.
count = len(df_evaluations)
pre = float((df_evaluations["matches"] / df_evaluations["eval"]).sum())
print(pre)
print(count)

2843.2611924458715
34917


In [16]:
# Mean over all evaluated rows of the accumulated matches/eval ratio.
# NOTE(review): the denominator is the held-out list size ("eval"), not the
# number of recommendations, so despite its name this is closer to a recall
# figure — confirm which metric is intended.
precision = pre/count

In [17]:
# Display the averaged score.
precision

0.08142913745298484