In [45]:
import pandas as pd
import numpy as np
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import json
from collections import ChainMap

# Lift pandas' display truncation so wide/long frames are rendered in full.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [46]:
# Load the course catalogue. The encoding is named explicitly because JSON
# text is UTF-8, while open() would otherwise fall back to the platform's
# locale encoding and could fail or mis-decode non-ASCII characters.
with open('./allCourses.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def fa(x):
    """Map one course record to ``{<key> + "02": normalised regular semester}``.

    ``x`` must provide ``"key"`` and ``"regular_semester"``. Only the first
    element (or first character) of ``x["regular_semester"]`` is used; it is
    parsed as an integer semester number ``s`` and rescaled as ``(s - 1) / 5``.
    """
    semester_column = x["key"] + "02"
    first_regular_semester = int(x["regular_semester"][0])
    return {semester_column: (first_regular_semester - 1) / 5.0}

# One single-entry mapping per course, merged through ChainMap (on duplicate
# keys the earliest course in `data` wins), then exposed as a Series indexed
# by semester column code.
_per_course_semesters = [fa(course) for course in data]
regular_sem = pd.Series(data=dict(ChainMap(*_per_course_semesters)))

In [47]:
# Two-letter course identifiers used as prefixes of the survey columns.
course_codes = ["KP", "RR", "AI", "A2", "BS", "TI", "T2", "PV", "P2", "MM", "DS", "D2", "SE", "KI", "CB", "HC", "H2", "IT", "CG", "ES", "RI", "CS", "GD", "FP", "DL", "PR", "BP", "SR", "BA", "K1", "IU"]

# Per-course column names: <code>01_01 feeds grade(), <code>02 feeds
# semester(), <code>03 feeds like().
course_grade_codes = [code + "01_01" for code in course_codes]
course_semester_codes = [code + "02" for code in course_codes]
course_rating_codes = [code + "03" for code in course_codes]

# All three columns of each course, grouped course by course.
course_data_codes = [
    code + suffix
    for code in course_codes
    for suffix in ("01_01", "02", "03")
]

# Columns to pull from the survey export: every per-course answer column plus
# the completion flag.
temp = course_data_codes + ["FINISHED"]

data = pd.read_csv('data_tutorial316936_2022-06-15_10-39.csv', encoding = "ISO-8859-1")

# reindex keeps the requested column order and creates an all-NaN column for
# every requested name that is absent from the export. This replaces the
# column-by-column copy that relied on `np.NAN`, an alias removed in NumPy 2.0.
filtered = data.reindex(columns=temp)

# Keep only rows whose FINISHED flag is the string "1", then drop the flag.
# NOTE(review): the comparison assumes FINISHED is read as text - confirm
# against the export format.
filtered = filtered.loc[filtered["FINISHED"] == "1"].drop(["FINISHED"], axis=1)

# exists: one boolean column per course, True where any of the course's three
# answer columns is truthy for that respondent.
# exists_all: the same flag repeated under each of the three column names, so
# it can be used as a mask with the same labels as `filtered`.
# NOTE(review): this runs on the raw answers; a non-empty missing-value code
# would count as truthy here - confirm that is intended.
exists = pd.DataFrame()
exists_all = pd.DataFrame()
for code in course_codes:
    answer_columns = [code + "01_01", code + "02", code + "03"]
    exists[code] = filtered[answer_columns].any(axis=1)
    for column in answer_columns:
        exists_all[column] = exists[code]

In [48]:
#labels = pd.read_csv('values_tutorial316936_2022-06-15_11-18.csv', encoding = "ISO-8859-1")
#filtered

In [49]:
#Normalization
# Grade scale: answer code k (1-based) selects grades[k - 1].
grades = [1.0, 1.3, 1.7, 2.0, 2.3, 2.7, 3.0, 3.3, 3.7, 4.0, 5.0]

def grade(i):
    """Convert a grade answer code into a value in [0, 1] (0 = grade 1.0, 1 = grade 5.0).

    Returns NaN for missing input: NaN itself or the code -9. The code is
    compared numerically, so "-9", -9 and -9.0 are all recognised; a
    string-only comparison would let a numeric -9 index ``grades`` from the
    end and produce a bogus grade.
    """
    value = float(i)
    if math.isnan(value) or value == -9:
        return np.nan
    return (grades[int(value) - 1] - 1.0) / 4.0
    
def semester(i):
    """Rescale a semester answer code ``s`` to ``(s - 1) / 5`` (1 -> 0.0, 6 -> 1.0).

    Returns NaN for missing input: NaN itself or the code -9, compared
    numerically so that "-9", -9 and -9.0 are all recognised.
    """
    value = float(i)
    if math.isnan(value) or value == -9:
        return np.nan
    # Linear scaling; an exponential alternative that was tried earlier:
    # np.exp(int(i) * np.log(2.0) / 6.0) - 1.0
    return (int(value) - 1) / 5.0

def like(i):
    """Rescale a rating answer code ``r`` to ``(r - 1) / 2`` (1 -> 0.0, 3 -> 1.0).

    Returns NaN for missing input: NaN itself or the code -9, compared
    numerically so that "-9", -9 and -9.0 are all recognised.
    """
    value = float(i)
    if math.isnan(value) or value == -9:
        return np.nan
    return (int(value) - 1) / 2

# Replace every raw answer column with its normalised value, in place.
# Running this cell twice would feed already-normalised values through the
# converters again, so it must be executed exactly once per load.
_normalizers = (("01_01", grade), ("02", semester), ("03", like))
for course in course_codes:
    for suffix, normalize in _normalizers:
        column = str(course) + suffix
        filtered[column] = filtered[column].map(normalize)

In [50]:
# Normalised semester answers only, one column per course.
semesters = filtered[course_semester_codes]

def fill(x):
    """Fill a respondent's missing semester values from ``regular_sem``.

    Parameters
    ----------
    x : pandas.Series
        One row of normalised semester values, indexed by semester column code.

    Returns
    -------
    pandas.Series
        A new Series in which every NaN is replaced by the course's regular
        semester, raised to the row's maximum where the regular semester is
        lower. Labels without an entry in ``regular_sem`` stay NaN.

    The input row is not modified: mutating the object handed over by
    ``DataFrame.apply`` is unsupported by pandas. Filling by label also copes
    with ``regular_sem`` holding courses that are absent from the row.
    """
    m = x.max()
    # When the row is entirely NaN, m is NaN, the comparison is False
    # everywhere and the regular semesters are used unchanged.
    floor = regular_sem.mask(regular_sem < m, m)
    return x.fillna(floor)

# Feature matrix for the neighbour search: semester gaps are filled per
# respondent by fill(); every value still missing afterwards becomes 0.5.
filled_semesters = semesters.apply(fill, axis=1)

fi = filtered.copy()
fi.loc[:, course_semester_codes] = filled_semesters
fi = fi.fillna(0.5)

In [51]:
# Neighbour index over the filled feature matrix; queries return the 4
# closest respondents by default. fit() returns the estimator itself.
knn = NearestNeighbors(n_neighbors=4).fit(fi)

In [81]:
def semester_rec(current, other):
    """Logistic weight in ``other - current`` with slope 12.

    The value tends to 1.0 when ``other`` is far below ``current``, equals
    0.55 when both are equal, and tends to 0.1 when ``other`` is far above.
    Works element-wise on numpy arrays as well as on scalars.
    """
    gap = other - current
    common = np.exp(12 * gap)
    penalty = (0.9 * common) / (1.0 + common)
    return 1.0 - penalty

# Module-level frame that holds the neighbours' semester values (one column
# per course) after get_recommendations() has run; empty until then.
sem = pd.DataFrame()

def get_recommendations(user_data):
    """Score courses for one user from the answers of their nearest neighbours.

    Parameters
    ----------
    user_data : sequence holding one feature row
        Passed unchanged to ``knn.kneighbors``. Only the first query row is
        scored; that row is also returned as ``current``.

    Returns
    -------
    results : pandas.Series
        Indexed by two-letter course code and sorted in descending order. For
        each course that at least one neighbour has answered, the value is the
        similarity-weighted mean of ``(rating + (1 - grade)) / 2`` over the
        neighbours flagged in ``exists_all``, with
        ``similarity = 1 / (distance + 1)``.
    current
        ``user_data[0]``, unchanged.

    Side effects
    ------------
    Rebinds the module-level ``sem`` to a fresh frame with the neighbours'
    semester values (rows: neighbours, columns: courses). Writing columns into
    the existing frame kept the first call's index, so later calls were left
    with stale, NaN-aligned data.
    """
    global sem

    current = user_data[0]

    distances, neighbour_users = knn.kneighbors(user_data)
    # Positional row numbers and similarity weights of the first query row.
    neighbour_rows = neighbour_users[0]
    weights = 1 / (distances[0] + 1)

    neighbour_exists = exists_all.iloc[neighbour_rows]

    # 0/1 matrix (neighbours x courses) restricted to courses that at least
    # one neighbour has answered; columns are renamed to the course code.
    taken = neighbour_exists.loc[:, course_rating_codes]
    taken = (
        taken.loc[:, taken.any()]
        .rename(columns=lambda code: code[0:2])
        .astype(float)
    )

    # Normalised answers of the neighbours for those same courses.
    answers = filtered.iloc[neighbour_rows].loc[:, neighbour_exists.any()]
    present = [code for code in course_codes if code + "01_01" in answers.columns]

    liking = pd.DataFrame({code: answers[code + "03"] for code in present})
    # Grades are normalised with 0 as the best grade; invert so that higher is better.
    grade_score = pd.DataFrame({code: 1.0 - answers[code + "01_01"] for code in present})
    sem = pd.DataFrame({code: answers[code + "02"] for code in present})

    # NOTE(review): a neighbour who gave only one of rating/grade contributes
    # 0 to the numerator while still counting in the denominator - confirm
    # that this is the intended penalty.
    score = ((liking + grade_score) / 2.0).fillna(0.0)

    weighted_score = score.mul(weights, axis=0).sum()
    weighted_presence = taken.mul(weights, axis=0).sum()

    results = (weighted_score / weighted_presence).sort_values(ascending=False)
    return results, current

In [82]:
# Recommend for the respondent whose index label is 2. That row is part of
# the data the neighbour index was fitted on, so the respondent is also
# returned as one of their own neighbours.
ratings, current = get_recommendations([fi.loc[2]])



In [83]:
# Semester values of the neighbours used by the last get_recommendations() call.
sem

Unnamed: 0,KP,RR,AI,BS,TI,PV,DS,HC,H2,RI,DL,BP,K1
2,,,,,,,,,0.0,,0.0,,
13,0.0,0.0,,,,,,,,,,,
10,0.0,0.0,0.2,0.4,,0.4,0.2,0.2,,,0.0,,0.4
12,0.0,0.0,0.2,0.4,,0.0,0.2,,,,0.0,0.4,0.4


In [86]:
# Semester features of the queried row (values come from `fi`, i.e. after gap filling).
pd.Series(current)[course_semester_codes]

KP02    0.0
RR02    0.0
AI02    0.2
A202    0.4
BS02    0.4
TI02    0.4
T202    0.8
PV02    0.8
P202    0.4
MM02    0.0
DS02    0.0
D202    0.4
SE02    0.6
KI02    0.8
CB02    0.6
HC02    0.4
H202    0.0
IT02    0.4
CG02    0.4
ES02    0.6
RI02    0.6
CS02    0.6
GD02    0.4
FP02    0.4
DL02    0.0
PR02    0.8
BP02    0.4
SR02    0.8
BA02    1.0
K102    0.4
IU02    0.4
Name: 2, dtype: float64