In [107]:
import pandas as pd
import numpy as np
import math
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import json
from collections import ChainMap

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [108]:
with open('./allCourses.json', 'r') as f:
    data = json.load(f)

def fa(x):
    return { x["key"] + "02" : (int(x["regular_semester"][0]) - 1) / 5.0 }

regular_sem = pd.Series(data=dict(ChainMap(*map(fa, data))))

In [109]:
course_codes = ["KP", "RR", "AI", "A2", "BS", "TI", "T2", "PV", "P2", "MM", "DS", "D2", "SE", "KI", "CB", "HC", "H2", "IT", "CG", "ES", "RI", "CS", "GD", "FP", "DL", "PR", "BP", "SR", "BA", "K1", "IU"]
course_rating_codes = list()
course_grade_codes = list()
course_semester_codes = list()
course_data_codes = list()

for c in course_codes:
    course_data_codes.append(c + "01_01")
    course_data_codes.append(c + "02")
    course_data_codes.append(c + "03")

    course_grade_codes.append(c + "01_01")
    course_semester_codes.append(c + "02")
    course_rating_codes.append(c + "03")

temp = course_data_codes + ["FINISHED"]

data = pd.read_csv('data_tutorial316936_2022-06-15_10-39.csv', encoding = "ISO-8859-1")

filtered = pd.DataFrame()
for i in temp:
    if i in data.columns:
        filtered[i] = data[i]
    else:
        filtered[i] = np.NAN

filtered = filtered.loc[filtered["FINISHED"] == "1"].drop(["FINISHED"], axis=1)

exists = pd.DataFrame()
exists_all = pd.DataFrame()
for i in course_codes:
    exists[i] = filtered[[i + "01_01", i + "02", i + "03"]].any(axis=1)
    exists_all[i + "01_01"] = exists[i]
    exists_all[i + "02"] = exists[i]
    exists_all[i + "03"] = exists[i]

In [110]:
#labels = pd.read_csv('values_tutorial316936_2022-06-15_11-18.csv', encoding = "ISO-8859-1")
#filtered

In [111]:
#Normalization
grades = [1.0, 1.3, 1.7, 2.0, 2.3, 2.7, 3.0, 3.3, 3.7, 4.0, 5.0]

def grade(i):
    if math.isnan(float(i)) or i == "-9":
        return np.NAN
    return (grades[int(i) - 1] - 1.0) / 4.0
    
def semester(i):
    if math.isnan(float(i)) or i == "-9":
        return np.NAN
    return (int(i) - 1) / 5.0#np.exp(int(i) * np.log(2.0) / 6.0) - 1.0 #

def like(i):
    if math.isnan(float(i)) or i == "-9":
        return np.NAN
    return (int(i) - 1) / 2

for c in course_codes:
    c_grade = str(c) + "01_01"
    c_semester = str(c) + "02"
    c_like = str(c) + "03"
    filtered[c_grade] = filtered[c_grade].map(grade)
    filtered[c_semester] = filtered[c_semester].map(semester)
    filtered[c_like] = filtered[c_like].map(like)

In [112]:
semesters = filtered.loc[:, course_semester_codes]

def fill(x):
    m = x.max()
    cp = regular_sem.copy()
    cp[cp < m] = m

    x[x.isna()] = cp[x.isna()]
    
    return x

fi = filtered.copy()
fi.loc[:, course_semester_codes] = semesters.apply(fill, axis=1)
fi = fi.fillna(0.5)

In [113]:
knn = NearestNeighbors(n_neighbors=4)
knn.fit(fi)

In [135]:
def semester_rec(current, other):
    common = np.exp(5*(other-current))
    return 1.0 - 0.9 * common / (1.0 + common)

sem = pd.DataFrame()

def get_recommendations(user_data):
    current = pd.Series(user_data[0])[course_semester_codes].max()

    distances, neighbour_users = knn.kneighbors(user_data)
    
    similarity = 1 / (distances + 1)

    a = exists_all.iloc[neighbour_users.flatten()].loc[:, course_rating_codes]
    a = a.loc[:, a.any()]
    for i in a.columns:
        a[i[0:2]] = a[i]
    a = a.drop(set(course_rating_codes).intersection(set(a.columns)), axis=1)
    a = a.replace(False, 0.0).replace(True, 1)

    #print(a)

    temp = filtered.iloc[neighbour_users.flatten()].loc[:, exists_all.iloc[neighbour_users.flatten()].any()]
    temp2 = pd.DataFrame()
    temp_grade = pd.DataFrame()
    #temp_sem = pd.DataFrame()
    for i in course_codes:
        if i + "01_01" in temp.columns:
            temp2[i] = temp[[i + "03"]]
            temp_grade[i] = 1.0 - temp[[i + "01_01"]]
            sem[i] = temp[[i + "02"]].apply(lambda x: semester_rec(current, x))
    
    print(sem)
            #temp_sem[i] = 
    #temp2[temp2.notna()] += 0.2
    #print((temp2.fillna(0.0) + temp_grade.fillna(0.0)) / 2.0)

    qwe = pd.DataFrame()
    rtz = pd.DataFrame()
    for col in temp2.columns:
        qwe[col] = ((temp2[col] + temp_grade[col] + sem[col]) / 2.0).fillna(0.0).dot(similarity.T)
        #rtz[col] = .fillna(0.0).dot(similarity.T)
    
    #print(temp_grade)

    #print(temp2[col].fillna(0.0).dot(similarity.T) + temp_grade[col].fillna(0.0).dot(similarity.T))

    

    asdf = pd.DataFrame()
    for col in a.columns:
        asdf[col] = a[col].fillna(0.0).dot(similarity.T)

    #print(asdf)

    results = (qwe / asdf).iloc[0].sort_values(ascending=False)
    return results

In [136]:
get_recommendations([fi.loc[2]])

          KP        RR        AI        BS  TI        PV        DS        HC  \
2        NaN       NaN       NaN       NaN NaN       NaN       NaN       NaN   
13  0.993976  0.993976       NaN       NaN NaN       NaN       NaN       NaN   
10  0.993976  0.993976  0.983812  0.957317 NaN  0.957317  0.983812  0.983812   
12  0.993976  0.993976  0.983812  0.957317 NaN  0.993976  0.983812       NaN   

          H2  RI        DL        BP        K1  
2   0.993976 NaN  0.993976       NaN       NaN  
13       NaN NaN       NaN       NaN       NaN  
10       NaN NaN  0.993976       NaN  0.957317  
12       NaN NaN  0.993976  0.957317  0.957317  




RR    0.936139
DL    0.929984
HC    0.750000
K1    0.742641
AI    0.684557
KP    0.656219
PV    0.621321
H2    0.500000
DS    0.496321
BP    0.462500
BS    0.363962
TI    0.000000
RI    0.000000
Name: 0, dtype: float64

In [138]:
def pr_sem(sem):
    if sem == 0:
        return 1.0
    return (int(sem) - 1) / 5.0#np.exp(int(sem) * np.log(2.0) / 6.0) - 1.0 #

def pr_like(like):
    return (2 - like) / 2.0

def pr_grade(grade):
    return (grade - 1.0) / 4.0

def process_input(x):
    df = pd.Series(index=course_data_codes, dtype=np.float64)
    for c in x:
        df[c["course"] + "01_01"] = pr_grade(float(c["grade"]))
        df[c["course"] + "02"] = pr_sem(int(c["semester"]))
        df[c["course"] + "03"] = pr_like(int(c["like"]))

    df[course_semester_codes] = df[course_semester_codes].max()
    df = df.fillna(0.5)
    return df

In [142]:
query = [{"course":"KP","semester":1,"like":0,"grade":"1.7"},{"course":"RR","semester":2,"like":1,"grade":"2.0"}]
get_recommendations([process_input(query)])

          KP        RR    AI        BS  TI        PV    DS    HC        H2  \
2        NaN       NaN   NaN       NaN NaN       NaN   NaN   NaN  0.757953   
13  0.757953  0.757953   NaN       NaN NaN       NaN   NaN   NaN       NaN   
10  0.757953  0.757953  0.55  0.342047 NaN  0.342047  0.55  0.55       NaN   
12  0.757953  0.757953  0.55  0.342047 NaN  0.757953  0.55   NaN       NaN   

    RI        DL        BP        K1  
2  NaN  0.757953       NaN       NaN  
13 NaN       NaN       NaN       NaN  
10 NaN  0.757953       NaN  0.342047  
12 NaN  0.757953  0.342047  0.342047  




RR    0.942239
DL    0.939550
HC    0.750000
K1    0.740800
AI    0.683820
KP    0.667067
PV    0.620400
H2    0.500000
DS    0.495400
BP    0.462500
BS    0.361200
TI    0.000000
RI    0.000000
Name: 0, dtype: float64