In [95]:
import math
import pandas as pd
import numpy as np
import os

In [96]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [97]:
# Dataset dimensions. The rating matrices below are allocated with
# USERS + 1 and MOVIES + 1 entries so that ids can be used directly as indices.
USERS = 943
MOVIES = 1682
RATINGS = 100_000

# Not referenced by any of the visible cells.
CORRECTION = 1e-8

# Number of cross-validation folds (files u1 .. u5).
FOLDS = 5

# Column names applied to the header-less rating files.
HEADER = ['user_id', 'movie_id', 'rating', 'timestamp']

# Neighbourhood sizes reported in the result tables (10, 20, ..., 50).
NEIGHBORS = list(range(10, 51, 10))

In [98]:
def read_data(file_name , path = "Data/ml-100k/" ):
    """Load one tab-separated ratings file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``path`` (for example ``"u1.base"``).
    path : str, optional
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns named by ``HEADER``.
    """
    # The separator must be given by keyword: since pandas 2.0 every
    # read_csv argument after the file path is keyword-only, so passing the
    # tab character positionally raises TypeError.
    return pd.read_csv(
        os.path.join(path, file_name),
        sep='\t',
        names=HEADER,
        encoding="ISO-8859-1",
    )

In [99]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error , accuracy_score
def mae_calculation(actual , predicted):
    """Return the mean absolute error between two rating sequences.

    Thin wrapper around ``sklearn.metrics.mean_absolute_error``: ``actual``
    is passed as the ground truth and ``predicted`` as the estimates.
    """
    error = mean_absolute_error(y_true=actual, y_pred=predicted)
    return error

In [100]:
def k_fold_normal(fold_count):
    """Predict the test ratings of one fold with movie-to-movie cosine similarity.

    Reads ``u<fold_count>.base`` and ``u<fold_count>.test`` through
    ``read_data``, builds a movie x user rating matrix from the training file
    (0 means "not rated") and, for every test row, predicts

        mean(movie) + sum(sim * (r - mean(neighbour))) / sum(sim)

    over the most similar movies that the same user rated in the training
    file. The running prediction is recorded after every 10 contributing
    neighbours, which yields one prediction per neighbourhood size
    10, 20, ..., 10 * FOLDS.

    Parameters
    ----------
    fold_count : int
        Fold number used to build the two file names.

    Returns
    -------
    actual_rating : numpy.ndarray, shape (n_test,)
        Ratings from the test file, in file order, as floats.
    predicted_rating : numpy.ndarray, shape (FOLDS + 1, n_test)
        Row ``k`` (1 <= k <= FOLDS) holds the prediction that uses at most
        ``10 * k`` neighbours; when fewer neighbours exist, the larger rows
        repeat the prediction built from all neighbours found. Row 0 is not
        a neighbourhood size and should be ignored by callers.
    """
    neighbor_step = 10  # a prediction is recorded every `neighbor_step` neighbours

    train_data = read_data("u" + str(fold_count) + ".base")
    test_data = read_data("u" + str(fold_count) + ".test")

    # Size everything from the file that was actually read, so a test file of
    # a different length neither overflows nor leaves zero-padded ratings.
    test_data_size = len(test_data)

    # user x movie matrix, filled in one vectorised assignment instead of a
    # row-by-row pandas lookup, then flipped to movie x user.
    rating_data = np.zeros(shape=(USERS + 1, MOVIES + 1), dtype=int)
    rating_data[
        train_data['user_id'].to_numpy(),
        train_data['movie_id'].to_numpy(),
    ] = train_data['rating'].to_numpy()
    rating_data = rating_data.transpose()

    # Movie-to-movie similarity; a movie must never be its own neighbour.
    movie_similarity = cosine_similarity(rating_data)
    np.fill_diagonal(movie_similarity, 0)

    # For each movie, every movie index ordered from most to least similar.
    top_neighbors = np.zeros(shape=(MOVIES + 1, MOVIES + 1), dtype=int)
    for movie in range(1, MOVIES + 1):
        top_neighbors[movie] = np.argsort(movie_similarity[movie])[::-1]

    # Mean of the non-zero (i.e. observed) ratings of each movie; 0 if none.
    rated_counts = np.count_nonzero(rating_data, axis=1)
    average_rating = np.divide(
        rating_data.sum(axis=1),
        rated_counts,
        out=np.zeros(MOVIES + 1),
        where=rated_counts > 0,
    )

    actual_rating = test_data['rating'].to_numpy(dtype=float)
    predicted_rating = np.zeros(shape=(FOLDS + 1, test_data_size), dtype=float)

    test_users = test_data['user_id'].to_numpy()
    test_movies = test_data['movie_id'].to_numpy()

    for position in range(test_data_size):
        user = test_users[position]
        movie = test_movies[position]

        baseline = average_rating[movie]
        similarity_row = movie_similarity[movie]

        num = 0
        den = 0
        neighbor_count = 0

        # NOTE(review): the scan stops one entry short of the end of the
        # ranking, so the least-similar entry is never considered -- confirm
        # this is intended.
        for neighbor in top_neighbors[movie][:-1]:
            neighbor_rating = rating_data[neighbor, user]
            if neighbor_rating == 0:
                # The user has not rated this neighbour in the training file.
                continue

            similarity = similarity_row[neighbor]
            num += similarity * (neighbor_rating - average_rating[neighbor])
            den += similarity
            neighbor_count += 1

            if neighbor_count % neighbor_step == 0:
                tier = neighbor_count // neighbor_step
                prediction = baseline
                if den != 0:
                    prediction += num / den
                predicted_rating[tier][position] = prediction
                if tier == FOLDS:
                    break

        # Every tier that was not completed (and the last completed one)
        # gets the prediction built from all neighbours that were found.
        final_prediction = baseline
        if den != 0:
            final_prediction += num / den

        first_open_tier = math.ceil(neighbor_count / neighbor_step)
        for tier in range(first_open_tier, FOLDS + 1):
            predicted_rating[tier][position] = final_prediction

    return actual_rating , predicted_rating

In [101]:
def variance_weight_calulation(matrix , indx):
    """Min-max scale one variance into a weight between 0 and 1.

    The scale runs from the smallest strictly positive entry of ``matrix``
    (weight 0) to its largest entry (weight 1).

    Parameters
    ----------
    matrix : array-like of float
        Variances, one per item; entries equal to 0 are treated as
        "no usable variance" and are excluded from the minimum.
    indx : int
        Index of the entry whose weight is wanted.

    Returns
    -------
    float
        ``(matrix[indx] - min_positive) / (max - min_positive)``, with three
        guarded cases:

        * ``0.0`` when ``matrix`` has no positive entry (nothing to scale
          against, instead of ``min()`` raising ``ValueError``);
        * ``1.0`` for a positive entry / ``0.0`` otherwise when all positive
          entries are equal (instead of dividing by zero);
        * ``0.0`` for an entry below the positive minimum, so a weight is
          never negative.
    """
    values = np.asarray(matrix, dtype=float)
    positive = values[values > 0]

    if positive.size == 0:
        return 0.0

    min_var = positive.min()
    spread = positive.max() - min_var

    if spread == 0:
        # All usable variances are identical: weight them uniformly.
        return (values[indx] > 0) * 1.0

    # Clamp at 0: a zero-variance entry sits below `min_var` and would
    # otherwise produce a negative weight.
    return np.maximum((values[indx] - min_var) / spread, 0.0)

In [102]:
def k_fold_variance(fold_count):
    """Predict the test ratings of one fold with variance-weighted similarity.

    Same procedure as the plain cosine-similarity predictor, except that each
    neighbour's similarity is multiplied by a variance weight obtained from
    ``variance_weight_calulation``:

        mean(movie) + sum(sim * w * (r - mean(neighbour))) / sum(sim * w)

    Reads ``u<fold_count>.base`` and ``u<fold_count>.test`` through
    ``read_data`` and builds a movie x user rating matrix from the training
    file (0 means "not rated"). The running prediction is recorded after
    every 10 contributing neighbours.

    Parameters
    ----------
    fold_count : int
        Fold number used to build the two file names.

    Returns
    -------
    actual_rating : numpy.ndarray, shape (n_test,)
        Ratings from the test file, in file order, as floats.
    predicted_rating : numpy.ndarray, shape (FOLDS + 1, n_test)
        Row ``k`` (1 <= k <= FOLDS) holds the prediction that uses at most
        ``10 * k`` neighbours; when fewer neighbours exist, the larger rows
        repeat the prediction built from all neighbours found. Row 0 is not
        a neighbourhood size and should be ignored by callers.
    """
    neighbor_step = 10  # a prediction is recorded every `neighbor_step` neighbours

    train_data = read_data("u" + str(fold_count) + ".base")
    test_data = read_data("u" + str(fold_count) + ".test")

    # Size everything from the file that was actually read, so a test file of
    # a different length neither overflows nor leaves zero-padded ratings.
    test_data_size = len(test_data)

    # user x movie matrix, filled in one vectorised assignment instead of a
    # row-by-row pandas lookup, then flipped to movie x user.
    rating_data = np.zeros(shape=(USERS + 1, MOVIES + 1), dtype=int)
    rating_data[
        train_data['user_id'].to_numpy(),
        train_data['movie_id'].to_numpy(),
    ] = train_data['rating'].to_numpy()
    rating_data = rating_data.transpose()

    # Movie-to-movie similarity; a movie must never be its own neighbour.
    movie_similarity = cosine_similarity(rating_data)
    np.fill_diagonal(movie_similarity, 0)

    # For each movie, every movie index ordered from most to least similar.
    top_neighbors = np.zeros(shape=(MOVIES + 1, MOVIES + 1), dtype=int)
    for movie in range(1, MOVIES + 1):
        top_neighbors[movie] = np.argsort(movie_similarity[movie])[::-1]

    # Mean of the non-zero (i.e. observed) ratings of each movie; 0 if none.
    rated_counts = np.count_nonzero(rating_data, axis=1)
    average_rating = np.divide(
        rating_data.sum(axis=1),
        rated_counts,
        out=np.zeros(MOVIES + 1),
        where=rated_counts > 0,
    )

    # Variance of the observed ratings of each movie; 0 if it has none.
    variance_matrix = np.zeros(MOVIES + 1)
    for movie in range(1, MOVIES + 1):
        observed = rating_data[movie][rating_data[movie] != 0]
        if observed.size > 0:
            variance_matrix[movie] = observed.std() ** 2

    # A movie's weight depends only on `variance_matrix`, which no longer
    # changes, so each weight is computed once and reused instead of being
    # recomputed for every neighbour of every test rating.
    variance_weights = {}

    actual_rating = test_data['rating'].to_numpy(dtype=float)
    predicted_rating = np.zeros(shape=(FOLDS + 1, test_data_size), dtype=float)

    test_users = test_data['user_id'].to_numpy()
    test_movies = test_data['movie_id'].to_numpy()

    for position in range(test_data_size):
        user = test_users[position]
        movie = test_movies[position]

        baseline = average_rating[movie]
        similarity_row = movie_similarity[movie]

        num = 0
        den = 0
        neighbor_count = 0

        # NOTE(review): the scan stops one entry short of the end of the
        # ranking, so the least-similar entry is never considered -- confirm
        # this is intended.
        for neighbor in top_neighbors[movie][:-1]:
            neighbor_rating = rating_data[neighbor, user]
            if neighbor_rating == 0:
                # The user has not rated this neighbour in the training file.
                continue

            variance_weight = variance_weights.get(neighbor)
            if variance_weight is None:
                variance_weight = variance_weight_calulation(variance_matrix , neighbor)
                variance_weights[neighbor] = variance_weight

            similarity = similarity_row[neighbor]
            average_diff = neighbor_rating - average_rating[neighbor]

            num += (similarity * average_diff * variance_weight)
            den += (similarity * variance_weight)
            neighbor_count += 1

            if neighbor_count % neighbor_step == 0:
                tier = neighbor_count // neighbor_step
                prediction = baseline
                if den != 0:
                    prediction += num / den
                predicted_rating[tier][position] = prediction
                if tier == FOLDS:
                    break

        # Every tier that was not completed (and the last completed one)
        # gets the prediction built from all neighbours that were found.
        final_prediction = baseline
        if den != 0:
            final_prediction += num / den

        first_open_tier = math.ceil(neighbor_count / neighbor_step)
        for tier in range(first_open_tier, FOLDS + 1):
            predicted_rating[tier][position] = final_prediction

    return actual_rating , predicted_rating

In [103]:
def output_results(_type):
    """Build the MAE table (folds x neighbourhood sizes) for one predictor.

    Runs the chosen predictor on folds 1..FOLDS, rounds its predictions to
    whole ratings, and scores each neighbourhood size with
    ``mae_calculation``.

    Parameters
    ----------
    _type : str
        ``"normal"`` to use ``k_fold_normal`` or ``"variance"`` to use
        ``k_fold_variance``.

    Returns
    -------
    pandas.DataFrame or None
        Rows ``"Fold 1" .. "Fold <FOLDS>"`` plus ``"Average"``; columns
        ``"K=10" .. "K=<10*FOLDS>"`` plus ``"Average"``. The last column is
        the mean of each row and the last row the mean of each column over
        the folds. For any other ``_type`` the function prints
        ``"Wrong Type"`` and returns ``None``.
    """
    # Reject a bad selector before any fold is processed.
    if _type not in ("normal", "variance"):
        print("Wrong Type")
        return None

    col_heads = ["K=" + str(tier * 10) for tier in range(1, FOLDS + 1)]
    col_heads.append('Average')

    row_heads = ["Fold " + str(fold) for fold in range(1, FOLDS + 1)]
    row_heads.append('Average')

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.0, and growing a frame
    # row by row copies it on every step.
    rows = []
    for fold in range(1, FOLDS + 1):
        if _type == "normal":
            actual, predicted = k_fold_normal(fold)
        else:
            actual, predicted = k_fold_variance(fold)

        # Row 0 of `predicted` is not a neighbourhood size, hence 1..FOLDS.
        mae = [
            mae_calculation(actual, np.round(predicted[tier]))
            for tier in range(1, FOLDS + 1)
        ]
        mae.append(sum(mae) / FOLDS)
        rows.append(mae)

    # Column-wise mean over the folds (including the 'Average' column).
    rows.append([sum(column) / FOLDS for column in zip(*rows)])

    # object dtype lets a caller overwrite a single cell with a string
    # without running into a dtype-incompatibility error.
    return pd.DataFrame(rows, index=row_heads, columns=col_heads, dtype=object)

In [104]:
# MAE table for the plain cosine-similarity predictor (runs every fold).
normal_result = output_results(_type="normal")

In [105]:
# Overwrite the bottom-right cell (row 'Average', column 'Average') with an
# empty string -- presumably so the average-of-averages is not displayed.
# This mutates normal_result in place.
normal_result.at['Average' , 'Average'] = ""

In [110]:
# Styled view of the table: a caption rendered in cyan at 20px.
show_normal_result = (
    normal_result.style
    .set_caption("Table: Normal Cosine Similarity")
    .set_table_styles([
        dict(selector='caption', props=[('color', 'cyan'), ('font-size', '20px')]),
    ])
)
show_normal_result

Unnamed: 0,K=10,K=20,K=30,K=40,K=50,Average
Fold 1,0.695,0.691,0.69485,0.69565,0.69925,0.69515
Fold 2,0.6849,0.67925,0.68075,0.68335,0.6859,0.68283
Fold 3,0.6897,0.6866,0.6878,0.68975,0.69075,0.68892
Fold 4,0.6848,0.68235,0.685,0.68585,0.6878,0.68516
Fold 5,0.6937,0.6895,0.69045,0.69255,0.69515,0.69227
Average,0.68962,0.68574,0.68777,0.68943,0.69177,


In [107]:
# MAE table for the variance-weighted predictor (runs every fold).
variance_result = output_results(_type="variance")

In [108]:
# Overwrite the bottom-right cell (row 'Average', column 'Average') with an
# empty string -- presumably so the average-of-averages is not displayed.
# This mutates variance_result in place.
variance_result.at['Average' , 'Average'] = ""

In [109]:
# Styled view of the table: a caption rendered in cyan at 20px.
show_variance_result = (
    variance_result.style
    .set_caption("Table: Variance Weighting")
    .set_table_styles([
        dict(selector='caption', props=[('color', 'cyan'), ('font-size', '20px')]),
    ])
)
show_variance_result

Unnamed: 0,K=10,K=20,K=30,K=40,K=50,Average
Fold 1,0.6959,0.69045,0.6944,0.6953,0.6979,0.69479
Fold 2,0.6867,0.6802,0.6822,0.68415,0.68685,0.68402
Fold 3,0.69275,0.68715,0.687,0.6886,0.6897,0.68904
Fold 4,0.68975,0.6826,0.68435,0.6854,0.68745,0.68591
Fold 5,0.69365,0.6889,0.6894,0.6909,0.692,0.69097
Average,0.69175,0.68586,0.68747,0.68887,0.69078,
