In [337]:
import math
import pandas as pd
import numpy as np
import os

In [338]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [339]:
USERS = 943
MOVIES = 1682
RATINGS = 100000
CORRECTION = 0.00000001
FOLDS = 5
HEADER = ['user_id', 'movie_id', 'rating', 'timestamp']
NEIGHBORS = [10 , 20 , 30 , 40 , 50]

In [340]:
def read_data(file_name , path = "Data/ml-100k/" ):
    data = pd.read_csv(os.path.join(path, file_name), '\t' , names = HEADER , encoding="ISO-8859-1")
    return data

In [341]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error , accuracy_score
def mae_calculation(actual , predicted):
    return mean_absolute_error(actual, predicted)

In [342]:
def k_fold_normal(fold_count):

    f = fold_count

    train_data = read_data("u" + str(f) + ".base")
    test_data = read_data("u" + str(f) + ".test")

    train_data_size = RATINGS // 5 * 4
    test_data_size = RATINGS // 5

    rating_data = np.zeros(shape = (USERS+1 , MOVIES+1) , dtype = int)

    for i in train_data.index:
        user = train_data['user_id'][i]
        movie = train_data['movie_id'][i]
        rating_data[user][movie] = train_data['rating'][i]

    similarity_users_matrix = cosine_similarity(rating_data)

    for i in range(1,USERS+1):
        similarity_users_matrix[i][i] = 0

    top_neighbors = np.zeros(shape = (USERS+1 , USERS+1) , dtype = int)

    for i in range(1,USERS+1):
        top_neighbors[i] = np.argsort(similarity_users_matrix[i])[::-1]

    average_rating = np.zeros(USERS+1)

    for i in range(1,USERS+1):
        total_rating_sum = np.sum(rating_data[i])
        rated_count = np.count_nonzero(rating_data[i])
        average_rating[i] = total_rating_sum / rated_count

    actual_rating = np.zeros(test_data_size)

    rating_count = 0
    for i in test_data['rating']:
        actual_rating[rating_count] = i
        rating_count+=1

    predicted_rating = np.zeros(shape = (FOLDS+1 , test_data_size) , dtype=float)

    rating_count = 0
    for i in test_data.index:

        active_user = test_data['user_id'][i]
        active_movie = test_data['movie_id'][i]

        j = 0
        neighbor_count = 0

        num = 0
        den = 0

        while j < len(top_neighbors[active_user]) - 1 :

            if neighbor_count%10 == 0 and neighbor_count!=0:

                k = neighbor_count // 10
                pred = average_rating[active_user]

                if den!=0:
                    pred += num/den

                predicted_rating[k][rating_count] = pred

                if k == FOLDS:
                    break

            current_user = top_neighbors[active_user][j]
            similarity = similarity_users_matrix[active_user][current_user]

            if rating_data[current_user][active_movie] == 0:
                j+=1
                continue
            
            average_diff = rating_data[current_user][active_movie] - average_rating[current_user]

            num += (similarity * average_diff)
            den += similarity

            j+=1
            neighbor_count+=1

        if neighbor_count <= 50:

            a = math.ceil(neighbor_count/10)
            b = average_rating[active_user]

            if den!=0:
                b += num/den

            for f in range(a,FOLDS+1):
                predicted_rating[f][rating_count] = b

        rating_count+=1

    return actual_rating , predicted_rating

In [343]:
def find_corated(List1 , List2):
    return np.count_nonzero(np.multiply(List1 , List2))

In [344]:
def k_fold_significance(fold_count):

    f = fold_count

    train_data = read_data("u" + str(f) + ".base")
    test_data = read_data("u" + str(f) + ".test")

    train_data_size = RATINGS // 5 * 4
    test_data_size = RATINGS // 5

    rating_data = np.zeros(shape = (USERS+1 , MOVIES+1) , dtype = int)

    for i in train_data.index:
        user = train_data['user_id'][i]
        movie = train_data['movie_id'][i]
        rating_data[user][movie] = train_data['rating'][i]

    corated_movies = np.zeros(shape = (USERS+1 , USERS+1) , dtype = int)

    for i in range(1 , USERS):
        for j in range(i+1 , USERS+1):
            corated_movies[i][j] = find_corated(rating_data[i] , rating_data[j])
            corated_movies[j][i] = corated_movies[i][j]

    similarity_users_matrix = cosine_similarity(rating_data)

    for i in range(1,USERS+1):
        similarity_users_matrix[i][i] = 0

    top_neighbors = np.zeros(shape = (USERS+1 , USERS+1) , dtype = int)

    for i in range(1,USERS+1):
        top_neighbors[i] = np.argsort(similarity_users_matrix[i])[::-1]

    average_rating = np.zeros(USERS+1)

    for i in range(1,USERS+1):
        total_rating_sum = np.sum(rating_data[i])
        rated_count = np.count_nonzero(rating_data[i])
        average_rating[i] = total_rating_sum / rated_count

    actual_rating = np.zeros(test_data_size)

    rating_count = 0
    for i in test_data['rating']:
        actual_rating[rating_count] = i
        rating_count+=1

    predicted_rating = np.zeros(shape = (FOLDS+1 , test_data_size) , dtype=float)

    rating_count = 0
    for i in test_data.index:

        active_user = test_data['user_id'][i]
        active_movie = test_data['movie_id'][i]

        j = 0
        neighbor_count = 0

        num = 0
        den = 0

        threshold = 15

        while j < len(top_neighbors[active_user]) - 1 :

            if neighbor_count%10 == 0 and neighbor_count!=0:

                k = neighbor_count // 10
                pred = average_rating[active_user]

                if den!=0:
                    pred += num/den

                predicted_rating[k][rating_count] = pred

                if k == FOLDS:
                    break

            current_user = top_neighbors[active_user][j]
            similarity = similarity_users_matrix[active_user][current_user]

            if rating_data[current_user][active_movie] == 0:
                j+=1
                continue

            significance = 1

            if corated_movies[active_user][current_user] < threshold:
                significance = corated_movies[active_user][current_user] / threshold
            
            average_diff = rating_data[current_user][active_movie] - average_rating[current_user]

            num += (similarity * average_diff * significance)
            den += similarity

            j+=1
            neighbor_count+=1

        if neighbor_count <= 50:

            a = math.ceil(neighbor_count/10)
            b = average_rating[active_user]

            if den!=0:
                b += num/den

            for f in range(a,FOLDS+1):
                predicted_rating[f][rating_count] = b

        rating_count+=1

    return actual_rating , predicted_rating

In [345]:
def output_results(_type):

    col_heads = []
    for i in range(1,FOLDS+1):
        st = "K=" + str(i*10)
        col_heads.append(st)
    col_heads.append('Average')

    row_heads = []
    for i in range(1,FOLDS+1):
        st = "Fold " + str(i)
        row_heads.append(st)
    row_heads.append('Average')

    result = pd.DataFrame(columns=col_heads)

    di = {}
    for i in col_heads:
        di[i] = 0

    for i in range(1,FOLDS+1):
        
        result = result.append(di , ignore_index=True)
        mae = []

        a,b = -1,-1

        if _type == "normal":
            a,b = k_fold_normal(i)
        elif _type == "significance":
            a,b = k_fold_significance(i)
        else:
            print("Wrong Type")
            return

        for j in range(1,FOLDS+1):
            mae.append(mae_calculation(a,np.round(b[j])))

        mae_sum = sum(mae)
        mae_av = mae_sum / FOLDS
        mae.append(mae_av)

        result.iloc[i-1] = mae

    result = result.append(di , ignore_index=True)
    nei_average = []

    for i in col_heads:
        val = 0
        for j in result[i]:
            val += j
        nei_average.append(val / FOLDS)

    result.iloc[FOLDS] = nei_average

    result.index = row_heads

    return result

In [346]:
normal_result = output_results("normal")

In [350]:
normal_result.at['Average' , 'Average'] = ""

In [356]:
show_normal_result = normal_result.style.set_caption("Table: Normal Cosine Similarity").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'cyan'),
        ('font-size', '20px')
    ]
}])
show_normal_result

Unnamed: 0,K=10,K=20,K=30,K=40,K=50,Average
Fold 1,0.7236,0.7158,0.71305,0.71345,0.7147,0.71612
Fold 2,0.71315,0.69995,0.6983,0.69985,0.70115,0.70248
Fold 3,0.71145,0.6992,0.6988,0.70005,0.70335,0.70257
Fold 4,0.7058,0.69695,0.69695,0.696,0.6994,0.69902
Fold 5,0.7183,0.70125,0.6998,0.70405,0.70285,0.70525
Average,0.71446,0.70263,0.70138,0.70268,0.70429,


In [348]:
significance_result = output_results("significance")

In [352]:
significance_result.at['Average' , 'Average'] = ""

In [353]:
show_significance_result = significance_result.style.set_caption("Table: Significance Weighting").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'cyan'),
        ('font-size', '20px')
    ]
}])
show_significance_result

Unnamed: 0,K=10,K=20,K=30,K=40,K=50,Average
Fold 1,0.7229,0.71865,0.7156,0.7166,0.71655,0.71806
Fold 2,0.71235,0.69995,0.6991,0.7003,0.7015,0.70264
Fold 3,0.71,0.6996,0.70045,0.7019,0.7059,0.70357
Fold 4,0.70505,0.69735,0.6967,0.69625,0.6989,0.69885
Fold 5,0.71835,0.70155,0.70075,0.70415,0.70355,0.70567
Average,0.71373,0.70342,0.70252,0.70384,0.70528,
