## Collaborative Filtering Assignment 1
### Name: Shaney Waris
### Roll no.: 2018308

In [None]:
# Imports used throughout this notebook (NumPy, pandas and scikit-learn).
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error

In [None]:
# Training Dataset.
# One dataframe per cross-validation fold. Fold N must be read from uN.base so
# that it pairs with uN.test below; reading any other fold's base file would
# put that fold's test ratings into the training data.
u1Base_df = pd.read_csv('./Dataset/ml-100k/u1.base', sep='\t', names=['userId', 'itemId', 'Rating', 'Timestamp'])
u2Base_df = pd.read_csv('./Dataset/ml-100k/u2.base', sep='\t', names=['userId', 'itemId', 'Rating', 'Timestamp'])
u3Base_df = pd.read_csv('./Dataset/ml-100k/u3.base', sep='\t', names=['userId', 'itemId', 'Rating', 'Timestamp'])
u4Base_df = pd.read_csv('./Dataset/ml-100k/u4.base', sep='\t', names=['userId', 'itemId', 'Rating', 'Timestamp'])
u5Base_df = pd.read_csv('./Dataset/ml-100k/u5.base', sep='\t', names=['userId', 'itemId', 'Rating', 'Timestamp'])

In [None]:
# Testing Dataset.
# One dataframe per cross-validation fold, read from u1.test ... u5.test.
# The files are tab-separated and the column names are supplied explicitly.
u1Test_df, u2Test_df, u3Test_df, u4Test_df, u5Test_df = [
    pd.read_csv(
        './Dataset/ml-100k/u' + str(fold) + '.test',
        sep='\t',
        names=['userId', 'itemId', 'Rating', 'Timestamp'],
    )
    for fold in range(1, 6)
]

## User Based Collaborative Filtering.

In [None]:
# Fold-ordered collections of the dataframes loaded above: position k of each
# list holds the training / testing split used for fold k + 1.
Train_df = [
    u1Base_df,
    u2Base_df,
    u3Base_df,
    u4Base_df,
    u5Base_df,
]
Test_df = [
    u1Test_df,
    u2Test_df,
    u3Test_df,
    u4Test_df,
    u5Test_df,
]

In [None]:
def predict_rating_UB(userId, itemId, train_pt_df, sm_df, tau):
    """Predict the rating of ``userId`` for ``itemId`` with user-based CF.

    The prediction is the similarity-weighted average of the ratings given to
    the item by every user whose similarity to ``userId`` is at least ``tau``.

    Parameters
    ----------
    userId : label of the active user; looked up as a column of ``sm_df``.
    itemId : label of the target item; looked up as a column of ``train_pt_df``.
    train_pt_df : DataFrame with one row per user and one column per item,
        where 0 means "not rated".
    sm_df : DataFrame of user-user similarities. The column selected for
        ``userId`` must share its index with the rows of ``train_pt_df``
        because the two are filtered with a single boolean mask.
    tau : minimum similarity a neighbour needs in order to be used.

    Returns
    -------
    -1 when the user or the item is unknown to the training data (callers
    treat -1 as "no prediction"); the plain mean of the item's non-zero
    training ratings when no usable neighbour remains (this also increments
    the module-level ``coverage`` counter); otherwise the weighted average.

    Notes
    -----
    ``coverage`` must already exist as a module-level integer before a
    fallback happens, because it is incremented through ``global``.
    """
    global coverage

    # A user or a movie that never appears in the training split cannot be
    # predicted; report the sentinel instead of raising.
    try:
        users_similarities = sm_df[userId]   # similarity of 'userId' to every user
        users_ratings = train_pt_df[itemId]  # every user's rating of 'itemId'
    except KeyError:
        return -1

    # Keep only neighbours that are similar enough AND have rated the item.
    usable = (users_similarities >= tau) & (users_ratings != 0)
    neighbour_ratings = users_ratings[usable]
    neighbour_similarities = users_similarities[usable]
    similarity_total = neighbour_similarities.sum()

    # Coverage problem: no neighbour survived the filters, or the surviving
    # similarities sum to zero (only possible when tau <= 0) so the weighted
    # average is undefined. Fall back to the item's average rating.
    if neighbour_ratings.empty or similarity_total == 0:
        coverage = coverage + 1
        observed = users_ratings[users_ratings != 0]
        return observed.sum() / len(observed)

    # Weighted ratings normalised by the sum of the similarities.
    return np.dot(neighbour_ratings, neighbour_similarities) / similarity_total

In [None]:
# This function takes the training & testing dataframes and returns the MAE.
def UB_MAE(train_df, test_df, tau):
    """Return the MAE of user-based CF predictions for one train/test split.

    Parameters
    ----------
    train_df : DataFrame with 'userId', 'itemId' and 'Rating' columns used to
        build the user x item matrix and the user-user similarities.
    test_df : DataFrame with 'userId', 'itemId' and 'Rating' columns; every
        row is predicted and compared with its 'Rating'.
    tau : similarity threshold forwarded to ``predict_rating_UB``.

    Returns
    -------
    Mean absolute error over the test rows for which ``predict_rating_UB``
    produced a prediction (rows answered with -1 are skipped).
    """
    # Pivot the training dataframe into a user x item matrix.
    train_pt_df = pd.pivot_table(train_df, values='Rating', index='userId', columns='itemId')
    # Replace the NA values with 0, which then means "not rated" (the notebook
    # author observed that no user in the dataset rated any movie 0).
    train_pt_df = train_pt_df.fillna(0)

    # User-user cosine similarities. The diagonal (similarity of a user with
    # themself) is zeroed on the ndarray before wrapping it in a DataFrame:
    # writing through `DataFrame.values` is not guaranteed to reach the frame
    # and is rejected as read-only when pandas Copy-on-Write is active.
    similarities = cosine_similarity(train_pt_df)
    np.fill_diagonal(similarities, 0)
    sm_df = pd.DataFrame(similarities, index=train_pt_df.index, columns=train_pt_df.index)

    # Predict every test row. Actual and predicted ratings are paired by
    # position, so the result does not depend on how test_df is indexed.
    new_actual_ratings = []
    new_predicted_ratings = []
    for userId, itemId, actual_rating in zip(test_df['userId'], test_df['itemId'], test_df['Rating']):
        predicted_rating = predict_rating_UB(userId, itemId, train_pt_df, sm_df, tau)
        # -1 means no prediction could be made for this row; leave it out.
        if predicted_rating == -1:
            continue
        new_actual_ratings.append(actual_rating)
        new_predicted_ratings.append(predicted_rating)

    # MAE between actual ratings & predicted ratings.
    return mean_absolute_error(new_actual_ratings, new_predicted_ratings)

In [None]:
# For all the threshold values.
for tau in [0.4, 0.5, 0.6, 0.7]:
    print("\n*** For tau =", tau, "***")
    cross_validation = []
    # Cross validation: each training split is paired with the test split at
    # the same position of Train_df / Test_df.
    for fold, (trainDataset, testDataset) in enumerate(zip(Train_df, Test_df), start=1):
        # Module-level counter incremented by predict_rating_UB every time it
        # falls back to the item average; reset it for each fold.
        coverage = 0
        mae_value = UB_MAE(trainDataset, testDataset, tau)  # MAE for this fold.
        cross_validation.append(mae_value)
        # Share of test rows that did NOT need the fallback, measured against
        # the real size of this fold's test set rather than a fixed count.
        coverage_pct = 100 - ((coverage * 100) / len(testDataset))
        print("FOLD " + str(fold) + ": Taking u" + str(fold) + ".test as Testing DataSet, MAE =", mae_value, ", Coverage = " + str(coverage_pct) + "%")
    print("MEAN Value =", sum(cross_validation) / len(cross_validation))


*** For tau = 0.4 ***
FOLD 1: Taking u1.test as Testing DataSet, MAE = 0.8475006447582307 , Coverage = 48.655%
FOLD 2: Taking u2.test as Testing DataSet, MAE = 0.8478761926090771 , Coverage = 63.47%
FOLD 3: Taking u3.test as Testing DataSet, MAE = 0.8397190856313358 , Coverage = 61.255%
FOLD 4: Taking u4.test as Testing DataSet, MAE = 0.8203078252670899 , Coverage = 68.91%
FOLD 5: Taking u5.test as Testing DataSet, MAE = 0.8186155541110084 , Coverage = 61.67%
MEAN Value = 0.8348038604753484

*** For tau = 0.5 ***
FOLD 1: Taking u1.test as Testing DataSet, MAE = 0.836024471030855 , Coverage = 7.969999999999999%
FOLD 2: Taking u2.test as Testing DataSet, MAE = 0.8345119280063997 , Coverage = 15.825000000000003%
FOLD 3: Taking u3.test as Testing DataSet, MAE = 0.8266754201144628 , Coverage = 16.810000000000002%
FOLD 4: Taking u4.test as Testing DataSet, MAE = 0.8105526572396692 , Coverage = 24.200000000000003%
FOLD 5: Taking u5.test as Testing DataSet, MAE = 0.8096246972385351 , Coverage

## Item Based Collaborative Filtering.

In [None]:
def predict_rating_IB(userId, itemId, train_pt_df, sm_df, K):
    """Predict the rating of ``userId`` for ``itemId`` with item-based CF.

    The prediction is the similarity-weighted average of the active user's
    ratings over those of the ``K`` items most similar to ``itemId`` that the
    user has actually rated.

    Parameters
    ----------
    userId : label of the active user; looked up as a column of ``train_pt_df``.
    itemId : label of the target item; looked up as a column of ``sm_df`` and,
        for the fallback, as a row of ``train_pt_df``.
    train_pt_df : DataFrame with one row per item and one column per user,
        where 0 means "not rated".
    sm_df : DataFrame of item-item similarities whose labels match the rows
        of ``train_pt_df``.
    K : number of most-similar items to consider.

    Returns
    -------
    -1 when the item or the user is unknown to the training data (callers
    treat -1 as "no prediction"); the plain mean of the item's non-zero
    training ratings when no usable neighbour remains (this also increments
    the module-level ``coverage`` counter); otherwise the weighted average.

    Notes
    -----
    ``coverage`` must already exist as a module-level integer before a
    fallback happens, because it is incremented through ``global``.
    """
    global coverage

    # An item or a user that never appears in the training split cannot be
    # predicted; report the sentinel instead of raising.
    try:
        items_similarities = sm_df[itemId]   # similarity of 'itemId' to every item
        users_ratings = train_pt_df[userId]  # ratings of 'userId' for every item
    except KeyError:
        return -1

    # The K most similar items (ties resolved in favour of the earlier item),
    # together with the active user's ratings for exactly those items.
    top_similarities = items_similarities.nlargest(K)
    top_ratings = users_ratings.loc[top_similarities.index]

    # Of those, keep only the items the active user has rated.
    rated = top_ratings != 0
    neighbour_ratings = top_ratings[rated]
    neighbour_similarities = top_similarities[rated]
    similarity_total = neighbour_similarities.sum()

    # Coverage problem: the user rated none of the top-K items, or every rated
    # neighbour has similarity 0 so the normalisation would be 0/0. Fall back
    # to the average rating of the movie.
    if neighbour_ratings.empty or similarity_total == 0:
        coverage = coverage + 1
        item_ratings = train_pt_df.loc[itemId]
        observed = item_ratings[item_ratings != 0]
        return observed.sum() / len(observed)

    # Ratings interpolated with the similarities normalised to sum to 1.
    return np.dot(neighbour_ratings, neighbour_similarities) / similarity_total

In [None]:
# This function takes the training & testing dataframes and returns the MAE.
def IB_MAE(train_df, test_df, K):
    """Return the MAE of item-based CF predictions for one train/test split.

    Parameters
    ----------
    train_df : DataFrame with 'userId', 'itemId' and 'Rating' columns used to
        build the item x user matrix and the item-item similarities.
    test_df : DataFrame with 'userId', 'itemId' and 'Rating' columns; every
        row is predicted and compared with its 'Rating'.
    K : neighbourhood size forwarded to ``predict_rating_IB``.

    Returns
    -------
    Mean absolute error over the test rows for which ``predict_rating_IB``
    produced a prediction (rows answered with -1 are skipped).
    """
    # Pivot the training dataframe and transpose it into an item x user matrix.
    train_pt_df = pd.pivot_table(train_df, values='Rating', index='userId', columns='itemId').T
    # Replace the NA values with 0, which then means "not rated" (the notebook
    # author observed that no user in the dataset rated any movie 0).
    train_pt_df = train_pt_df.fillna(0)

    # Item-item cosine similarities. The diagonal (similarity of an item with
    # itself) is zeroed on the ndarray before wrapping it in a DataFrame:
    # writing through `DataFrame.values` is not guaranteed to reach the frame
    # and is rejected as read-only when pandas Copy-on-Write is active.
    similarities = cosine_similarity(train_pt_df)
    np.fill_diagonal(similarities, 0)
    sm_df = pd.DataFrame(similarities, index=train_pt_df.index, columns=train_pt_df.index)

    # Predict every test row. Actual and predicted ratings are paired by
    # position, so the result does not depend on how test_df is indexed.
    new_actual_ratings = []
    new_predicted_ratings = []
    for userId, itemId, actual_rating in zip(test_df['userId'], test_df['itemId'], test_df['Rating']):
        predicted_rating = predict_rating_IB(userId, itemId, train_pt_df, sm_df, K)
        # -1 means no prediction could be made for this row; leave it out.
        if predicted_rating == -1:
            continue
        new_actual_ratings.append(actual_rating)
        new_predicted_ratings.append(predicted_rating)

    # MAE between actual ratings & predicted ratings.
    return mean_absolute_error(new_actual_ratings, new_predicted_ratings)

In [None]:
# For all the neighbourhood sizes.
for K in [10, 20, 30, 40]:
    print("\n*** For K =", K, "***")
    cross_validation = []
    # Cross validation: each training split is paired with the test split at
    # the same position of Train_df / Test_df.
    for fold, (trainDataset, testDataset) in enumerate(zip(Train_df, Test_df), start=1):
        # Module-level counter incremented by predict_rating_IB every time it
        # falls back to the item average; reset it for each fold.
        coverage = 0
        mae_value = IB_MAE(trainDataset, testDataset, K)   # MAE for this fold.
        cross_validation.append(mae_value)
        # Share of test rows that did NOT need the fallback, measured against
        # the real size of this fold's test set rather than a fixed count.
        coverage_pct = 100 - ((coverage * 100) / len(testDataset))
        print("FOLD " + str(fold) + ": Taking u" + str(fold) + ".test as Testing DataSet, MAE =", mae_value, ", Coverage = " + str(coverage_pct) + "%")
    print("MEAN Value =", sum(cross_validation) / len(cross_validation))


*** For K = 10 ***
FOLD 1: Taking u1.test as Testing DataSet, MAE = 0.8065361949096769 , Coverage = 88.09%
FOLD 2: Taking u2.test as Testing DataSet, MAE = 0.7939964674859856 , Coverage = 90.93%
FOLD 3: Taking u3.test as Testing DataSet, MAE = 0.7797270876387902 , Coverage = 91.13%
FOLD 4: Taking u4.test as Testing DataSet, MAE = 0.7696899678196355 , Coverage = 95.45%
FOLD 5: Taking u5.test as Testing DataSet, MAE = 0.7666360317959755 , Coverage = 94.355%
MEAN Value = 0.7833171499300129

*** For K = 20 ***
FOLD 1: Taking u1.test as Testing DataSet, MAE = 0.7884116848570262 , Coverage = 94.475%
FOLD 2: Taking u2.test as Testing DataSet, MAE = 0.7699057629281751 , Coverage = 95.77%
FOLD 3: Taking u3.test as Testing DataSet, MAE = 0.7681496007174117 , Coverage = 95.99%
FOLD 4: Taking u4.test as Testing DataSet, MAE = 0.760143479878848 , Coverage = 98.275%
FOLD 5: Taking u5.test as Testing DataSet, MAE = 0.7529098809777055 , Coverage = 97.965%
MEAN Value = 0.7679040818718332

*** For K = 