In [1]:
%pylab inline
import turicreate as tc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
import random
from IPython.display import display
from scipy.spatial.distance import cosine
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

#Import libraries, remember to download eventual missing packages with pip or conda

Populating the interactive namespace from numpy and matplotlib


In [2]:
#Helper functions:
def trainingOrTest(ranks, total, splitCriteria):
    #Helper function
    #Need ranks to be an ordered set according to split criteria
    return (ranks/total) > splitCriteria

def getRanks_and_TotalCount(dataset,columnNames):
    #Helper function
    #Returns 2-dimensional list where 0-index in inner list is ranks, and 1-index is totalcount
    return [[dataset.groupby(columnName)['timestamp']
            .rank(method='first'),dataset[columnName]
            .map(ratings_data
            .groupby(columnName)['timestamp']
            .apply(len))] for columnName in columnNames]

def dataset_testAndTrainingData(dataset, columnNames, splitCriteria):
    #Adds a column denoting wether the datapoint is training or test data, where training data is the <splitCriteria>/1 first part
    #Output is of size 1 * len(columnNames) in a tuple, in order of: columnName1-dataset, columnName1-dataset, ...
    columnNames_ranksAndTotalCount = getRanks_and_TotalCount(dataset, columnNames)
    resultTuple = ()
    n=0
    for columnName in columnNames: 
        current_rating = pd.concat([dataset, trainingOrTest(columnNames_ranksAndTotalCount[n][0],columnNames_ranksAndTotalCount[n][1],splitCriteria)], axis=1, names=['testData'])
        current_rating.rename(columns = {0:'testData'}, inplace=True)
        resultTuple = resultTuple + (current_rating,)
        n+=1
    return resultTuple

def mergeDatasets(mainDataset, listOfTuple_datasetAndColumn):
    #merges mainDataset with a list of other datasets and their shared columns
    for tupleInList in listOfTuple_datasetAndColumn:
        mainDataset = pd.merge(mainDataset,tupleInList[0],on=tupleInList[1])
    return mainDataset

In [3]:
#read data
ratings_data = pd.read_csv('./u.data', sep='\t', names=['user_id','item_id','rating','timestamp'])
movie_titles_data = pd.read_csv('./u.item', sep='|', names=['item_id','movie_title','release_date'],encoding='latin_1', usecols=[0,1,2])
users_data = pd.read_csv('./u.user', sep='|', names=['user_id','age','gender','occupation'], usecols=[0,1,2,3])

#Label data as test or training data
user_rating, item_rating = dataset_testAndTrainingData(ratings_data, ['user_id','item_id'], 0.7) 

#Join movie titles and users with ratings
user_rating = mergeDatasets(user_rating, [(movie_titles_data,'item_id'),(users_data,'user_id')])
item_rating = mergeDatasets(item_rating, [(movie_titles_data,'item_id'),(users_data,'user_id')])

#Make test and training datasets
item_test = item_rating[item_rating.testData == 1]
item_training = item_rating[item_rating.testData == 0]
user_test = user_rating[user_rating.testData == 1]
user_training = user_rating[user_rating.testData == 0]

#Matrixify
item_training_matrix = item_training.pivot_table(index='movie_title', columns='user_id', values='rating')
item_training_matrix_num = item_training.pivot_table(index='item_id', columns='user_id', values='rating')
item_test_matrix = item_test.pivot_table(index='movie_title', columns='user_id', values='rating')
item_test_matrix_num = item_test.pivot_table(index='item_id', columns='user_id', values='rating')
user_training_matrix = user_training.pivot_table(index='user_id', columns='movie_title', values='rating')
user_test_matrix = user_test.pivot_table(index='user_id', columns='movie_title', values='rating')


#Add rating of 3 where data is in test but not in training
missing = item_test_matrix.columns.difference(item_training_matrix.columns)
item_training_matrix_complete = pd.concat([item_training_matrix,pd.DataFrame(columns=missing)])
item_training_matrix_complete[missing] = 3.0
item_training_matrix_complete_num = pd.concat([item_training_matrix_num,pd.DataFrame(columns=missing)])
item_training_matrix_complete_num[missing] = 3.0
missing = user_test_matrix.columns.difference(user_training_matrix.columns)
user_training_matrix_complete = pd.concat([user_training_matrix,pd.DataFrame(columns=missing)])
user_training_matrix_complete[missing] = 3.0

#Calculate mean and append as column
user_training_matrix_complete['mean'] = user_training_matrix_complete.mean(axis=1)
item_training_matrix['mean'] = item_training_matrix.mean(axis=1)
item_training_matrix_complete['mean'] = item_training_matrix_complete.mean(axis=1)
item_test_matrix['mean'] = item_test_matrix.mean(axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




## Question 1: 
 - RMSE at 2.4587

In [4]:
def RMSE(y_testDataSeries, y_trainingDataSeries):
    #Calculating RMSE, removing datafields that are only in the trainingDataset as we can only test values that are also in the testSet
    '''
    Considereing that the formula is measuring the difference between training dataframe and test dataframe, as dicussed in class
    '''
    trainingDataWithoutExcess = pd.concat([y_trainingDataSeries, y_testDataSeries], join='inner')
    res = (y_testDataSeries.subtract(trainingDataWithoutExcess) **2)
    res = res.sum(axis=1)
    return sqrt(res.sum()/(len(res.index)/2))


#Question Q1
answerToQ1 = RMSE(item_test_matrix,item_training_matrix_complete)

print(str(answerToQ1)+' - Answer to question 1 in the homework is the RMSE score of the model where training data is y_predicted and test data is y_target')

#Answer to Q1: Print means to console
print(item_training_matrix_complete['mean'])

#Answer to Q1: Save means to file
item_training_matrix_complete['mean'].to_csv(path_or_buf='./Means for Q1', sep='\t')

2.4587282639512806 - Answer to question 1 in the homework is the RMSE score of the model where training data is y_predicted and test data is y_target
'Til There Was You (1997)                                   2.974359
1-900 (1994)                                                2.986667
101 Dalmatians (1996)                                       2.925676
12 Angry Men (1957)                                         3.698113
187 (1997)                                                  2.960000
2 Days in the Valley (1996)                                 3.109489
20,000 Leagues Under the Sea (1954)                         3.172131
2001: A Space Odyssey (1968)                                3.695652
3 Ninjas: High Noon At Mega Mountain (1998)                 2.920000
39 Steps, The (1935)                                        3.345133
8 1/2 (1963)                                                3.163265
8 Heads in a Duffel Bag (1997)                              2.986486
8 Seconds (1994)      



## Question 3: 
 - RMSE at 0.49602, Tested case of random users was for users: (717 478 770 517 178 926 689 825 232 312)
 - Notice that Q3 is calculated before Q2 in this notebook, due to reusability of the functions from Q3
 - Low RMSE score is parcially due to the capping of ratings to be within 1 and 5, as discussed [here](https://www.researchgate.net/post/How_can_I_deal_with_negative_predictions_in_Collaborative_Filtering_systems)
 - https://www.researchgate.net/post/How_can_I_deal_with_negative_predictions_in_Collaborative_Filtering_systems

In [5]:
def makeNormalizedRatingsMatrix(ratingsMatrix):
    #Matrix of normalize ratings as difference between rating and users mean
    normalized_RatingsMatrix = ratingsMatrix.copy(deep=1)  
    return normalized_RatingsMatrix.sub(normalized_RatingsMatrix.mean(axis=1), axis=0)

def makePredictedRatings(mainUserSeries, trainingMatrix, normalizedMatrix, threshold_value):
    #Make matrix of weighted normalized rating, per pcc with 1 user. Pre-calculated normalized matrix to lower asymptotic runtime
    normalizedRatings = normalizedMatrix.copy(deep=1)
    normalizedRatings['pcc'] = trainingMatrix.corrwith(mainUserSeries, method='pearson', axis=1, drop=True)
    normalizedRatings = normalizedRatings[normalizedRatings.pcc >= threshold_value]
    pcc = normalizedRatings['pcc']
    
    pccSumMatrix = normalizedRatings.apply(lambda x : x.replace(x[x.notnull()],x.pcc), axis=1) 
   
    #Drop the pcc column and multply dataframe with pcc-weights in pcc series.
    normalizedRatings = normalizedRatings.drop(['pcc'], axis=1)
    pccSumMatrix = pccSumMatrix.drop(['pcc'], axis = 1)
    weightedNormalizedRatings = normalizedRatings.multiply(pcc, axis=0)

    #Nominator and denominator of prediction expression in class
    nominator = weightedNormalizedRatings.sum(axis=0, skipna=True)
    denominator = normalizedRatings.count(axis=0) 
    
    #remove 0-values, in order to not add the mean as predicted ratings to all remaining movies
    nominator = nominator[(denominator.T != 0)]
    denominator = denominator[(denominator.T != 0)]
    
    #Normally, wouldn't recommend already seen movies, both from training and test. but we need it too calculate RMSE
    #seenMoviesInTrainingData = trainingMatrix.loc[mainUserSeries.name]
    
    #To be able to calculate RMSE-scores, remove entries that are not in test_data  
    nominator = nominator.drop(mainUserSeries[pd.isnull(mainUserSeries)].index, errors='ignore')
    denominator = denominator.drop(mainUserSeries[pd.isnull(mainUserSeries)].index, errors='ignore')
    
    
    res = nominator.divide(denominator).add(mainUserSeries.mean())
    res = res.apply(lambda x: min(x,5)).apply(lambda x: max(x,1)) #As discussed here, limit values to be within boundaries: https://www.researchgate.net/post/How_can_I_deal_with_negative_predictions_in_Collaborative_Filtering_systems
    res = res.sort_values(axis=0, ascending=False)
    return res


In [6]:
def setThreshold():
    #userId = int(input("Type user_id of the user you want to predict ratings for: ")) #control logic for validating this input is outside of scope 
    while True: 
        num = float(input('Enter number between 0 and 1, with "." for decimal numbers: ')) 
        if 0 <= num <= 1: 
            print('Thanks!') 
            break 
        else: 
            print('number out of range')
    return num

def predictRatingsWrapper(userId, threshold): 
    res = makePredictedRatings(user_test_matrix.loc[userId], user_training_matrix, normalizedMatrix, threshold)
    res = movie_titles_data.copy().merge(res.rename('predicted_score'), left_on='movie_title', right_index=True)
    res = res.merge(user_test_matrix.loc[userId].rename('actualTestScore_ifAvailable'), left_on='movie_title', right_index=True)
    res = res.sort_values(by='predicted_score',axis=0, ascending=0)
    res = res.set_index('item_id')
    return res

def displayRandomUsers(amountOfUsers, headCount):
    threshold = setThreshold()
    predicted = []
    actual = []
    randomUsersIndexes = random.sample(list(user_test_matrix.index), amountOfUsers)
    for user in randomUsersIndexes:
        res = predictRatingsWrapper(user,threshold)
        display(res.head(headCount))
        print('User_id: ' + str(user))
        predicted.extend(res['predicted_score'].to_numpy().tolist())
        actual.extend(res['actualTestScore_ifAvailable'].to_numpy().tolist())
    print('\nRMSE of x amount of users predicted movie ratings: ' + str(RMSE_fromList(predicted, actual)))
    
    
def RMSE_fromList(predicted, actual):
    #Calculate RMSE scores
    sum=0
    for i in range (len(min(predicted,actual))):
        sum+=((actual[i] - predicted[i])**2)
    return sum/len((min(predicted,actual)))
    

#Calculate normalizedMatrix only once 
normalizedMatrix = makeNormalizedRatingsMatrix(user_training_matrix)

#automatically handled errors
np.seterr(divide='ignore', invalid='ignore')
displayRandomUsers(10, 3)
print('''\nHere we can see tendences for test data to contain top scores(5) for many movies in the set of highest predicted movies, 
See user "1" for instance, with <.head(20)> to see more entries. This is a good indication for the performance of the algorithm \n \n'''

'''The choice of threshold affects the scoring in a way where lower similarity thresholds gives a higher rating to "mainstream popular" movies 
and a higher similarity threshold gives a relatively higher score to more specified movies that are popular for the users group of similar people, 
though not as popular with the general public as the mainstream blockbusters''') 



Enter number between 0 and 1, with "." for decimal numbers: 0.8
Thanks!


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
313,Titanic (1997),01-Jan-1997,4.761182,5.0
50,Star Wars (1977),01-Jan-1977,4.48611,5.0
67,Ace Ventura: Pet Detective (1994),01-Jan-1994,4.276577,5.0


User_id: 394


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
285,Secrets & Lies (1996),04-Oct-1996,3.000991,4.0
845,That Thing You Do! (1996),28-Sep-1996,2.493635,4.0
278,Bed of Roses (1996),01-Jan-1996,2.296429,3.0


User_id: 837


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1142,When We Were Kings (1996),14-Feb-1997,4.622082,4.0
88,Sleepless in Seattle (1993),01-Jan-1993,4.59267,5.0
187,"Godfather: Part II, The (1974)",01-Jan-1974,4.553706,5.0


User_id: 298


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,"Usual Suspects, The (1995)",14-Aug-1995,5.0,5.0
56,Pulp Fiction (1994),01-Jan-1994,5.0,5.0
210,Indiana Jones and the Last Crusade (1989),01-Jan-1989,4.926279,5.0


User_id: 272


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
751,Tomorrow Never Dies (1997),01-Jan-1997,3.420214,5.0
245,"Devil's Own, The (1997)",26-Mar-1997,2.299069,4.0
887,Eve's Bayou (1997),01-Jan-1997,2.247006,4.0


User_id: 818


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
140,Homeward Bound: The Incredible Journey (1993),01-Jan-1993,4.813725,4.0
385,True Lies (1994),01-Jan-1994,4.522227,4.0
88,Sleepless in Seattle (1993),01-Jan-1993,4.160872,4.0


User_id: 505


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
313,Titanic (1997),01-Jan-1997,5.0,5.0
300,Air Force One (1997),01-Jan-1997,4.941959,5.0
298,Face/Off (1997),27-Jun-1997,4.790941,5.0


User_id: 304


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
794,It Could Happen to You (1994),01-Jan-1994,4.20304,4.0
974,Eye for an Eye (1996),01-Jan-1996,4.100123,5.0
72,"Mask, The (1994)",01-Jan-1994,4.077953,5.0


User_id: 436


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
64,"Shawshank Redemption, The (1994)",01-Jan-1994,4.20406,5.0
118,Twister (1996),10-May-1996,4.115938,4.0
79,"Fugitive, The (1993)",01-Jan-1993,3.771813,4.0


User_id: 690


  c = cov(x, y, rowvar)


Unnamed: 0_level_0,movie_title,release_date,predicted_score,actualTestScore_ifAvailable
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
19,Antonia's Line (1995),01-Jan-1995,5.0,5.0
50,Star Wars (1977),01-Jan-1977,5.0,5.0
275,Sense and Sensibility (1995),01-Jan-1995,5.0,5.0


User_id: 701

RMSE of x amount of users predicted movie ratings: 0.5299920636088244

Here we can see tendences for test data to contain top scores(5) for many movies in the set of highest predicted movies, 
See user "1" for instance, with <.head(20)> to see more entries. This is a good indication for the performance of the algorithm 
 
The choice of threshold affects the scoring in a way where lower similarity thresholds gives a higher rating to "mainstream popular" movies 
and a higher similarity threshold gives a relatively higher score to more specified movies that are popular for the users group of similar people, 
though not as popular with the general public as the mainstream blockbusters


## Question 2:
 - RMSE at 0.5598
 - Low RMSE score is parcially due to the capping of ratings to be within 1 and 5, as discussed [here](https://www.researchgate.net/post/How_can_I_deal_with_negative_predictions_in_Collaborative_Filtering_systems)
 - https://www.researchgate.net/post/How_can_I_deal_with_negative_predictions_in_Collaborative_Filtering_systems

In [8]:
#Q2
normalizedMatrixComplete = makeNormalizedRatingsMatrix(user_training_matrix_complete)
#Using same method as in Q3, makePredictedRatings, but here, using the complete training matrix. Complete matrix has value 3 added where values are missing in training data
def predictRatingsAndAddTrainingRating(userId, threshold): 
    #Using item_training_matrix_complete as it has training values for all test values
    predicted = makePredictedRatings(user_test_matrix.loc[userId], user_training_matrix_complete, normalizedMatrixComplete, threshold)
    
    """ In order to calculate mean squared error btween predicted values and mean in the same matrix. this would concatenate predicted values and already rated values in training data.
    Though we can only test our prediction on the data that is also in the test-set, so we won't concatenate this for the assignments tasks
    mergedPredictedAnsRated = pd.concat([predicted,user_training_matrix_complete.loc[userId]], ignore_index=False, join='inner')
    mergedPredictedAnsRated = mergedPredictedAnsRated[mergedPredictedAnsRated.notnull()].drop('mean')
    """

    res = movie_titles_data.merge(predicted.rename('predicted_score'), left_on='movie_title', right_index=True, how='right')
    res = res.drop_duplicates('movie_title') 
    res = res.merge(user_test_matrix.copy().loc[userId].rename('actualTestScore_ifAvailable'), left_on='movie_title', right_index=True)
    res = res.set_index('item_id')
    return res

def calculateAllUsersPredictionsAndPrintRMSE():
    #Function creates predicted scores for all users and then compares actual and predicted scores to calcualte the RMSE value
    threshold = 0.8
    predicted = []
    actual = []
    allUsersIndexes = list(user_test_matrix.index)  
    for user in allUsersIndexes:
        res = predictRatingsAndAddTrainingRating(user,threshold)   
        predicted.extend(res['predicted_score'].to_numpy().tolist())
        actual.extend(res['actualTestScore_ifAvailable'].to_numpy().tolist())
    print('\nQ2: RMSE of the entire "predicted ratings matrix": ' + str(RMSE_fromList(predicted, actual)))

print('Q2 in progress:')
calculateAllUsersPredictionsAndPrintRMSE()

Q2 in progress:


  c = cov(x, y, rowvar)



Q2: RMSE of the entire "predicted ratings matrix": 0.5598326373543332


## Question 4:
 - RMSE at 2.6872
 - Here predicted ratings are not capped to be within 1 and 5

In [7]:
#Q4 
print('Q4 in progress')
#Item based model
minimum_cosine_value=0.8
#For cosine similarity, pandas does not have methods for calculating cosine similarity with dataFrame operations. 
#Here, the cosine similarity matrix is built by iteration over the dataFrame
trainingMatrix = item_training_matrix_complete_num.copy()
cosineSimMatrix=pd.DataFrame(-1.000, index=trainingMatrix.columns, columns=trainingMatrix.columns)
print("Loop: 1/3")
for i in trainingMatrix.columns: 
    for k in trainingMatrix.columns:
        if k > i:
            temp = ~np.logical_or(np.isnan(trainingMatrix[i]), np.isnan(trainingMatrix[k]))
            cos = 1 - spatial.distance.cosine(np.compress(temp, trainingMatrix[i]), np.compress(temp, trainingMatrix[k]))
            if cos>= minimum_cosine_value:
                cosineSimMatrix[i][k] = cos
                cosineSimMatrix[k][i] = cos

#Predict the ratings based on cosine similarity weights
testMatrix = item_test_matrix_num.copy()
predictedItemRatingsMatrix = pd.DataFrame('nan', index=testMatrix.index, columns=testMatrix.columns)
print("Loop: 2/3")
for i in testMatrix.columns:
    for k in testMatrix.index:
        if np.isnan(testMatrix[i][k]) != True:
            if k in trainingMatrix.index:
                item_cosine_correlationMatrix=cosineSimMatrix[i][cosineSimMatrix[i] > 0].sort_values(ascending=False)
                n_item_list = item_cosine_correlationMatrix.index
                sum_of_weightedNormalizedRatings = 0.0
                for j in item_cosine_correlationMatrix.index:
                    if np.isnan(trainingMatrix[j][k]) == False:
                        sum_of_weightedNormalizedRatings += (item_cosine_correlationMatrix[j] * trainingMatrix[j][k])
                        predictedItemRatingsMatrix[i][k] = (1/sum(item_cosine_correlationMatrix))*sum_of_weightedNormalizedRatings

# RMSE calculation
print("Loop: 3/3")
denominator = 0
squaredErrorSum = 0.0
for i in predictedItemRatingsMatrix.columns:
    for k in predictedItemRatingsMatrix.index:
        if predictedItemRatingsMatrix[i][k] != 'nan':
            denominator += 1
            squaredErrorSum += (item_test_matrix_num[i][k] - predictedItemRatingsMatrix[i][k])**2

RMSE = sqrt(squaredErrorSum/denominator)
print('Q4:\nRMSE of the entire predicted ratings matrix, predicted based on item based collaborative filering: ' + str(RMSE))

Q4 in progress
Loop: 1/3


  return getattr(obj, method)(*args, **kwds)
  avg = a.mean(axis)


Loop: 2/3
Loop: 3/3
Q4:
RMSE of the entire predicted ratings matrix, predicted based on item based collaborative filering: 2.687238950294136


In [None]:
#Work in progess attempt to implement fast calculation of Q4
'''
def makeNormalizedItemMatrix(ratingsMatrix):
    #Matrix of normalize ratings as difference between rating and users mean
    normalized_RatingsMatrix = ratingsMatrix.copy(deep=1)  
    return normalized_RatingsMatrix.sub(normalized_RatingsMatrix.mean(axis=0), axis=1).drop('mean', axis=1)



threshold=0.8
normalizedItemMatrix = makeNormalizedItemMatrix(item_training_matrix)



def itemBasedCollaborativeFilter(mainItemSeries, trainingMatrix, normalizedMatrix, threshold):
    #Make matrix of weighted normalized rating, per cosine with 1 user. Pre-calculated normalized matrix to lower asymptotic runtime
    weightedNormalizedRatings = normalizedMatrix.copy(deep=1)
    weightedNormalizedRatings.to_csv(path_or_buf='./test')
    
    a = weightedNormalizedRatings.apply(lambda x : print(1-cosine(x,mainItemSeries)), axis=1)  #x.replace(x[x.notnull()],x.pcc)
    
    
    a = cosine_similarity(weightedNormalizedRatings)
    #print(a)
    
    weightedNormalizedRatings['cos'] = (1-cosine(trainingMatrix,mainItemSeries))
    weightedNormalizedRatings = weightedNormalizedRatings[weightedNormalizedRatings.pcc >= threshold_value]
    cos = weightedNormalizedRatings['cos']
    

    return 0

itemBasedCollaborativeFilter(item_training_matrix_complete.iloc[1],item_training_matrix_complete, normalizedItemMatrix, 0.8)
'''

In [None]:
''' 
Implementation of PCC calculation proved to be unneccecary, as it exists a library function for that functionality
'''
def pearsonCorrelationCoefficient(mainUser, comparingUser):
    #Input is a mainUsers ratings from testData and a comparing users ratings from training data
    #print(mainUser)
    numerator = ratingsMinusMean(mainUser).multiply(ratingsMinusMean(comparingUser))
    numerator = numerator.sum()
    denominator = sqrtOf_sumOf_squareOf_ratingsMinusMean(mainUser)*sqrtOf_sumOf_squareOf_ratingsMinusMean(comparingUser)
    return numerator/denominator

def sqrtOf_sumOf_squareOf_ratingsMinusMean(userRatings):
    #Input is dataframe with a 'mean' field and ratings
    return sqrt(ratingsMinusMean(userRatings).pow(2).sum())

def ratingsMinusMean(userRatings):
    #Input is dataframe with a 'mean' field and ratings
    ratings = userRatings.loc[userRatings != 'mean']
    #print(ratings)
    mean = userRatings.mean()
    return ratings - mean
 

In [8]:

#Handy method for result validation if you would be interested
#write to file to see entire dataset of item_training_matrix_full. Can be used for any dataFrame as wished
item_training_matrix_full.to_csv(path_or_buf='./test', sep='\t')

