Summary
===============
Our project recommends movies to users based on the ratings of similar users. To achieve this we used collaborative filtering, specifically the matrix factorization technique.

Dataset
============
We used a dataset of movie ratings provided by MovieLens, which contains the movie id, user id, rating and timestamp of each rating.

**Name** : ml-latest-small 


**Link** : https://grouplens.org/datasets/movielens/

Technique Used
==================
We used matrix factorization by singular value decomposition (SVD).

Observations
==================
We achieved a root mean square error (RMSE) of 1.0018 when using 5 latent features (k = 5) for the SVD.



In [None]:
import pandas as pd
import numpy as np
import scipy

# Load the ratings and the movie catalogue from the working directory.
data = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Map every movie id (as a string) to its title, so that recommended ids
# can be shown to the user as readable titles at the end.
movieDict = {}
for movie_id, title in zip(movies['movieId'], movies['title']):
    movieDict[str(movie_id)] = title

# Ids are handled as strings from here on, matching the movieDict keys.
for id_column in ('userId', 'movieId'):
    data[id_column] = data[id_column].astype('str')

# NOTE: `movies` is rebound here from the catalogue DataFrame to the array
# of distinct movie ids that appear in the ratings.
users = data['userId'].unique()    # all distinct users
movies = data['movieId'].unique()  # all distinct rated movies

print(data.head())

  userId movieId  rating  timestamp
0      1       1     4.0  964982703
1      1       3     4.0  964981247
2      1       6     4.0  964982224
3      1      47     5.0  964983815
4      1      50     5.0  964982931


Splitting each user's ratings, ordered by timestamp, into train and test sets in an 80:20 ratio

In [None]:
test_ratio = 0.2  # fraction of each user's ratings to be used as test set.

# Per-user pieces are collected in lists and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop copies it every time.
train_parts = []
test_parts = []

# sort=False keeps the users in order of first appearance in `data`.
for _, temp in data.groupby('userId', sort=False):
    n = len(temp)
    test_size = int(test_ratio * n)

    # Oldest ratings first, so the most recent ones end up in the test set.
    temp = temp.sort_values('timestamp').reset_index(drop=True)

    # A single split point: the first n - test_size rows are for training and
    # the remaining test_size rows are for testing, so no row is lost and the
    # two parts never overlap.
    split_at = n - test_size
    train_parts.append(temp.iloc[:split_at])
    test_parts.append(temp.iloc[split_at:])

# Fall back to empty frames (with the right columns) when there are no users.
if train_parts:
    train = pd.concat(train_parts)
    test = pd.concat(test_parts)
else:
    train = pd.DataFrame(columns=data.columns)
    test = pd.DataFrame(columns=data.columns)

print(train)

     userId movieId  rating   timestamp
0         1    1210     5.0   964980499
1         1     804     4.0   964980499
2         1    2018     5.0   964980523
3         1    2628     4.0   964980523
4         1    2826     4.0   964980523
...     ...     ...     ...         ...
1035    610   60471     3.5  1493848660
1036    610   27778     3.0  1493848667
1037    610   55067     3.5  1493848671
1038    610  103219     3.5  1493848674
1039    610   51666     2.0  1493848680

[79676 rows x 4 columns]


Converting the ratings data into matrix format

In [None]:
import numpy as np
import pandas as pd
from scipy.linalg import sqrtm
def create_utility_matrix(data, formatizer=None):
    """
        Build a user x item utility matrix from a table of ratings.

        Users and items are laid out in order of first appearance in
        ``data``, so the layout is the same on every run. When a
        (user, item) pair occurs more than once, the last value wins.

        :param data:      DataFrame, 2D, nx3 (accessed positionally via iloc)
        :param formatizer:dict giving the column positions of 'user', 'item'
                          and 'value'; defaults to
                          {'user': 0, 'item': 1, 'value': 2}
        :return:          (X, users_index, items_index) where X is the
                          utility matrix (n x m), n=users, m=items, with NaN
                          for missing ratings; users_index maps user id to
                          row position and items_index maps item id to
                          column position
    """
    # Resolve the default here instead of using a mutable default argument.
    if formatizer is None:
        formatizer = {'user': 0, 'item': 1, 'value': 2}

    userList = data.iloc[:, formatizer['user']].tolist()
    itemList = data.iloc[:, formatizer['item']].tolist()
    valueList = data.iloc[:, formatizer['value']].tolist()

    # dict.fromkeys de-duplicates while keeping first-appearance order,
    # unlike set(), whose order for strings changes between interpreter runs.
    users = list(dict.fromkeys(userList))
    items = list(dict.fromkeys(itemList))
    users_index = {user: position for position, user in enumerate(users)}
    items_index = {item: position for position, item in enumerate(items)}

    # One column per item, pre-filled with NaN for "not rated".
    pd_dict = {item: [np.nan] * len(users) for item in items}
    for user, item, value in zip(userList, itemList, valueList):
        pd_dict[item][users_index[user]] = value

    X = pd.DataFrame(pd_dict, index=users)

    # users_index gives us a mapping of user_id to index of user
    # items_index provides the same for items
    return X, users_index, items_index

Here we compute the matrix of predicted ratings using the SVD model

In [None]:
def svd(train, k):
    """
        Approximate a ratings matrix with a truncated SVD of k components.

        Missing (NaN) entries are replaced by their column mean, the column
        means are subtracted, the centred matrix is factorised, only the
        first k singular values/vectors are kept, and the column means are
        added back to the reconstruction. Prints "svd done" when finished.

        :param train: 2D array-like of ratings with NaN for missing entries
                      (columns are treated as items)
        :param k:     number of latent features to keep
        :return:      reconstructed matrix with the same shape as train
    """
    ratings = np.array(train)

    # Mask the unavailable (NaN) entries so they do not affect the means.
    missing = np.isnan(ratings)
    observed = np.ma.masked_array(ratings, missing)
    item_means = np.mean(observed, axis=0)

    # Fill each missing entry with the average rating of its item, and build
    # a matrix that repeats the item means on every row.
    filled = observed.filled(item_means)
    mean_grid = np.tile(item_means, (filled.shape[0], 1))

    # After centring, the entries that were missing are exactly zero.
    centered = filled - mean_grid

    # U holds the user features; V holds the item features as rows.
    U, s, V = np.linalg.svd(centered, full_matrices=False)

    # Keep only the first k latent features.
    sigma_k = np.diag(s)[0:k, 0:k]
    U_k = U[:, 0:k]
    V_k = V[0:k, :]

    # Split the singular values evenly between the user and item factors.
    sigma_root = sqrtm(sigma_k)
    user_factors = np.dot(U_k, sigma_root)
    item_factors = np.dot(sigma_root, V_k)

    approximation = np.dot(user_factors, item_factors)
    approximation = approximation + mean_grid
    print("svd done")
    return approximation

Calculating the root mean square error (RMSE) of our model

In [None]:
import math
import numpy.ma as ma

def rmse(true, pred):
    """
        Return the root mean square error between true and predicted values.

        The two inputs are compared element by element, by position (any
        pandas index is ignored).

        :param true: array-like of observed values (list, array or Series)
        :param pred: array-like of predicted values, same shape as true
        :return:     the RMSE as a Python float
        :raises ValueError: if the inputs are empty or differ in shape
    """
    # Converting both sides up front lets plain lists work and keeps the
    # arithmetic vectorised instead of looping in Python.
    actual = np.asarray(true, dtype=float)
    predicted = np.asarray(pred, dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError("true and pred must have the same shape")
    if actual.size == 0:
        raise ValueError("cannot compute the RMSE of empty input")

    errors = actual - predicted
    return math.sqrt(float(np.mean(errors * errors)))

# to test the performance over a different number of features
no_of_features = [3,5,8,10,15,20,50,75,100]
utilMat, users_index, items_index = create_utility_matrix(train)
user_pred = {}

# The (user, item) pairs of the test set, extracted once instead of calling
# iterrows() again for every candidate model.
test_pairs = list(zip(test['userId'], test['movieId']))

# Track the model with the lowest test RMSE, so that the recommendations
# below come from the best model rather than from whichever ran last.
best_rmse = None
best_svdout = None
best_unseen = {}

for f in no_of_features:
    svdout = svd(utilMat, k=f)
    pred = []    # to store the predicted ratings
    unseen = {}  # predictions for test items that are absent from train
    for user, item in test_pairs:
        u_index = users_index[user]
        if item in items_index:
            pred_rating = svdout[u_index, items_index[item]]
        else:
            # The item was never rated in the training data: fall back to
            # the mean of this user's predicted ratings.
            pred_rating = np.mean(svdout[u_index, :])
            unseen.setdefault(user, []).append([pred_rating, item])
        pred.append(pred_rating)
    score = rmse(test['rating'], pred)
    print(score)
    if best_rmse is None or score < best_rmse:
        best_rmse = score
        best_svdout = svdout
        best_unseen = unseen

# From here on `svdout` is the best model found above.
svdout = best_svdout
user_pred.clear()
user_pred.update(best_unseen)

item_dict = {position: item for item, position in items_index.items()}
#updating user predictions of all movies in a list with predicted ratings
for user, u_index in users_index.items():
    u_ratings = ma.getdata(svdout[u_index, :])
    for i, rating in enumerate(u_ratings):
        user_pred.setdefault(user, []).append([rating, item_dict[i]])

svd done
1.0025296483722526
svd done
1.0018187399945997
svd done
1.0021567262012732
svd done
1.0020101514806032
svd done
1.0030428726643206
svd done
1.0030900965801048
svd done
1.0061892173848752
svd done
1.008353047973624
svd done
1.0102065998882652


List of movies recommended for a particular user, based on the predicted ratings

In [None]:
def recommender(user, top_n=100):
    """
        Print the movies with the highest predicted ratings for a user.

        Reads the module-level ``user_pred`` (user id -> list of
        [predicted rating, movie id]) and ``movieDict`` (movie id -> title).
        A header line is always printed; a user without any predictions
        simply gets no movie lines.

        :param user:  user id, as used for the keys of user_pred
        :param top_n: maximum number of movies to print (default 100)
        :return:      None
    """
    # sorted() ranks a copy, so the shared user_pred list is left untouched;
    # .get() makes a user without predictions yield an empty ranking.
    ranked = sorted(user_pred.get(user, []), key=lambda entry: entry[0], reverse=True)

    print("Recommendations for user : ", user)
    # Slicing gives exactly top_n entries (a negative top_n prints nothing).
    for _, item in ranked[:max(top_n, 0)]:
        print(item, movieDict[item])

# Show the recommendations for the user whose id is '2' (ids are strings).
recommender('2')

Recommendations for user :  2
172705 Tickling Giants (2017)
6021 American Friend, The (Amerikanische Freund, Der) (1977)
158398 World of Glory (1991)
1310 Hype! (1996)
40412 Dead Man's Shoes (2004)
107771 Only Lovers Left Alive (2013)
5745 Four Seasons, The (1981)
6983 Jane Eyre (1944)
3214 American Flyers (1985)
50804 Hannibal Rising (2007)
131130 Tom and Jerry: A Nutcracker Tale (2007)
27320 Nine Lives of Tomas Katz, The (2000)
174053 Black Mirror: White Christmas (2014)
6442 Belle époque (1992)
496 What Happened Was... (1994)
3795 Five Senses, The (1999)
3678 Man with the Golden Arm, The (1955)
121781 Stuart Little 3: Call of the Wild (2005)
124404 Snowflake, the White Gorilla (2011)
26350 Passenger, The (Professione: reporter) (1975)
48698 Deliver Us from Evil (2006)
7122 King of Hearts (1966)
132153 Buzzard (2015)
86721 Idiots and Angels (2008)
136556 Kung Fu Panda: Secrets of the Masters (2011)
112512 Colourful (Karafuru) (2010)
6945 My Architect: A Son's Journey (2003)
3851 I'm 