In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import math
from sklearn.metrics import mean_squared_error


In [2]:
rs_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ua.base', sep='\t', names=rs_cols)
ratings_test = pd.read_csv('ua.test', sep='\t', names=rs_cols)

In [3]:
ratings_base.nunique()
#There are 943 unique ids and 1680 unique movie ids are there in the dataset

user_id             943
movie_id           1680
rating                5
unix_timestamp    46638
dtype: int64

In [4]:
row_base_ratings=ratings_base['user_id'].unique().max()
column_base_ratings=ratings_base['movie_id'].unique().max()

row_test_ratings=ratings_test['user_id'].unique().max()
column_test_ratings=ratings_test['movie_id'].unique().max()

In [5]:
train_matrix=np.zeros((row_base_ratings,column_base_ratings))
for line in ratings_base.itertuples():
    train_matrix[line[1]-1,line[2]-1]=line[3]
    #print(line)
    
test_matrix=np.zeros((row_test_ratings,column_test_ratings))
for line in ratings_test.itertuples():
    test_matrix[line[1]-1,line[2]-1]=line[3]
    #print(line)
    

# User-User based Collaberative Filtering

<b>dataframe.iteritems()--------->It gives the columns as keys and the respective values in a series
    dataframe.iterrows()----------> It gives an iterator yielding each row index along withh all the row values
    dataframe.itertuples()--------> It gives named series where the first element is the index value and the rest are the row values

We use the cosine similarity metric which computers the dot product between the two vectors made up of the ratings of the 
movies they have rated.

In [6]:
user_similarity=pairwise_distances(train_matrix,metric='cosine')
user_similarity.shape
user_similarity

array([[0.        , 0.85324924, 0.9493235 , ..., 0.96129522, 0.8272823 ,
        0.61960392],
       [0.85324924, 0.        , 0.87419215, ..., 0.82629308, 0.82681535,
        0.91905667],
       [0.9493235 , 0.87419215, 0.        , ..., 0.97201154, 0.87518372,
        0.97030738],
       ...,
       [0.96129522, 0.82629308, 0.97201154, ..., 0.        , 0.96004871,
        0.98085615],
       [0.8272823 , 0.82681535, 0.87518372, ..., 0.96004871, 0.        ,
        0.85528944],
       [0.61960392, 0.91905667, 0.97030738, ..., 0.98085615, 0.85528944,
        0.        ]])

In [7]:
#The trainning matrix is given as input and the used similarity matix and the number of expected smilar movies required passed
def UserVsUser(train_matrix,user_similarity,n_similar=4):
    #the user_similarity matrix is being sorted and the indexes are returned.'-n_similar'  gives the last  30 values
    #the last index gives in the reverse order. The maximum 30 cosine similarities users
    similar_n=user_similarity.argsort()[:,-n_similar:][:,::-1]
    #print(similar_n)
    pred=np.zeros((row_base_ratings,column_base_ratings))
    
    for i,users in enumerate(similar_n):
        #The indexes of the maximum similatiey uses are stored here
        similar_users_indexes=users
        #print(i,users)
        #The cosine similarities values are restored here by giving the column values
        similarity_n=user_similarity[i,similar_users_indexes]
        #print(i,similarity_n,'***************************')
        #The ratings given by the similar users are filtered out from the matrix_n
        matrix_n=train_matrix[similar_users_indexes,:]
        #print(similar_users_indexes)
        #The first term adds one more dimension and gives a column matrix.Its transpose will be a row matrix
        #a single entry in the rated_items is the dot product between the all similar uses consine value and all the ratings 
        #of one item
        rated_items=similarity_n[:,np.newaxis].T.dot(matrix_n-matrix_n.mean(axis=1)[:,np.newaxis])/similarity_n.sum()
        pred[i,:]=rated_items
        #print(pred)
    #print((matrix_n-matrix_n.mean(axis=1)[:,np.newaxis])/similarity_n.sum())
    #print(rated_items)
    return pred
        

In [8]:
predictions=UserVsUser(train_matrix,user_similarity,50)+train_matrix.mean(axis=1)[:,np.newaxis]
#user_similarity[0,0]
predictions

array([[ 0.53079191,  0.53079191,  0.53079191, ...,  0.53079191,
         0.53079191,  0.53079191],
       [ 0.27556554,  0.17581381, -0.00189689, ..., -0.00189689,
        -0.00189689, -0.00189689],
       [ 1.17064209,  0.07064209,  0.01064209, ...,  0.01064209,
         0.01064209,  0.01064209],
       ...,
       [-0.0479786 , -0.0479786 , -0.0479786 , ..., -0.0479786 ,
        -0.0479786 , -0.0479786 ],
       [ 0.8909642 ,  0.12995357,  0.12995357, ...,  0.12995357,
         0.12995357,  0.12995357],
       [ 0.27315101,  0.27315101,  0.27315101, ...,  0.31315101,
         0.27315101,  0.27315101]])

In [9]:
predicted_ratings = predictions[test_matrix.nonzero()]

test_truth = test_matrix[test_matrix.nonzero()]

math.sqrt(mean_squared_error(predicted_ratings,test_truth))


3.507744099069281

# item-item based collaberative filtering

In [10]:
item_similarity=pairwise_distances(train_matrix.T,metric='cosine')

In [11]:
item_similarity
item_similarity.shape

(1682, 1682)

In [12]:
def itemVsitem(train_matrix,item_similarity,n_similar=30):
    similar_n=item_similarity.argsort()[:,-n_similar:][:,::-1]
    #print(similar_n.shape)
    pred=np.zeros((row_base_ratings,column_base_ratings))
    
    for i,items in enumerate(similar_n):
        similar_items_indexes=items
        similarity_n=item_similarity[i,similar_items_indexes]
        matrix_n=train_matrix[:,similar_items_indexes]
        rated_items=matrix_n.dot(similarity_n)/similarity_n.sum()
        pred[:,i]=rated_items
    return pred

In [13]:
itemVsitem(train_matrix,item_similarity,5)

array([[0. , 0. , 0. , ..., 0. , 0. , 1.6],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [14]:
predictions = itemVsitem(train_matrix,item_similarity,50)
print('predictions shape ',predictions.shape)

predicted_ratings = predictions[test_matrix.nonzero()]
test_truth = test_matrix[test_matrix.nonzero()]

math.sqrt(mean_squared_error(predicted_ratings,test_truth))

predictions shape  (943, 1682)


3.749688827167227

In [15]:

user_id=1
user_ratings=predictions[user_id-1,:]


train_unknown_indices=np.where(train_matrix[user_id-1,:]==0)[0]

In [16]:
user_recommendations=user_ratings[train_unknown_indices]
user_recommendations.shape

(1420,)

In [17]:
movie_reco=user_recommendations.argsort()[-10:][::-1]

movies_list=pd.read_csv("movies.csv")
movies_list1=movies_list[['movieId','title']]

for movie_idd in movie_reco:
    print(movies_list1.iloc[movie_idd,1])


Day the Earth Stood Still, The (1951)
Eighth Day, The (Huitième jour, Le) (1996)
Crucible, The (1996)
Love Bug, The (1969)
For the Moment (1994)
8 1/2 (8½) (1963)
Better Off Dead... (1985)
Shining, The (1980)
Stand by Me (1986)
Deer Hunter, The (1978)


In [18]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

u,s,vt=svds(train_matrix,k=12)

In [19]:
u.shape,s.shape,vt.shape
s_diag_matrix=np.diag(s)

In [20]:
predictions_svd=np.dot(np.dot(u,s_diag_matrix),vt)

In [21]:
predicted_ratings_svd=predictions_svd[test_matrix.nonzero()]
test_truth=test_matrix[test_matrix.nonzero()]

In [22]:
math.sqrt(mean_squared_error(predicted_ratings_svd,test_truth))

2.8346302017438285

In [24]:




user_id=100
user_ratings1=predictions_svd[user_id-1,:]


#train_unknown_indices=np.where(train_matrix[user_id-1,:]==0)[0]

user_recommendations1=user_ratings1[train_unknown_indices]

movie_reco1=user_recommendations1.argsort()[-10:][::-1]

#movies_list=pd.read_csv("movies.csv")
#movies_list1=movies_list[['movieId','title']]

for movie_idd in movie_reco1:
    print(movies_list1.iloc[movie_idd,1])

It Takes Two (1995)
Guardian Angel (1994)
Cry, the Beloved Country (1995)
Lawnmower Man 2: Beyond Cyberspace (1996)
Powder (1995)
Lamerica (1994)
To Die For (1995)
Fair Game (1995)
Two if by Sea (1996)
Big Green, The (1995)
