In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
links = pd.read_csv('links.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movieId    9125 non-null int64
title      9125 non-null object
genres     9125 non-null object
dtypes: int64(1), object(2)
memory usage: 213.9+ KB


In [5]:
movies.describe()

Unnamed: 0,movieId
count,9125.0
mean,31123.291836
std,40782.633604
min,1.0
25%,2850.0
50%,6290.0
75%,56274.0
max,164979.0


In [6]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movieId    9125 non-null int64
imdbId     9125 non-null int64
tmdbId     9112 non-null float64
dtypes: float64(1), int64(2)
memory usage: 213.9 KB


In [8]:
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9125.0,9125.0,9112.0
mean,31123.291836,479824.4,39104.545544
std,40782.633604,743177.4,62814.519801
min,1.0,417.0,2.0
25%,2850.0,88846.0,9451.75
50%,6290.0,119778.0,15852.0
75%,56274.0,428441.0,39160.5
max,164979.0,5794766.0,416437.0


In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [11]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100004.0,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608,1129639000.0
std,195.163838,26369.198969,1.058064,191685800.0
min,1.0,1.0,0.5,789652000.0
25%,182.0,1028.0,3.0,965847800.0
50%,367.0,2406.5,4.0,1110422000.0
75%,520.0,5418.0,4.0,1296192000.0
max,671.0,163949.0,5.0,1476641000.0


In [12]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [13]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 4 columns):
userId       1296 non-null int64
movieId      1296 non-null int64
tag          1296 non-null object
timestamp    1296 non-null int64
dtypes: int64(3), object(1)
memory usage: 40.6+ KB


In [14]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,1296.0,1296.0,1296.0
mean,417.026235,42278.949846,1324337000.0
std,142.18344,44628.345568,109388600.0
min,15.0,1.0,1137217000.0
25%,346.0,2988.0,1243455000.0
50%,431.0,26958.5,1342849000.0
75%,547.0,72268.25,1440380000.0
max,663.0,164979.0,1476651000.0


#### Here, we are loading the various data files in the movie lens dataset. We load each csv file in respective pandas dataframes. We have then analysed the structure of each csv file. This showed us that for our purpose of making a basic recommendation system, we require only two of the files, movies.csv and ratings.csv . The movies.csv will provide us with the metadata of movies, and ratings.csv will provide us the movies a user has rated and the respective ratings. 

In [15]:
# This method returns the sparsity of matrix
def getSparsity(mat):
    density = float(len(mat.nonzero()[0]))/(mat.shape[0] * mat.shape[1])
    sparsity = 1-density
    sparsity *= 100
    print('Sparsity: {:4.2f}%'.format(sparsity))

In [16]:
from sklearn.model_selection import train_test_split

df = ratings.drop('timestamp', axis=1)
x = df[['userId', 'movieId']].values
y = df['rating'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

train_data = pd.DataFrame(x_train, columns=['userId', 'movieId'])
train_data['rating'] = y_train
train_data.head()

Unnamed: 0,userId,movieId,rating
0,468,4144,3.5
1,505,4321,4.0
2,86,446,3.0
3,285,1689,2.0
4,30,2259,3.0


In [17]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

70002 70002 30002 30002


#### We now divide the data in ratings.csv into training and testing sets. The training set (70% of the data), is used to generate all the matrices required for predict ratings and making ratings on the testing set (30% of the data). 

<br><br>
# CONTENT-BASED FILTERING

In [18]:
genre_set = set()
for genre in movies['genres'].unique():
    genre_set |= set(map(str, genre.split('|')))
genre_set = sorted(genre_set)
print(genre_set)

['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


#### We use the metadata of the movies in movies.csv to get a set of all the genres that a movie can have. We found 20 of them.

In [19]:
# Method to get a movie-genre vector
def getMGvector(genres):
    v = []
    genres = genres.split('|')
    for genre in genre_set:
        if genre in genres:
            v.append(1)
        else:
            v.append(0)
    return v

# Method to create a movie-genre matrix
def getMGMatrix(df):
    mat = []
    for index, row in df.iterrows():
        temp = [row['movieId']] + getMGvector(row['genres'])
        mat.append(temp)
    cols = ['movieId'] + genre_set
    mat = pd.DataFrame(mat, columns=cols)
    mat.set_index('movieId', inplace=True)
    return mat

In [20]:
mgm = getMGMatrix(movies)
mgm.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
mgm.shape

(9125, 20)

#### So we get the above movie-genre matrix, which has 9125 movies and 20 genres

In [22]:
# Method to get the user-genre vector
def getUGvector(df, mgmat):
    mat = []
    for index, row in df.iterrows():
        vec = list(mgmat.loc[row['movieId'], :])
        vec = [row['rating']*i for i in vec]
        mat.append(vec)
    n = len(mat)
    ugvec = []
    for i in range(20):
        gvalues = [mat[j][i] for j in range(n)]
        avg = round(sum(gvalues)/(n*5), 3)
        ugvec.append(avg)
#     print(ugvec)
    return ugvec

# Method to get the user-genre matrix
def getUGMatrix(df, mgmat):
    mat = []
    for user in df['userId'].unique():
        df1 = df.loc[df['userId']==user , ['movieId', 'rating']]
        df2 = mgmat.loc[df1['movieId'].values, :]
        temp = [user] + getUGvector(df1, df2)
        mat.append(temp)
    cols = ['userId'] + genre_set
    mat = pd.DataFrame(mat, columns=cols)
#     print(mat)
    mat.set_index('userId', inplace=True)
    return mat

In [23]:
ugm = getUGMatrix(train_data, mgm)
ugm.head(10)

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
468,0.0,0.092,0.101,0.035,0.045,0.208,0.087,0.059,0.302,0.052,0.015,0.032,0.008,0.03,0.038,0.111,0.062,0.104,0.039,0.012
505,0.0,0.09,0.109,0.03,0.041,0.248,0.083,0.005,0.381,0.042,0.02,0.01,0.004,0.072,0.067,0.222,0.051,0.137,0.053,0.014
86,0.0,0.05,0.055,0.047,0.062,0.239,0.088,0.02,0.448,0.045,0.0,0.022,0.006,0.052,0.033,0.17,0.061,0.07,0.028,0.009
285,0.0,0.199,0.176,0.053,0.087,0.327,0.106,0.0,0.164,0.115,0.001,0.082,0.002,0.04,0.048,0.061,0.133,0.147,0.033,0.014
30,0.0,0.12,0.069,0.016,0.032,0.264,0.14,0.004,0.446,0.035,0.009,0.039,0.001,0.027,0.068,0.141,0.057,0.2,0.039,0.017
407,0.0,0.082,0.084,0.021,0.026,0.243,0.124,0.028,0.59,0.055,0.007,0.025,0.015,0.018,0.063,0.222,0.041,0.14,0.031,0.005
26,0.0,0.319,0.166,0.06,0.053,0.265,0.31,0.0,0.333,0.047,0.006,0.015,0.046,0.0,0.094,0.049,0.112,0.283,0.037,0.012
42,0.0,0.534,0.448,0.059,0.059,0.088,0.173,0.0,0.246,0.155,0.0,0.023,0.179,0.014,0.059,0.07,0.32,0.309,0.061,0.0
4,0.0,0.25,0.25,0.107,0.172,0.376,0.146,0.011,0.224,0.149,0.007,0.084,0.007,0.097,0.034,0.126,0.2,0.164,0.019,0.004
564,0.0,0.111,0.106,0.036,0.066,0.266,0.072,0.014,0.305,0.062,0.008,0.11,0.001,0.035,0.056,0.117,0.081,0.172,0.018,0.008


In [24]:
ugm.shape

(671, 20)

#### We get the above user-genre matrix of 671 users and 20 genres. Each cell represent the likability of a genre for the respective user. The values lies between 0 and 1.

In [25]:
# Method to predict the rating using content-based filtering
def predictCB(userid, movieid):
    ugv = ugm.loc[userid, :].values
    mgv = mgm.loc[movieid, :].values
    rating = np.dot(ugv, mgv)
    rating = round(rating, 2)
    return rating

In [26]:
pred_ratings = []
for row in x_test:
    pred_ratings.append(predictCB(row[0], row[1]))
# print(pred_ratings)

In [27]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('RMSE:', mean_squared_error(y_test, pred_ratings))
print('MAE:', mean_absolute_error(y_test, pred_ratings))

RMSE: 10.138530904606359
MAE: 3.01167788814079


In [28]:
# Method to get recommendations using content-based filtering
def getRecommendationsCB(userid, ugm, mgm):
    mat = []
    ugv = ugm.loc[userid, :].values
    for movieid in mgm.index.values:
        mgv = mgm.loc[movieid, :].values
        score = round(np.dot(ugv,mgv), 3)
        mat.append([movieid, score])
    mat = pd.DataFrame(mat, columns=['movieId', 'score'])
    return mat

In [29]:
scores = getRecommendationsCB(45, ugm, mgm)
scores.sort_values(by=['score'], ascending=False, inplace=True)
movies_recom_cb = []
for index, row in scores.head(20).iterrows():
    mid = int(row['movieId'])
    mname = movies.loc[movies['movieId']==mid, 'title'].values[0]
    movies_recom_cb.append(mname)
    score = row['score']
    print('{}\t\t{}'.format(mid, mname))

4956		Stunt Man, The (1980)
81132		Rubber (2010)
4719		Osmosis Jones (2001)
3893		Nurse Betty (2000)
1912		Out of Sight (1998)
5018		Motorama (1991)
587		Ghost (1990)
26093		Wonderful World of the Brothers Grimm, The (1962)
496		What Happened Was... (1994)
5666		Rules of Attraction, The (2002)
970		Beat the Devil (1953)
1907		Mulan (1998)
6902		Interstate 60 (2002)
459		Getaway, The (1994)
45672		Click (2006)
31921		Seven-Per-Cent Solution, The (1976)
7835		Song of the Thin Man (1947)
42015		Casanova (2005)
74438		Legend of the Red Dragon (a.k.a. New Legend of Shaolin, The) (Hong Xi Guan: Zhi Shao Lin wu zu) (1994)
61071		Sisterhood of the Traveling Pants 2, The (2008)


#### Thus we get the above 20 movied ids and movie names which will be most suitable for user with userid 45 based on the content-based filtering method.
<br>
<br>

# COLLABORATIVE FILTERING

In [30]:
# Method to get the user-movie matrix.
def getUMmatrix(ratings):
    movies = np.sort(ratings['movieId'].unique())
    users = np.sort(ratings['userId'].unique())
    mat = []
    for user in users:
#         print('processing user {} out of {}.'.format(user, len(users)))
        uratings = np.zeros(len(movies))
        temp = ratings.loc[ratings.userId==user]
        for mid in temp.movieId.values:
            uratings[np.where(movies==mid)] = temp.loc[temp.movieId==mid, 'rating']
        row = [user] + list(uratings)
#         print(len(uratings), len(row))
        mat.append(row)
    cols = ['userId'] + list(map(lambda x: str(x), movies))
#     print(len(mat[0]), len(cols))
#     print(mat[0])
#     print(cols)
    mat = pd.DataFrame(mat, columns=cols)
    mat.set_index('userId', inplace=True)
    return mat

In [31]:
umm = getUMmatrix(train_data)
umm.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,160718,161084,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
getSparsity(umm.values)
print(umm.shape)

Sparsity: 98.70%
(671, 7997)


#### Here we have the user-movie matrix where each represents the ratings given by a user to movie, and 0 for unrated.

In [33]:
# Method to get the cosine similarity of two vectors
def getCosineSimilarity(v1, v2):
    cs = np.dot(v1, v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))
    return round(cs, 5)

# Method to get the user similarity matrix
def getSM(df):
    row = np.sort(df.index.values)
    col = list(map(lambda x: str(x), row))
    n = len(row)
    mat = np.zeros((n, n))
    for i in range(n):
        mat[i][i] = 1.0
        for j in range(i):
            if i%1000==0 and j%1000==0:
                print('processing for {}th and {}th entity in {} entities'.format(i, j, n))
            mat[i][j] = getCosineSimilarity(df.loc[row[i], :], df.loc[row[j], :])
            mat[j][i] = mat[i][j]
    mat = pd.DataFrame(mat, index=row, columns=col)
    return mat

In [34]:
usm = getSM(ugm)
usm.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
1,1.0,0.81263,0.80631,0.81371,0.71652,0.90808,0.89103,0.76165,0.81144,0.72221,...,0.82144,0.7763,0.73841,0.81048,0.80692,0.79019,0.7514,0.7174,0.76485,0.84589
2,0.81263,1.0,0.94875,0.88316,0.92893,0.90163,0.92597,0.92486,0.96448,0.83179,...,0.97899,0.84672,0.81004,0.97875,0.93142,0.97644,0.89746,0.84975,0.87612,0.96096
3,0.80631,0.94875,1.0,0.89177,0.87913,0.92309,0.92943,0.95211,0.93546,0.86945,...,0.94577,0.88564,0.88643,0.93687,0.94501,0.96075,0.93462,0.91878,0.90962,0.93147
4,0.81371,0.88316,0.89177,1.0,0.85838,0.93146,0.97316,0.81753,0.81411,0.85807,...,0.92535,0.96253,0.84141,0.93127,0.91358,0.83642,0.79227,0.88386,0.84597,0.94165
5,0.71652,0.92893,0.87913,0.85838,1.0,0.81351,0.89277,0.80065,0.89371,0.68021,...,0.92303,0.85308,0.63522,0.9636,0.89513,0.88467,0.85181,0.79691,0.73139,0.92781


In [35]:
getSparsity(usm.values)
print(usm.shape)

Sparsity: 0.00%
(671, 671)


#### We have the user similarity matrix where each is the cosine similarity measures of user-genre vectors of respective users.

In [36]:
# Method to predict rating on the basis of user-user similarities
def predictRatingUB(userid, movieid):
    similarity_vector = usm.loc[:, str(userid)]
    if str(movieid) not in umm.columns:
        return 0
    rating_vector = umm.loc[:, str(movieid)]
    rating = np.dot(similarity_vector, rating_vector)/(np.sum(np.abs(similarity_vector)))
    return round(rating*5, 2)

In [37]:
# Method to get the recommendations using collaborative filtering
def getRecommendationsCF(userid):
    mat = []
    for movieid in mgm.index.values:
        score = predictRatingUB(userid, movieid)
        mat.append([movieid, score])
    mat = pd.DataFrame(mat, columns=['movieId', 'score'])
    return mat

In [38]:
scores = getRecommendationsCF(45)
scores.sort_values(by=['score'], ascending=False, inplace=True)
movies_recom_cf = []
for index, row in scores.head(20).iterrows():
    mid = int(row['movieId'])
    mname = movies.loc[movies['movieId']==mid, 'title'].values[0]
    movies_recom_cf.append(mname)
    score = row['score']
    print('{}\t\t{}'.format(mid, mname))

296		Pulp Fiction (1994)
356		Forrest Gump (1994)
318		Shawshank Redemption, The (1994)
260		Star Wars: Episode IV - A New Hope (1977)
593		Silence of the Lambs, The (1991)
527		Schindler's List (1993)
608		Fargo (1996)
2571		Matrix, The (1999)
2858		American Beauty (1999)
858		Godfather, The (1972)
1196		Star Wars: Episode V - The Empire Strikes Back (1980)
50		Usual Suspects, The (1995)
1198		Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
480		Jurassic Park (1993)
1		Toy Story (1995)
1270		Back to the Future (1985)
589		Terminator 2: Judgment Day (1991)
457		Fugitive, The (1993)
110		Braveheart (1995)
47		Seven (a.k.a. Se7en) (1995)


#### Thus we have the above list of movies as recommendation using collaborative filtering for all users.

In [39]:
# K-means clustering algorithms.

def vectorAvg(a):
    center=[0 for i in range(len(a[0]))]
    for vec in a:
        center=list(map(sum,zip(center,vec)))
    n=len(a)
    center[:]=[round(x/n,3) for x in center]
    return center
def distInitknn(mat,n,k):
    for i in range(n):
        mat.append([0 for j in range(k)])
    return None
def kMean(vectors,sim,centroids,k=2,prev=[],n=0):
    print("\niteration: ",n,"\n")
    cluster=[]
    distMat=[]
    ids=list(sim.columns)
    #print("\ncentroid vectors:\n")
    #pprint(centroids)
    #print("\ndoc vectors:\n")
    #pprint(docvectors)
    
    #veclen=len(docvectors)
    #distInitknn(distMat,veclen,k)
    #genDistMat(distMat,docvectors,centroids)
    simMat=sim.values
    for i in range(k):
        cluster.append([ids[i]])
    if prev==[]:
        for i in range(k,len(ids)):
            cluster[i%k].append(ids[i])
    else:
        for i in range(k,len(ids)):
            cluster[getMinIndex(i,simMat)].append(ids[i])
    newcentroid=[]
    print("cluster formed:\n",cluster)
    
    for clus in cluster:
        vec=[vectors[ids.index(i)] for i in clus]
        center=vectorAvg(vec)
        newcentroid.append(list(center))
    #print("\nnew centroids:\n")
    #pprint(newcentroid)
    #print("*"*50)
    if cluster!=prev:
        kMean(vectors,sim,newcentroid,k,cluster,n+1)
    return cluster
def calcManDist(a,b):
    dist=0
    for x,y in zip(a,b):
        dist+=abs(x-y)
    return dist
def genDistMat(distMat,docvectors,centroids):
    for i in range(len(docvectors)):
        for j in range(len(centroids)):
            dist=calcManDist(docvectors[i],centroids[j])
            distMat[i][j]=dist
    return None
def getMinIndex(i,mat):
    mindist=float("inf")
    minindex=0
    for j in range(len(mat[i])):
        if mindist>(1-mat[i][j]):
            mindist=mat[i][j]
            minindex=j
    return minindex

In [40]:
k = 4
clusters = kMean(umm.values, usm, umm.values[:k], k)
print(clusters)


iteration:  0 

cluster formed:
 [['1', '5', '9', '13', '17', '21', '25', '29', '33', '37', '41', '45', '49', '53', '57', '61', '65', '69', '73', '77', '81', '85', '89', '93', '97', '101', '105', '109', '113', '117', '121', '125', '129', '133', '137', '141', '145', '149', '153', '157', '161', '165', '169', '173', '177', '181', '185', '189', '193', '197', '201', '205', '209', '213', '217', '221', '225', '229', '233', '237', '241', '245', '249', '253', '257', '261', '265', '269', '273', '277', '281', '285', '289', '293', '297', '301', '305', '309', '313', '317', '321', '325', '329', '333', '337', '341', '345', '349', '353', '357', '361', '365', '369', '373', '377', '381', '385', '389', '393', '397', '401', '405', '409', '413', '417', '421', '425', '429', '433', '437', '441', '445', '449', '453', '457', '461', '465', '469', '473', '477', '481', '485', '489', '493', '497', '501', '505', '509', '513', '517', '521', '525', '529', '533', '537', '541', '545', '549', '553', '557', '561', '565'

IndexError: list index out of range

In [41]:
# Implementing K-means clustering algorithms for creating clusters of similar users.

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, n_init=20, random_state=0)
clusters = kmeans.fit_predict(umm.values)

user_clusters = pd.DataFrame(clusters, index=umm.index, columns=['clusterId'])
print(user_clusters)

        clusterId
userId           
1               9
2               9
3               9
4               4
5               9
6               9
7               9
8               4
9               9
10              9
11              9
12              9
13              9
14              9
15              0
16              9
17              4
18              9
19              7
20              9
21              9
22              4
23              3
24              9
25              9
26              4
27              9
28              9
29              9
30              7
...           ...
642             9
643             9
644             9
645             9
646             9
647             9
648             4
649             9
650             9
651             9
652             9
653             9
654             0
655             9
656             9
657             9
658             9
659             9
660             9
661             9
662             9
663             9
664       

In [42]:
# Method to predict rating by CF on user clusters.
def predictCF(userid, movieid, user_clusters):
    cid = user_clusters.loc[userid, 'clusterId']
#     print(cid)
    sim_users = user_clusters.loc[(user_clusters['clusterId']==cid)]
    sim_users = list(sim_users.index)
#     print(sim_users)
    similarity_vector = usm.loc[sim_users, str(userid)]
    if str(movieid) not in umm.columns:
        umm[str(movieid)] = 0.0
    rating_vector = umm.loc[sim_users, str(movieid)]
    rating = np.dot(similarity_vector, rating_vector)/(np.sum(np.abs(similarity_vector)))
    return round(rating*5, 2)

In [43]:
pred_ratings = []
for row in x_test:
#     print('user:{}, movie:{}'.format(row[0], row[1]))
    rat_temp = predictCF(row[0], row[1], user_clusters)
#     print('user:{}, movie:{}, rating:{}'.format(row[0], row[1], rat_temp))
    pred_ratings.append(rat_temp)
# print(pred_ratings)

In [44]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('RMSE:', mean_squared_error(y_test, pred_ratings))
print('MAE:', mean_absolute_error(y_test, pred_ratings))

RMSE: 10.397082794480367
MAE: 2.7562355842943806


In [45]:
# Method to get movie recommendations using CF on user clusters
def getRecommendationsCFCluster(userid):
    mat = []
    for movieid in mgm.index.values:
        score = predictCF(userid, movieid, user_clusters)
        mat.append([movieid, score])
    mat = pd.DataFrame(mat, columns=['movieId', 'score'])
    return mat

In [46]:
scores = getRecommendationsCFCluster(45)
scores.sort_values(by=['score'], ascending=False, inplace=True)
movies_recom_cfcluster = []
for index, row in scores.head(20).iterrows():
    mid = int(row['movieId'])
    mname = movies.loc[movies['movieId']==mid, 'title'].values[0]
    movies_recom_cfcluster.append(mname)
    score = row['score']
    print('{}\t\t{}'.format(mid, mname))

356		Forrest Gump (1994)
318		Shawshank Redemption, The (1994)
296		Pulp Fiction (1994)
593		Silence of the Lambs, The (1991)
260		Star Wars: Episode IV - A New Hope (1977)
527		Schindler's List (1993)
608		Fargo (1996)
110		Braveheart (1995)
2858		American Beauty (1999)
480		Jurassic Park (1993)
589		Terminator 2: Judgment Day (1991)
590		Dances with Wolves (1990)
457		Fugitive, The (1993)
150		Apollo 13 (1995)
588		Aladdin (1992)
1		Toy Story (1995)
2571		Matrix, The (1999)
50		Usual Suspects, The (1995)
858		Godfather, The (1972)
32		Twelve Monkeys (a.k.a. 12 Monkeys) (1995)


In [47]:
movies_recom_cb

['Stunt Man, The (1980)',
 'Rubber (2010)',
 'Osmosis Jones (2001)',
 'Nurse Betty (2000)',
 'Out of Sight (1998)',
 'Motorama (1991)',
 'Ghost (1990)',
 'Wonderful World of the Brothers Grimm, The (1962)',
 'What Happened Was... (1994)',
 'Rules of Attraction, The (2002)',
 'Beat the Devil (1953)',
 'Mulan (1998)',
 'Interstate 60 (2002)',
 'Getaway, The (1994)',
 'Click (2006)',
 'Seven-Per-Cent Solution, The (1976)',
 'Song of the Thin Man (1947)',
 'Casanova (2005)',
 'Legend of the Red Dragon (a.k.a. New Legend of Shaolin, The) (Hong Xi Guan: Zhi Shao Lin wu zu) (1994)',
 'Sisterhood of the Traveling Pants 2, The (2008)']

In [48]:
movies_recom_cf

['Pulp Fiction (1994)',
 'Forrest Gump (1994)',
 'Shawshank Redemption, The (1994)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Silence of the Lambs, The (1991)',
 "Schindler's List (1993)",
 'Fargo (1996)',
 'Matrix, The (1999)',
 'American Beauty (1999)',
 'Godfather, The (1972)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Usual Suspects, The (1995)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Jurassic Park (1993)',
 'Toy Story (1995)',
 'Back to the Future (1985)',
 'Terminator 2: Judgment Day (1991)',
 'Fugitive, The (1993)',
 'Braveheart (1995)',
 'Seven (a.k.a. Se7en) (1995)']

In [49]:
movies_recom_cfcluster

['Forrest Gump (1994)',
 'Shawshank Redemption, The (1994)',
 'Pulp Fiction (1994)',
 'Silence of the Lambs, The (1991)',
 'Star Wars: Episode IV - A New Hope (1977)',
 "Schindler's List (1993)",
 'Fargo (1996)',
 'Braveheart (1995)',
 'American Beauty (1999)',
 'Jurassic Park (1993)',
 'Terminator 2: Judgment Day (1991)',
 'Dances with Wolves (1990)',
 'Fugitive, The (1993)',
 'Apollo 13 (1995)',
 'Aladdin (1992)',
 'Toy Story (1995)',
 'Matrix, The (1999)',
 'Usual Suspects, The (1995)',
 'Godfather, The (1972)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']

In [50]:
recom_movies = pd.DataFrame()
recom_movies['cb'] = movies_recom_cb
recom_movies['cf'] = movies_recom_cf
recom_movies['cf-cluster'] = movies_recom_cfcluster
recom_movies.head(20)

Unnamed: 0,cb,cf,cf-cluster
0,"Stunt Man, The (1980)",Pulp Fiction (1994),Forrest Gump (1994)
1,Rubber (2010),Forrest Gump (1994),"Shawshank Redemption, The (1994)"
2,Osmosis Jones (2001),"Shawshank Redemption, The (1994)",Pulp Fiction (1994)
3,Nurse Betty (2000),Star Wars: Episode IV - A New Hope (1977),"Silence of the Lambs, The (1991)"
4,Out of Sight (1998),"Silence of the Lambs, The (1991)",Star Wars: Episode IV - A New Hope (1977)
5,Motorama (1991),Schindler's List (1993),Schindler's List (1993)
6,Ghost (1990),Fargo (1996),Fargo (1996)
7,"Wonderful World of the Brothers Grimm, The (1962)","Matrix, The (1999)",Braveheart (1995)
8,What Happened Was... (1994),American Beauty (1999),American Beauty (1999)
9,"Rules of Attraction, The (2002)","Godfather, The (1972)",Jurassic Park (1993)


#### A comparison of recommendations show minor shifts in moives on using clusters of similar users only.
<br>
<br>

# HYBRID SYSTEM

In [63]:
def predictHybrid(userid, movieid, user_clusters):
    return round((predictCB(userid, movieid) + predictCF(userid, movieid, user_clusters))/2, 2)

In [64]:
pred_ratings = []
for row in x_test:
    pred_ratings.append(predictHybrid(row[0], row[1], user_clusters))

In [66]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('RMSE:', mean_squared_error(y_test, pred_ratings))
print('MAE:', mean_absolute_error(y_test, pred_ratings))

RMSE: 7.522258746083594
MAE: 2.4459832677821485


#### Here we have combined the results of both content-based and cluster based collaborative filtering methods. We can see that the performance of the system has improved drastically, compared to both of the separate approaches. We can see a drop of ~2.5 in RMSE as compared to both of the models. 
<br>
<br>

# GENETIC ALGORITHM

In [67]:
# Method to get the RMSE using hybrid system.
def getPerformance(k, niter):
    kmeans = KMeans(n_clusters=k, n_init=niter, random_state=0)
    clusters = kmeans.fit_predict(umm.values)
    user_clusters = pd.DataFrame(clusters, index=umm.index, columns=['clusterId'])
    pred_ratings = []
    for row in x_test:
        rat_temp = predictHybrid(row[0], row[1], user_clusters)
        pred_ratings.append(rat_temp)
    return round(mean_squared_error(y_test, pred_ratings), 3)

In [68]:
# method to get fitness(RMSE) of a population
def getPopulationFitness(pop):
    fitness = []
    for i in pop:
        print('Getting for chromosome:', i)
        fitness.append(getPerformance(i[0], i[1]))
    fitness = np.array(fitness)
    return fitness

# Selecting parents with best fitness (least RMSE)
def getMatingPool(pop, fitness, num_parents):
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness == np.min(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = 99999999999
    return parents

# Cross over on the best parents to create offsprings
def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)
    crossover_point = np.uint8(offspring_size[1]/2)
    for k in range(offspring_size[0]):
        parent1_idx = k%parents.shape[0]
        parent2_idx = (k+1)%parents.shape[0]
        offspring[k, 0:crossover_point] = parents[parent1_idx, 0:crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]
    return offspring

# Mutate the crossover offsprings.
def mutation(offspring_crossover):
    for idx in range(offspring_crossover.shape[0]):
        random_value = np.random.randint(-1, 1, 1)
        offspring_crossover[idx, 1] = offspring_crossover[idx, 1] + random_value
        if offspring_crossover[idx, 1] <= 0:
            offspring_crossover[idx, 1] = 1
    return offspring_crossover

In [69]:
# initialising the intial populations and population size for GA

clustering_inputs = [4, 10]

# we have two genes per chromosome
# first gene will represent number of clusters
# and second will represent number of iterations
# for the input parameters of K-means clustering algorithm
num_weights = 2

# we have 10 chromosomes per population
sol_per_pop = 10

# we will select best 4 parents for mating
num_parents_mating = 4

pop_size = (sol_per_pop,num_weights)
new_population = np.random.randint(low=1, high=10, size=pop_size)
print(new_population)

[[2 8]
 [2 5]
 [8 9]
 [4 9]
 [1 1]
 [9 5]
 [1 8]
 [2 6]
 [2 2]
 [4 9]]


In [70]:
# Implementing GA for 5 genrations.
num_generations = 5
for generation in range(num_generations):
    print("Generation : ", generation)
    fitness = getPopulationFitness(new_population)
    parents = getMatingPool(new_population, fitness, num_parents_mating)
    offspring_crossover = crossover(parents, offspring_size=(pop_size[0]-parents.shape[0], num_weights))
    offspring_mutation = mutation(offspring_crossover)
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_mutation
    print("Best result : ", np.min(fitness))
    print()

Generation :  0
Getting for chromosome: [2 8]
Getting for chromosome: [2 5]
Getting for chromosome: [8 9]
Getting for chromosome: [4 9]
Getting for chromosome: [1 1]
Getting for chromosome: [9 5]
Getting for chromosome: [1 8]
Getting for chromosome: [2 6]
Getting for chromosome: [2 2]
Getting for chromosome: [4 9]
Best result :  7.206

Generation :  1
Getting for chromosome: [2 5]
Getting for chromosome: [2 6]
Getting for chromosome: [2 2]
Getting for chromosome: [9 5]
Getting for chromosome: [2 5]
Getting for chromosome: [2 1]
Getting for chromosome: [2 4]
Getting for chromosome: [9 5]
Getting for chromosome: [2 5]
Getting for chromosome: [2 1]
Best result :  7.157

Generation :  2
Getting for chromosome: [2 5]
Getting for chromosome: [2 6]
Getting for chromosome: [2 2]
Getting for chromosome: [2 5]
Getting for chromosome: [2 6]
Getting for chromosome: [2 1]
Getting for chromosome: [2 4]
Getting for chromosome: [2 4]
Getting for chromosome: [2 5]
Getting for chromosome: [2 2]
Best res

In [71]:
# fitness = getPopulationFitness(new_population)
best_match_idx = np.where(fitness == np.min(fitness))

print("Best solution : ", new_population[best_match_idx, :])
print("Best solution fitness : ", fitness[best_match_idx])

Best solution :  [[[2 6]
  [2 2]
  [2 4]
  [2 5]
  [2 5]
  [2 2]]]
Best solution fitness :  [7.157 7.157 7.157 7.157 7.157 7.157]


#### We applied the Genetic algorithm to optimize the hybrid system's clustering based CF algorithm by altering the number of clusters and number of iterations through various generations. As a result we were able to optimize the system even further with RMSE dropping to 7.157 from 7.522, when we use 2 clusters for grouping users. Number of iterations doesn't have much effect on the performance of our model here. So we can use somewhere around 5-6 iterations, so that the grouping is more better, but the running time of algorithm is less.