In [96]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances 

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules, apriori

This is a develpment dataset (ml-latest-small). It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files 'links.csv', 'movies.csv', 'ratings.csv' and 'tags.csv'. 

Files

ratings.csv :- userId,movieId,rating,timestamp
tags.csv :- userId,movieId,tag,timestamp
movies.csv :- movieId,title,genres
links.csv :- movieId,imdbId,tmdbId


Attributes

User id :- User ids are consistent between 'ratings.csv' and 'tags.csv'

Movie Ids :- Only movies with at least one rating or tag are included in the dataset. Movie ids are consistent between 'ratings.csv', 'tags.csv', 'movies.csv', and 'links.csv'. It is an identifier for movies used by <https://movielens.org>.

rating :- Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps :- Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.

Imd Id :- imdbId is an identifier for movies used by <http://www.imdb.com>.

Tmb Id :- tmdbId is an identifier for movies used by <https://www.themoviedb.org>.

Tags :- Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value,         and purpose of a particular tag is determined by each user.


# User-Based Similarity 

In [97]:
df = pd.read_csv("ratings.csv")
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,9.649827e+08
1,1,3,4.0,9.649812e+08
2,1,6,4.0,9.649822e+08
3,1,47,5.0,9.649838e+08
4,1,50,5.0,9.649829e+08
...,...,...,...,...
80786,509,95193,2.0,1.436001e+09
80787,509,95558,3.0,1.436030e+09
80788,509,95858,3.5,1.436028e+09
80789,509,96110,3.0,1.435999e+09


## 2) Pivot table 

In [98]:
dg=pd.DataFrame(pd.pivot(df, index="userId", columns="movieId",values='rating'))
dg.head(10)
dg.replace(np.nan, 0, inplace=True)
dg


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3) Similarity among users 

In [99]:
distance= pairwise_distances(dg, metric='euclidean')
distance

array([[ 0.        , 70.43614129, 69.33613776, ..., 67.61656602,
        67.51481319, 88.12632978],
       [70.43614129,  0.        , 29.45759664, ..., 26.81883666,
        24.62722071, 68.64765109],
       [69.33613776, 29.45759664,  0.        , ..., 25.42636427,
        23.1030301 , 69.26579242],
       ...,
       [67.61656602, 26.81883666, 25.42636427, ...,  0.        ,
        19.62778643, 67.57403347],
       [67.51481319, 24.62722071, 23.1030301 , ..., 19.62778643,
         0.        , 67.42032335],
       [88.12632978, 68.64765109, 69.26579242, ..., 67.57403347,
        67.42032335,  0.        ]])

In [128]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(distance)
similarity


# optionally convert it to a DataFrame
sim = pd.DataFrame(similarity, index=df.userId.unique(), columns=df.userId.unique())
jac_sim

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,500,501,502,503,504,505,506,507,508,509
1,1.000000,0.954491,0.951977,0.995614,0.957027,0.994995,0.988972,0.955194,0.959958,0.985514,...,0.976433,0.955030,0.957489,0.977513,0.981026,0.959880,0.957213,0.935899,0.934153,0.996277
2,0.954491,1.000000,0.999003,0.970227,0.997576,0.964936,0.984019,0.996509,0.998766,0.987897,...,0.994535,0.998500,0.998917,0.994041,0.991906,0.998853,0.999071,0.996357,0.996999,0.956050
3,0.951977,0.999003,1.000000,0.968213,0.997822,0.962880,0.982304,0.996757,0.998746,0.986158,...,0.993944,0.998880,0.999038,0.992852,0.990863,0.998759,0.999053,0.997336,0.997979,0.952987
4,0.995614,0.970227,0.968213,1.000000,0.972047,0.995742,0.994759,0.970148,0.974617,0.993007,...,0.987186,0.970591,0.972683,0.987782,0.990487,0.974565,0.972509,0.954606,0.953466,0.994999
5,0.957027,0.997576,0.997822,0.972047,1.000000,0.969193,0.984912,0.999083,0.997706,0.987736,...,0.994971,0.998211,0.998039,0.993900,0.992000,0.997928,0.997913,0.996862,0.995178,0.957480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,0.959880,0.998853,0.998759,0.974565,0.997928,0.969609,0.986876,0.996927,0.998995,0.990067,...,0.996185,0.998556,0.999008,0.995633,0.993887,1.000000,0.999029,0.995506,0.995771,0.960858
506,0.957213,0.999071,0.999053,0.972509,0.997913,0.967490,0.985495,0.996804,0.999006,0.989155,...,0.995512,0.998749,0.999078,0.994632,0.993024,0.999029,1.000000,0.996103,0.996575,0.958722
507,0.935899,0.996357,0.997336,0.954606,0.996862,0.950449,0.972205,0.996909,0.995222,0.976528,...,0.987564,0.996669,0.996110,0.985928,0.982830,0.995506,0.996103,1.000000,0.998584,0.936688
508,0.934153,0.996999,0.997979,0.953466,0.995178,0.947173,0.971157,0.994239,0.995861,0.976105,...,0.987115,0.996782,0.996591,0.985562,0.982557,0.995771,0.996575,0.998584,1.000000,0.935354


## 6) 5 most similar users with user id 10 

In [111]:
arr = [np.argsort(similarity[9])[-6:-1]]  #gets the list indices sorted in ascending order(top 5)
arr = np.asarray(arr)+1   #user id is the index+1
arr

#similarity[9][227]

array([[ 24,  83, 466, 460,  98]], dtype=int64)

In [None]:
# movies = pd.read_csv("movies.csv")
movies


## 7) Common movies between user 2 and user 338 

In [106]:
for i in dg.columns:
    a = int(dg[dg.index==2][i])
    b = int(dg[dg.index==338][i])
    if ((a!= 0) and (b!=0)) :
        print(movies[movies["movieId"]==i]["title"])
    


277    Shawshank Redemption, The (1994)
Name: title, dtype: object
4615    Kill Bill: Vol. 1 (2003)
Name: title, dtype: object


## 8) Common movies between user 2 and 338 with least rating of 4.0 

In [107]:
for i in dg.columns:
    a = float(dg[dg.index==2][i])
    b = float(dg[dg.index==338][i])
    if ((a>=4) and (b>=4)) :
        print(movies[movies["movieId"]==i]["title"])
    


4615    Kill Bill: Vol. 1 (2003)
Name: title, dtype: object


# Item based similarity

## Pre-processing

In [None]:
dh=pd.read_csv("genre.csv")
dh

In [14]:
records=list()

for i in range(0,9741):
     records.append([str(dh.values[i,j]) for j in range(0,9)])
records

[['Adventure',
  'Animation',
  'Children',
  'Comedy',
  'Fantasy',
  'nan',
  'nan',
  'nan',
  'nan'],
 ['Adventure',
  'Children',
  'Fantasy',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan',
  'nan'],
 ['Comedy', 'Romance', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Comedy', 'Drama', 'Romance', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Comedy', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Action', 'Crime', 'Thriller', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Comedy', 'Romance', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Adventure', 'Children', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Action', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Action', 'Adventure', 'Thriller', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Comedy', 'Drama', 'Romance', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Comedy', 'Horror', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'],
 ['Adventure',
  'Animation',
  'Children',
  'nan',
  'na

In [70]:
te = TransactionEncoder()
arr=te.fit(records).transform(records)
df = pd.DataFrame(arr, columns=te.columns_)

df.drop(columns=['nan'], axis=1, inplace=True)
df.head(11)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,False,False,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False
6,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
7,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False


In [71]:
from sklearn.metrics import jaccard_score
dg=df
dg=dg.join(movies['movieId'])
dg=dg.set_index(dg['movieId'])
dg

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,True,True,True,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,1
2,False,False,True,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,2
3,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,3
4,False,False,False,False,False,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,4
5,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,193579
193581,False,True,False,True,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,193581
193583,False,False,False,True,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,193583
193585,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,193585


## 9)Table representing the similarity between movies 

In [69]:
jac_sim = 1 - pairwise_distances(dg, metric = "hamming")
# optionally convert it to a DataFrame
jac_sim = pd.DataFrame(jac_sim, index=dg['movieId'], columns=dg['movieId'])
jac_sim

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.857143,0.714286,0.666667,0.761905,0.571429,0.714286,0.809524,0.666667,0.666667,...,0.714286,0.714286,0.714286,0.714286,0.761905,0.666667,0.809524,0.857143,0.666667,0.714286
2,0.857143,1.000000,0.714286,0.666667,0.761905,0.666667,0.714286,0.904762,0.761905,0.761905,...,0.714286,0.619048,0.714286,0.714286,0.761905,0.761905,0.714286,0.761905,0.761905,0.714286
3,0.714286,0.714286,1.000000,0.904762,0.904762,0.714286,0.952381,0.761905,0.809524,0.714286,...,0.761905,0.761905,0.761905,0.857143,0.809524,0.809524,0.761905,0.809524,0.809524,0.761905
4,0.666667,0.666667,0.904762,1.000000,0.857143,0.666667,0.904762,0.714286,0.761905,0.666667,...,0.714286,0.714286,0.809524,0.904762,0.761905,0.761905,0.714286,0.761905,0.857143,0.714286
5,0.761905,0.761905,0.904762,0.857143,1.000000,0.761905,0.904762,0.809524,0.857143,0.761905,...,0.809524,0.809524,0.809524,0.904762,0.857143,0.857143,0.809524,0.857143,0.857143,0.809524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.666667,0.761905,0.809524,0.761905,0.857143,0.761905,0.809524,0.809524,0.857143,0.761905,...,0.714286,0.714286,0.809524,0.809524,0.857143,1.000000,0.714286,0.761905,0.857143,0.809524
193581,0.809524,0.714286,0.761905,0.714286,0.809524,0.714286,0.761905,0.666667,0.809524,0.714286,...,0.761905,0.857143,0.761905,0.761905,0.809524,0.714286,1.000000,0.904762,0.714286,0.857143
193583,0.857143,0.761905,0.809524,0.761905,0.857143,0.666667,0.809524,0.714286,0.761905,0.666667,...,0.714286,0.809524,0.809524,0.809524,0.857143,0.761905,0.904762,1.000000,0.761905,0.809524
193585,0.666667,0.761905,0.809524,0.857143,0.857143,0.761905,0.809524,0.809524,0.857143,0.761905,...,0.714286,0.714286,0.904762,0.904762,0.857143,0.857143,0.714286,0.761905,1.000000,0.809524


## 10) Top 5 movies similar to godfather 

Top 5 movies similar to godfather(crime, drama) based on genre

In [41]:
from sklearn.metrics.pairwise import pairwise_distances
lst=list()
lst=(1 - pairwise_distances(dh, metric = "hamming"))

arr = [np.argsort(lst[659])[-6:-1]]  #gets the list indices sorted in ascending order(top 5)


for i in range(0,5):
    print("\nMovie ", i+1, " :- ", movies.iloc[arr[0][i]]['title'], " genre :- ", movies.iloc[arr[0][i]]['genres'] )


Movie  1  :-  Shattered Glass (2003)  genre :-  Crime|Drama

Movie  2  :-  Paranoid Park (2007)  genre :-  Crime|Drama

Movie  3  :-  Monster (2003)  genre :-  Crime|Drama

Movie  4  :-  Capote (2005)  genre :-  Crime|Drama

Movie  5  :-  Lawless (2012)  genre :-  Crime|Drama
