In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msn


In [5]:
df_movies = pd.read_table("dataSet/movies.dat",sep="\t")
pd.set_option('display.max_columns',None)

In [6]:
df_movies.rename(columns={'id': 'movieID'}, inplace=True)

In [7]:
df_genres = pd.read_table("dataSet/movie_genres.dat",sep="\t")

In [8]:
df_directors = pd.read_table("dataSet/movie_directors.dat",sep="\t")

In [9]:
df_actors = pd.read_table("dataSet/movie_actors.dat",sep="\t")

In [10]:
df_countries = pd.read_table("dataSet/movie_countries.dat",sep="\t")

In [11]:
df_locations = pd.read_table("dataSet/movie_locations.dat",sep="\t")

In [12]:
df_tags = pd.read_table("dataSet/tags.dat",sep="\t")

In [13]:
df_tags.rename(columns={'id': 'tagID'}, inplace=True)

In [14]:
df_user_tags = pd.read_table("dataSet/user_taggedmovies.dat",sep="\t")

In [15]:
df_movie_tags = pd.read_table("dataSet/movie_tags.dat",sep="\t")

In [16]:
df_ratings = pd.read_table("dataSet/user_ratedmovies.dat",sep="\t")

#### Merge above dataframes together which are important for the task

In [17]:
#df1.merge(df2, left_on='lkey', right_on='rkey')
df = df_movies.merge(df_ratings, on='movieID')

In [18]:
def get_genres(x):
    genres_list = df_genres[df_genres['movieID']== x].genre.unique()
    return ' '.join(genres_list)

In [19]:
df['genres'] = df['movieID'].apply(lambda x: get_genres(x))

In [20]:
df = df.merge(df_directors, on='movieID',how='left')

In [21]:
def get_actors(x):
    actors_list = df_actors[df_actors['movieID']== x].actorName.unique()
    if len(actors_list) >= 4:
        return ' '.join(actors_list[:4]) 
    else:
        return ' '.join(actors_list)

In [22]:
df['actors'] = df['movieID'].apply(lambda x: get_actors(x))

In [23]:
df = df.merge(df_countries, on='movieID')

In [24]:
movie_ratingCount = (df.groupby(by = ['title'])['rating'].count().reset_index().rename(columns = {'rating': 'ratingCount'})[['title', 'ratingCount']])

In [25]:
df_new = df.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')

In [26]:
is_duplicate = df_new.duplicated()

In [27]:
print("Number of duplicate records:", sum(is_duplicate))

Number of duplicate records: 0


In [52]:
df_cb = df_new.drop_duplicates(subset ="title", keep = 'last')

In [53]:
df_cb.isnull().sum()

movieID                     0
title                       0
imdbID                      0
spanishTitle                0
imdbPictureURL            169
year                        0
rtID                      286
rtAllCriticsRating          0
rtAllCriticsNumReviews      0
rtAllCriticsNumFresh        0
rtAllCriticsNumRotten       0
rtAllCriticsScore           0
rtTopCriticsRating          0
rtTopCriticsNumReviews      0
rtTopCriticsNumFresh        0
rtTopCriticsNumRotten       0
rtTopCriticsScore           0
rtAudienceRating            0
rtAudienceNumRatings        0
rtAudienceScore             0
rtPictureURL                0
userID                      0
rating                      0
date_day                    0
date_month                  0
date_year                   0
date_hour                   0
date_minute                 0
date_second                 0
genres                      0
directorID                 39
directorName               39
actors                      0
country   

In [54]:
df_cb = df_cb.fillna('unknown')

In [55]:
df_cb = df_cb[['movieID','title','userID','rating','genres','directorName','actors','country','ratingCount']]

In [56]:
df_cb.head(2)

Unnamed: 0,movieID,title,userID,rating,genres,directorName,actors,country,ratingCount
1262,1,Toy story,71529,4.5,Adventure Animation Children Comedy Fantasy,John Lasseter,Annie Potts Bill Farmer Don Rickles Erik von D...,USA,1263
2027,2,Jumanji,71497,3.5,Adventure Children Fantasy,Joe Johnston,Peter Bryant Adam Hann-Byrd Bebe Neuwirth Bonn...,USA,765


In [66]:
popularity_threshold = 300
popular_movie=df_cb.query('ratingCount >= @popularity_threshold')

In [67]:
popular_movie.shape

(772, 9)

In [68]:
popular_movie['description'] = popular_movie['title']+' '+popular_movie['genres']+' '+popular_movie['directorName']+' '+popular_movie['actors']+' '+popular_movie['country']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [69]:
popular_movie.head(2)

Unnamed: 0,movieID,title,userID,rating,genres,directorName,actors,country,ratingCount,description
1262,1,Toy story,71529,4.5,Adventure Animation Children Comedy Fantasy,John Lasseter,Annie Potts Bill Farmer Don Rickles Erik von D...,USA,1263,Toy story Adventure Animation Children Comedy ...
2027,2,Jumanji,71497,3.5,Adventure Children Fantasy,Joe Johnston,Peter Bryant Adam Hann-Byrd Bebe Neuwirth Bonn...,USA,765,Jumanji Adventure Children Fantasy Joe Johnsto...


In [70]:
popular_movie['description'] = popular_movie['description'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [65]:
popular_movie['description'][1262]

'Toy story Adventure Animation Children Comedy Fantasy John Lasseter Annie Potts Bill Farmer Don Rickles Erik von Detten USA'

In [86]:
popular_movie[['title','userID','description']].to_csv('modified_content_dataSet.csv',index=False)

In [87]:
dff = pd.read_csv('modified_content_dataSet.csv')

In [88]:
dff.head(2)

Unnamed: 0,title,userID,description
0,Toy story,71529,toy story adventure animation children comedy ...
1,Jumanji,71497,jumanji adventure children fantasy joe johnsto...


In [89]:
dff.isnull().sum()

title          0
userID         0
description    0
dtype: int64

In [90]:
dff.shape

(772, 3)

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(dff['description'])

In [93]:
tfidf_matrix.shape

(772, 12535)

In [104]:
from sklearn.metrics.pairwise import linear_kernel

In [105]:
sig = linear_kernel(tfidf_matrix, tfidf_matrix)

In [106]:
dff = dff.reset_index()
titles = dff['title']
indices = pd.Series(dff.index, index=dff['title'])
#indices.head(2)

In [116]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(sig[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices].values.tolist()[:10]

In [117]:
get_recommendations('The Godfather')

['The Godfather: Part II',
 'The Godfather: Part III',
 'Apocalypse Now',
 'Unforgiven',
 'Tombstone',
 '3:10 to Yuma',
 'Psycho',
 'The Boondock Saints',
 'Michael Clayton',
 'The Great Escape']

In [109]:
get_recommendations('The Dark Knight').head(10)

712    Thank You for Smoking
504                 Insomnia
682            Batman Begins
382          Erin Brockovich
100            The Cable Guy
27              First Knight
434                  Memento
732             The Prestige
601              Man on Fire
650      Grosse Pointe Blank
Name: title, dtype: object