# Content Based Recommendation System

# Q1 Read the Dataset `movies_metadata.csv`

In [672]:
import pandas as pd
import numpy as np
import ast
from pandas import isnull

In [673]:
movies_df = pd.read_csv("movies_metadata-1.csv", low_memory = False)

In [674]:
def convertStringtoDict(x):
    if(isnull(x) == False):
        return ast.literal_eval(x)
    else:
        return 0

In [675]:
dictCols = ['belongs_to_collection', 'genres', 'spoken_languages']
for col in movies_df.columns:
    if(col in dictCols):
        movies_df[col] = movies_df[col].apply(convertStringtoDict)

In [676]:
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,0,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,0,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [677]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    45466 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45466 non-null object
status                   45379 non-null obje

In [678]:
movies_df.isna().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             0
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64


# Q2 Create a new column with name 'description' combining 'overview' and 'tagline' columns in the given dataset

In [679]:
movies_df['description'] = movies_df['overview'] + movies_df['tagline']

In [680]:
movies_df.description[1]

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.Roll the dice and unleash the excitement!"

# Q3  Lets drop the null values in `description` column

In [681]:
movies_df.dropna(subset = ['description'], inplace = True)

In [682]:
movies_df.isna().sum()

adult                        0
belongs_to_collection        0
budget                       0
genres                       0
homepage                 15994
id                           0
imdb_id                      4
original_language            0
original_title               0
overview                     0
popularity                   0
poster_path                 15
production_companies         0
production_countries         0
release_date                14
revenue                      0
runtime                      0
spoken_languages             0
status                      14
tagline                      0
title                        0
video                        0
vote_average                 0
vote_count                   0
description                  0
dtype: int64

# Q4 Keep the first occurance and drop duplicates of each title in column 'title'

In [683]:
movies_df.drop_duplicates(keep = 'first', subset = ['title'], inplace = True)

# Q5   As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [684]:
movies_df.reset_index(drop=True)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
0,False,0,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,When siblings Judy and Peter discover an encha...
1,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,A family wedding reignites the ancient feud be...
2,False,0,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"Cheated on, mistreated and stepped on, the wom..."
3,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,Just when George Banks has recovered from his ...
4,False,0,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"Obsessive master thief, Neil McCauley leads a ..."
5,False,0,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,0.0,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0,An ugly duckling having undergone a remarkable...
6,False,0,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45325,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,0.0,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,"A mischievous young boy, Tom Sawyer, witnesses..."
7,False,0,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0,International action superstar Jean Claude Van...
8,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,James Bond must unmask the mysterious head of ...
9,False,0,62000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,9087,tt0112346,en,The American President,"Widowed U.S. president Andrew Shepherd, one of...",...,107879496.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Why can't the most powerful man in the world h...,The American President,False,6.5,199.0,"Widowed U.S. president Andrew Shepherd, one of..."


# Q6    Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

Hint:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

In [685]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english', min_df = 0)
tf_idf_vect.fit(movies_df['description'])
desc_matrix = tf_idf_vect.transform(movies_df["description"])

# Q7  Create cosine similarity matrix

In [686]:
from sklearn.metrics.pairwise import cosine_similarity

In [687]:
cosine_sim_titles = cosine_similarity(desc_matrix)

# Q8  Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

Hint:

titles = df['title'] <br>
indices = pd.Series(df.index, index=df['title']) <br>

def recommend(title): <br>
    idx = indices[title] <br>
    sim_scores = list(enumerate(cosine_similarities[idx])) <br>
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) <br>
    sim_scores = sim_scores[1:31] <br>
    movie_indices = [i[0] for i in sim_scores] <br>
    return titles.iloc[movie_indices] <br>

In [688]:
titles = movies_df[['title']]
indices = pd.Series(movies_df.index, index = movies_df['title'])

In [689]:
def recommend(title):
    idx = indices.get(title)
    sim_scores = list(enumerate(cosine_sim_titles[idx]))
    sim_scores = sorted(sim_scores, key = lambda x : x[1], reverse = True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [690]:
recommend('Jumanji')

Unnamed: 0,title
3330,Grumpy Old Men
23877,The Hundred-Foot Journey
9116,Wishful Thinking
443,Fearless
21897,Bait Shop
456,Go Fish
235,A Goofy Movie
31481,Burying the Ex
844,Gone Fishin'


# Q9 Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [691]:
recommend('The Godfather')

Unnamed: 0,title
24240,Cabin Fever: Patient Zero
13358,Beachhead
22383,Escape to Athena
28040,The Glory Brigade
8431,Isle of the Dead
6301,"Heaven Knows, Mr. Allison"
12768,Mamma Mia!
25705,Fishtales
4235,Too Late the Hero


In [692]:
recommend('The Dark Knight Rises')

Unnamed: 0,title
8079,Masques
22243,All Things To All Men
32377,Cop Car
12660,Cold Sweat
12591,Redbelt
39900,Amateur Porn Star Killer 3: The Final Chapter
32891,Knock Knock
3642,F/X2
22560,A Single Shot


# Popularity Based Recommendation System

### About Dataset

Anonymous Ratings on jokes.

1. Ratings are real values ranging from -10.00 to +10.00 (the value "99" corresponds to "null" = "not rated").

2. One row per user

3. The first column gives the number of jokes rated by that user. The next 100 columns give the ratings for jokes 01 - 100.

# Q10 Read the dataset(jokes.csv)



In [693]:
jokes_df = pd.read_csv("jokes-1.csv")

In [694]:
jokes_df.head()

Unnamed: 0,NumJokes,Joke1,Joke2,Joke3,Joke4,Joke5,Joke6,Joke7,Joke8,Joke9,...,Joke91,Joke92,Joke93,Joke94,Joke95,Joke96,Joke97,Joke98,Joke99,Joke100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [695]:
jokes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24983 entries, 0 to 24982
Columns: 101 entries, NumJokes to Joke100
dtypes: float64(100), int64(1)
memory usage: 19.3 MB


# Q11 Consider `ratings` named dataframe with only first 200 rows and all columns from 1(first column is 0) of dataset

In [696]:
ratings = jokes_df.iloc[0:200, 1:]

In [697]:
ratings.head()

Unnamed: 0,Joke1,Joke2,Joke3,Joke4,Joke5,Joke6,Joke7,Joke8,Joke9,Joke10,...,Joke91,Joke92,Joke93,Joke94,Joke95,Joke96,Joke97,Joke98,Joke99,Joke100
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


# Q12 Change the column indices from 0 to 99

In [698]:
ratings.columns = range(100)

In [699]:
ratings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,1.84,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


# Q13 In the dataset, the null ratings are given as 99.00, so replace all 99.00s with 0
Hint: You can use `ratings.replace(<the given value>, <new value you wanted to change with>)`

In [700]:
ratings.replace(99.00, 0, inplace = True)

In [701]:
ratings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,...,2.82,0.0,0.0,0.0,0.0,0.0,-5.63,0.0,0.0,0.0
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,0.0,0.0,0.0,0.0,9.03,9.27,9.03,9.27,0.0,0.0,...,0.0,0.0,0.0,9.08,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,8.35,0.0,0.0,1.8,8.16,-2.82,6.21,0.0,1.84,...,0.0,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.0
4,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,5.73,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


# Q14 Normalize the ratings using StandardScaler and save them in `ratings_diff` variable

In [702]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
ratings_diff = pd.DataFrame(sc.fit_transform(ratings))

### Popularity based recommendation system

# Q15  Find the mean for each column  in `ratings_diff` i.e, for each joke
Consider all the mean ratings and find the jokes with highest mean value and display the top 10 joke IDs.

In [703]:
mean_ratings = pd.DataFrame(ratings_diff.describe().loc['mean'])

In [704]:
mean_ratings_df = mean_ratings.set_index(ratings_diff.columns).T

In [705]:
sorted_ratings = sorted(mean_ratings_df.T.values, reverse = True)

In [706]:
sorted_df = mean_ratings_df.iloc[:, np.argsort(mean_ratings_df.loc['mean'])].T

In [707]:
top_rated = sorted_df.sort_values(by = 'mean', ascending = False)

In [708]:
top_rated.head(10)

Unnamed: 0,mean
98,1.840889e-16
81,1.24345e-16
97,1.198694e-16
20,1.065814e-16
94,1.054712e-16
92,8.770762000000001e-17
47,7.771561000000001e-17
73,7.743806000000001e-17
99,7.549517e-17
49,7.494005000000001e-17


In [709]:
joke_index = []
for col in top_rated.T.columns:
    col = 'Joke' + str(col + 1)
    joke_index.append(col)

In [710]:
indexed_jokes = pd.DataFrame(top_rated.values, index = joke_index)

In [711]:
indexed_jokes.head(10)

Unnamed: 0,0
Joke99,1.840889e-16
Joke82,1.24345e-16
Joke98,1.198694e-16
Joke21,1.065814e-16
Joke95,1.054712e-16
Joke93,8.770762000000001e-17
Joke48,7.771561000000001e-17
Joke74,7.743806000000001e-17
Joke100,7.549517e-17
Joke50,7.494005000000001e-17


From the above, we see that the below jokes are the top rated (with IDs in brackets):

1. Joke 99 (ID: 98)
2. Joke 82 (ID: 81)
3. Joke 98 (ID: 97)
4. Joke 21 (ID: 20)
5. Joke 95 (ID: 94)
6. Joke 93 (ID: 92)
7. Joke 48 (ID: 47)
8. Joke 74 (ID: 73)
9. Joke 100 (ID: 99)
10. Joke 50 (ID: 49)