In [260]:
import numpy as np
import pandas as pd
import feather as feather 
import matplotlib.pyplot as plt
import seaborn as sns
import random as random
import math
import sklearn.preprocessing
import json as json

In [2]:
%matplotlib inline

In [3]:
%cd /notebooks/storage/Recsys\ IMDb\ DATA/feather_data/

/storage/Recsys IMDb DATA/feather_data


In [4]:
%cd dataframes_csv/

/storage/Recsys IMDb DATA/feather_data/dataframes_csv


In [93]:
movies = pd.read_csv(r"movies.csv",low_memory=0)

In [None]:
movies = movies.drop('Unnamed: 0',axis=1)

In [195]:
movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,genres,averageRating,numVotes,directors,weight_average
0,tt0111161,movie,The Shawshank Redemption,0,1994,\N,Drama,9.3,2307975,nm0001104,9.227372
1,tt0468569,movie,The Dark Knight,0,2008,\N,"Action,Crime,Drama",9.0,2270954,nm0634240,8.936657
2,tt1375666,movie,Inception,0,2010,\N,"Action,Adventure,Sci-Fi",8.8,2033048,nm0634240,8.737265
3,tt0137523,movie,Fight Club,0,1999,\N,Drama,8.8,1828694,nm0000399,8.730554
4,tt0944947,tvSeries,Game of Thrones,0,2011,2019,"Action,Adventure,Drama",9.3,1728924,"nm0755261,nm0638354,nm0851930,nm0336241,nm0787...",9.204146


In [180]:
movies = movies[:40000]

# Making a simple randomized recommender based on Genre
- <font color = 'green'> Objective </font> : To make valid recommnedations for a user when he or she enters a <font color='cyan'>genre.</font> 
- This model quickly makes a preidiction on the basis of the genre entered by user and recommends the top movies in that genre, for the user to see.
- I think it is important to compute the weighted average by giving some weight to the <font color='green'> release year </font> of the movie as it can be safely assumed that newer movies would not be seen by the users.


In [95]:
# Quantile just returns what is the split point for this quantile q.
# here it means that 10% of th movies have no. of votes less than 1389 and 90% have votes >1389.
# Nice!
q = movies['numVotes'].quantile(0.10)
print("10% quantile : "+str(q))
print("Proving...")
a = movies['numVotes']<1389
f = [i for i in a if i== True]
print("No. of votes less than 1389 :",len(f))
v = round(sum(f)/len(movies),2)
print("The percentage of the data is",v*100,"%")
print("Correct!")

10% quantile : 1389.0
Proving...
No. of votes less than 1389 : 4997
The percentage of the data is 10.0 %
Correct!


## Calculating the weighted average
- A weighted average for the movies is calculated based on -
    - the minimum no. of votes required for the movie to be in the top 250
    - their actual votes 
    - the average rating given to the movie.
<font size = 4> Formula </font>
$$W = \frac{R*v + C*m}{v+m}$$
  where :
        - W is the weighted average
        - Rv is the average rating for the movie
        - Cm is the mean of all the average ratings 
        - v is the number of votes for the movie 
        - m is the minimum no. of votes for the movie to be included in top250

In [97]:
m = movies['numVotes'].quantile(0.95)
c = movies['averageRating'].mean()
v = movies['numVotes']>m
print("Number of movies with votes greater than the 95 percentile :",len([i for i in v if i == True]))


Number of movies with votes greater than the 95 percentile : 2500


In [98]:
movies['weight_average'] = (movies['averageRating']*movies['numVotes']+c*m)/(movies['numVotes']+m)

In [255]:
def recommend_weighted(genre):
    genre = "".join(genre.split()) # Remove white space
    data = {}
    genre = genre.capitalize()
    if('Science' in genre or 'Sci' in genre):
        genre = 'Sci-Fi'
    print("Genre :",genre)
    count=0
    for i in movies.index:
        genres = movies['genres'][i].split(',')
        if genre in genres:
            data[i] = movies.iloc[i]
    frame = pd.DataFrame.from_dict(data,orient='index',columns = movies.columns).reset_index().drop('index',axis=1).sort_values('weight_average',ascending=False)  
    # Removing redundant columns 
    remove_col = ['tconst','endYear','directors']
    frame = frame.drop(remove_col,axis = 1)
    # Returning a randomised sample of 15 rows out of the 100
    # This last line can be appended with a sort or can be left as it is
    return frame[:200].sample(frac=0.1)# .sort_values(by='weight_average',ascending=False).drop("weight_average",axis=1)

In [229]:
import functools as f
def comp(a,b):
    return a[1]*a[2] > b[1]*b[2]
def recommend_weighted_fast(genre):
    genre = "".join(genre.split())
    data = []
    genre = genre.capitalize()
    count=0
    for i in movies.index:
        genres = movies['genres'][i].split(',')
        if genre in genres:
            data.append([movies['primaryTitle'][i],movies['weight_average'][i],movies['startYear'][i]])
    final = sorted(data,key=f.cmp_to_key(comp))[:200]
    random.shuffle(final)
    res= []
    for k in final[:15]:
        res.append(k[0])
    return res

# Results
- We can see the <font color='green'>trivial recommenders</font> are also giving good results according to the genre that is given to them.
- Yet to do : <font color ='green'> recommend according to movie similarity </font>


In [258]:
recommend_weighted("Mystery")

Genre : Mystery


Unnamed: 0,titleType,primaryTitle,isAdult,startYear,genres,averageRating,numVotes,weight_average
148,tvSeries,How to Get Away with Murder,0,2014,"Crime,Drama,Mystery",8.1,123994,7.733432
60,tvSeries,Money Heist,0,2017,"Action,Crime,Mystery",8.4,280839,8.12433
81,tvSeries,The X-Files,0,1993,"Crime,Drama,Mystery",8.6,196049,8.181328
70,tvSeries,13 Reasons Why,0,2017,"Drama,Mystery,Thriller",7.6,251165,7.496257
57,movie,The Devil's Advocate,0,1997,"Drama,Mystery,Thriller",7.5,327477,7.435583
462,tvSeries,Upload,0,2020,"Comedy,Mystery,Sci-Fi",8.0,35055,7.424376
76,tvSeries,True Blood,0,2008,"Drama,Fantasy,Mystery",7.8,227123,7.635235
183,movie,The Name of the Rose,0,1986,"Crime,Drama,Mystery",7.7,100928,7.46619
261,tvSeries,The Good Wife,0,2009,"Crime,Drama,Mystery",8.3,67447,7.684921
52,tvSeries,Homeland,0,2011,"Crime,Drama,Mystery",8.3,307703,8.06434


In [259]:
recommend_weighted_fast("Mystery" )

['Happy Death Day',
 'Devil',
 'Secret Window',
 'The Gift',
 'The Sixth Sense',
 'Signs',
 '1408',
 'Saw',
 'Talk to Her',
 'Riverdale',
 'Black Mirror: Bandersnatch',
 'Sherlock',
 'The Jacket',
 'Annabelle',
 'The Secret in Their Eyes']