# Recommendation System for IMDb DATA


In [57]:
import numpy as np
import pandas as pd
import feather as feather
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
from urllib import request
import gzip
from pathlib import Path
import re

In [58]:
# Cosmetic only: sets the notebook UI theme (presumably the jupyterthemes `jt` CLI — it must be installed for this to run).
!jt -t chesterish

In [59]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


## Downloading and decompressing the required data

In [2]:
def download(name,url):
    '''Download one gzipped IMDb dataset and store it decompressed as a TSV file.

    Parameters
    ----------
    name : str
        Base name of the output file; the data is written to
        "Recsys IMDb DATA/<name>.tsv" relative to /notebooks/storage/.
    url : str
        File name appended to https://datasets.imdbws.com/ to form the
        download address (e.g. "title.ratings.tsv.gz").

    Returns
    -------
    None
        Returns early (after printing a message) when the output file
        already exists.

    Notes
    -----
    - Side effect: the working directory is changed to /notebooks/storage/.
    - Failures are reported with a printed message instead of being raised.
    - The data is written to a temporary "<name>.tsv.part" file and renamed
      only once complete, so a failed download never leaves a file that a
      later call would mistake for a finished one.
    '''
    import os  # local import so the function does not rely on the import cell

    base = "https://datasets.imdbws.com/"
    # Plain-Python equivalent of the `%cd` magic (which only works inside
    # IPython); the directory is printed the way `%cd` reported it.
    os.chdir("/notebooks/storage/")
    print(os.getcwd())
    out_file = "Recsys IMDb DATA/" + name + ".tsv"
    # name with which you want to store the data
    if Path(out_file).is_file():
        print(name +" exists already...")
        return
    part_file = Path(out_file + ".part")
    try:
        # Fetch and decompress fully in memory before touching the disk.
        with request.urlopen(base+url) as response:
            payload = gzip.decompress(response.read())
        part_file.parent.mkdir(parents=True, exist_ok=True)
        part_file.write_bytes(payload)
        part_file.replace(out_file)
    except Exception as err:
        # Best-effort like the original, but Ctrl-C is no longer swallowed and
        # the real cause is shown next to the hint.
        if part_file.exists():
            part_file.unlink()
        print("Please enter a valid url... (" + repr(err) + ")")

In [3]:
# Local dataset name -> archive file name passed to download() as its `url` argument.
url = dict([
    ("Ratings", "title.ratings.tsv.gz"),
    ("Title(basic)", "title.basics.tsv.gz"),
    ("Names", "name.basics.tsv.gz"),
    ("Directors", "title.crew.tsv.gz"),
    ("Title(extra)", "title.akas.tsv.gz"),
    ("Mains", "title.principals.tsv.gz"),
])

In [4]:
# Fetch every dataset, in the same order as before; files already on disk are skipped by download().
for dataset in ("Ratings", "Directors", "Title(basic)", "Names", "Title(extra)", "Mains"):
    download(dataset, url[dataset])

/storage
Ratings exists already...
/storage
Directors exists already...
/storage
Title(basic) exists already...
/storage
Names exists already...
/storage
Title(extra) exists already...
/storage
Mains exists already...


## Saving acquired data 
- The downloaded data is stored in *feather* format for fast retrieval 
- Note that it resides in **feather_data** directory in the **DATA** folder

In [46]:
# Parse the tab-separated ratings dump once and cache it as feather.
rating = pd.read_csv("Ratings.tsv", delimiter="\t")
rating.to_feather("feather_data/ratings")


In [66]:
# Parse the crew dump, discard the "writers" column, and cache the rest as feather.
director = pd.read_csv("Directors.tsv", delimiter="\t").drop(columns="writers")
director.to_feather("feather_data/director")

In [55]:
# Parse the tab-separated names dump once and cache it as feather.
names = pd.read_csv("Names.tsv", delimiter="\t")
names.to_feather("feather_data/names")

In [58]:
# Parse both title dumps first, then cache each as feather.
# low_memory=False makes pandas read each file in one pass rather than in chunks.
title_B = pd.read_csv("Title(basic).tsv", delimiter="\t", low_memory=False)
title_E = pd.read_csv("Title(extra).tsv", delimiter="\t", low_memory=False)
title_B.to_feather("feather_data/title_basic")
title_E.to_feather("feather_data/title_extra")


In [5]:
# Parse the principals dump in one pass (no chunked reading) and cache it as feather.
mains = pd.read_csv("Mains.tsv", delimiter="\t", low_memory=False)
mains.to_feather("feather_data/mains")

## Reading from feather
- **Always always always read from feather format...**

In [60]:
def read_data(name):
    '''Load one cached dataframe from the feather_data directory.

    Parameters
    ----------
    name : str
        File name inside /notebooks/storage/Recsys IMDb DATA/feather_data
        (e.g. "ratings").

    Returns
    -------
    pandas.DataFrame
        The dataframe stored in that feather file.

    Notes
    -----
    Side effect: the working directory is changed to the feather_data
    directory and left there, so relative paths used after this call
    resolve inside it.
    '''
    import os  # local import so the function does not rely on the import cell

    # Plain-Python equivalent of the `%cd` magic (which only works inside
    # IPython); the directory is printed the way `%cd` reported it.
    os.chdir("/notebooks/storage/Recsys IMDb DATA/feather_data")
    print(os.getcwd())
    # Read with pandas, the same library that writes these files via
    # DataFrame.to_feather, instead of the separate `feather` wrapper package.
    df = pd.read_feather(name)
    return df

In [4]:
# Load the cached ratings table; read_data also switches the working directory to feather_data.
rating = read_data("ratings")

/storage/Recsys IMDb DATA/feather_data


# Truncating the data by ratings and votes

- It is observed that a higher average rating does not necessarily mean a more **credible** rating
- If a movie has an average rating of 10 but only 1 vote, then that rating **is not very credible**
- Solution -> introduce a column with the **rating * number of votes** value as its entries and sort according to that column

In [5]:
# Weight each title's average rating by its vote count; the result is displayed for inspection.
product = rating['averageRating'].mul(rating['numVotes'])
product

0        21464167.5
1        20438586.0
2        17890822.4
3        16092507.2
4        16078993.2
            ...    
49995        8619.9
49996        8619.8
49997        8619.6
49998        8619.4
49999        8619.4
Length: 50000, dtype: float64

In [6]:
# Attach the rating-times-votes score as a new column (this modifies `rating` in place), then display it.
rating["product_col"] = product
rating

Unnamed: 0,tconst,averageRating,numVotes,product_col
0,tt0111161,9.3,2307975,21464167.5
1,tt0468569,9.0,2270954,20438586.0
2,tt1375666,8.8,2033048,17890822.4
3,tt0137523,8.8,1828694,16092507.2
4,tt0944947,9.3,1728924,16078993.2
...,...,...,...,...
49995,tt0063231,5.9,1461,8619.9
49996,tt0455135,4.7,1834,8619.8
49997,tt2182427,6.6,1306,8619.6
49998,tt0062113,7.1,1214,8619.4


In [8]:
# Display the table ordered by vote count, most-voted first (at most 600000 rows).
rating.sort_values("numVotes", ascending=False).head(600000)

Unnamed: 0,tconst,averageRating,numVotes,product_col
0,tt0111161,9.3,2307975,21464167.5
1,tt0468569,9.0,2270954,20438586.0
2,tt1375666,8.8,2033048,17890822.4
3,tt0137523,8.8,1828694,16092507.2
5,tt0110912,8.9,1801215,16030813.5
...,...,...,...,...
48559,tt7822474,9.7,928,9001.6
49268,tt5869624,9.5,928,8816.0
48558,tt12339616,9.7,928,9001.6
49461,tt0666576,9.6,913,8764.8


In [9]:
# Display the table ordered by the rating-times-votes score, highest first (at most 600000 rows).
rating.sort_values("product_col", ascending=False).head(600000)

Unnamed: 0,tconst,averageRating,numVotes,product_col
0,tt0111161,9.3,2307975,21464167.5
1,tt0468569,9.0,2270954,20438586.0
2,tt1375666,8.8,2033048,17890822.4
3,tt0137523,8.8,1828694,16092507.2
4,tt0944947,9.3,1728924,16078993.2
...,...,...,...,...
49995,tt0063231,5.9,1461,8619.9
49996,tt0455135,4.7,1834,8619.8
49997,tt2182427,6.6,1306,8619.6
49998,tt0062113,7.1,1214,8619.4


### Analysing ranks of movies 
- Now we compare the rank each movie has in the two sorted data frames
- First its rank is calculated as present in the data frame **sorted according to NUM OF VOTES**
- Then its rank is calculated as present in the data frame **sorted according to PRODUCT VALUES**

In [70]:
# Title ids of the 10000 most-voted rows, in rank order.
id1 = rating.sort_values("numVotes", ascending=False)['tconst'].head(10000)

In [71]:
# Title ids of the 10000 rows with the highest rating-times-votes score, in rank order.
id2 = rating.sort_values("product_col", ascending=False)['tconst'].head(10000)

In [73]:
# Compare the 1-based rank of every title in id1 (sorted by votes) with its
# rank in id2 (sorted by product) and average the absolute differences.

# Rank lookup for id2, built once so each title costs one dictionary probe
# instead of a scan over id2. setdefault keeps the first occurrence, which is
# what a front-to-back scan that stops at the first match would find.
rank_in_id2 = {}
for c2, j in enumerate(id2, start=1):
    rank_in_id2.setdefault(j, c2)

deviation = []
for c1, k in enumerate(id1, start=1):
    c2 = rank_in_id2.get(k)
    if c2 is not None:  # titles absent from id2 contribute nothing
        deviation.append(abs(c1-c2))

# Guard the mean: with no title in common there is nothing to average.
m = sum(deviation)/(len(deviation)) if deviation else float("nan")
print("Means ,on average, a movie is differing by",m,"ranks in the two dataframes for the first 10000 items and thus our assumption seems to be correct")

Means ,on average, a movie is differing by 412.2270111936395 ranks in the two dataframes for the first 10000 items and thus our assumption seems to be correct


## Taking the first 50,000 movies from the sorted dataframe 
- The top 50,000 titles of the sorted dataframe are kept and stored as our working dataset.


In [92]:
# Order titles by the rating-times-votes score, best first, then renumber the rows from 0.
rating_sorted = rating.sort_values("product_col", ascending=False)
rating_keep = rating_sorted.reset_index(drop=True)

In [97]:
# Keep only the first 50000 rows of the sorted table.
rating_final = rating_keep.iloc[:50000]

In [106]:
# Persist the truncated ratings table.
# NOTE(review): this writes to "feather_data/ratings", the same relative path the raw
# ratings table was saved to earlier, so it appears to replace that file — confirm intended.
%cd /notebooks/storage/Recsys IMDb DATA
rating_final.to_feather("feather_data/ratings")

/storage/Recsys IMDb DATA


In [61]:
# Reload the ratings table from feather; read_data also switches the working directory to feather_data.
rating= read_data("ratings")

/storage/Recsys IMDb DATA/feather_data


In [62]:
# Display the reloaded ratings table.
rating

Unnamed: 0,tconst,averageRating,numVotes,product_col
0,tt0111161,9.3,2307975,21464167.5
1,tt0468569,9.0,2270954,20438586.0
2,tt1375666,8.8,2033048,17890822.4
3,tt0137523,8.8,1828694,16092507.2
4,tt0944947,9.3,1728924,16078993.2
...,...,...,...,...
49995,tt0063231,5.9,1461,8619.9
49996,tt0455135,4.7,1834,8619.8
49997,tt2182427,6.6,1306,8619.6
49998,tt0062113,7.1,1214,8619.4


# Transformations
- The data needs to be truncated and transformed according to the ratings dataset, so that only the information corresponding to the 50k movies present in the **updated ratings** stays
- There are two stages -
    - Removing unnecessary columns
    - Truncating the dataset according to the movies present in the ratings dataset

In [83]:
# Build a membership lookup of the retained title ids (every value is just True),
# then show the first ten ids.
ids = rating['tconst']
id_dict = dict.fromkeys(ids, True)
ids.iloc[:10]

0    tt0111161
1    tt0468569
2    tt1375666
3    tt0137523
4    tt0944947
5    tt0110912
6    tt0109830
7    tt0068646
8    tt0167260
9    tt0120737
Name: tconst, dtype: object

### Removing columns 
- **Title Basic** : *originalTitle, runtimeMinutes* to be removed 
- **Directors** : *writers* to be removed
- **People** : *birthYear and deathYear* to be removed
- **Mains**: to keep all

### title_basic
- Removing columns and truncating data
- <font color='red'>Don't execute cells again below **READ** as data has been transformed and made into a **SINGLE DATAFRAME movies** !</font>

In [99]:
# Load the cached title_basic table; read_data also switches the working directory to feather_data.
title_basic = read_data("title_basic")

/storage/Recsys IMDb DATA/feather_data


In [17]:
# Preview the first rows of the loaded table.
title_basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,genres
0,tt0000001,short,Carmencita,0,1894,\N,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,0,1892,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,0,1893,\N,"Comedy,Short"


In [12]:
# Remove the two columns that are not needed. DataFrame.drop raises KeyError
# when a label is missing, which is the "already dropped" case; catching only
# that lets any other failure surface instead of being silently swallowed.
try:
    title_basic = title_basic.drop(['originalTitle','runtimeMinutes'],axis=1)
except KeyError:
    print("Already prpped data")

In [18]:
# Preview the table again after the column drop.
title_basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,genres
0,tt0000001,short,Carmencita,0,1894,\N,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,0,1892,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,0,1893,\N,"Comedy,Short"


In [22]:
title_basic_keep = pd.DataFrame()
# Vectorised membership test: pick the rows whose tconst is one of the retained
# ids in a single pass, instead of probing every row of the table from Python.
keep_mask = title_basic['tconst'].isin(list(id_dict))
# Expose the matches as {index label: row Series}, the form consumed by the
# DataFrame.from_dict(data, orient='index') step that follows.
data = {i: row for i, row in title_basic[keep_mask].iterrows()}
# new thing -> making a dataframe from dictionary is much much faster than appending each row to the 
# dataframe 

In [46]:
# Assemble the kept rows into a frame (one row per dict entry), then move the
# old row labels into an "index" column.
title_basic_rows = pd.DataFrame.from_dict(data, orient='index', columns=title_basic.columns)
title_basic_keep = title_basic_rows.reset_index()

In [48]:
# Discard the helper "index" column created by reset_index, then display the result.
title_basic_keep = title_basic_keep.drop(columns='index')
title_basic_keep

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,genres
0,tt0000001,short,Carmencita,0,1894,\N,"Documentary,Short"
1,tt0000003,short,Pauvre Pierrot,0,1892,\N,"Animation,Comedy,Romance"
2,tt0000005,short,Blacksmith Scene,0,1893,\N,"Comedy,Short"
3,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,"Documentary,Short"
4,tt0000010,short,Leaving the Factory,0,1895,\N,"Documentary,Short"
...,...,...,...,...,...,...,...
49995,tt9899922,tvEpisode,The Winter Line,0,2020,\N,"Drama,Mystery,Sci-Fi"
49996,tt9900092,tvSeries,Motherland: Fort Salem,0,2020,\N,"Drama,Fantasy,Sci-Fi"
49997,tt9900782,movie,Kaithi,0,2019,\N,"Action,Thriller"
49998,tt9906260,tvEpisode,Hero,0,2019,\N,"Action,Adventure,Animation"


In [49]:
# Persist the filtered titles.
# NOTE(review): relative path — the file lands in the current working directory. read_data
# leaves that at feather_data, where this would replace the full title_basic cache; confirm intended.
title_basic_keep.to_feather("title_basic")

In [120]:
# Reload the saved titles table; read_data also switches the working directory to feather_data.
titles = read_data("title_basic")

/storage/Recsys IMDb DATA/feather_data


In [121]:
# Join title details with their ratings on the shared tconst id (default inner join).
movies = pd.merge(titles, rating, on='tconst')

In [123]:
# Order by the rating-times-votes score, best first; reset_index keeps the old
# row labels in an "index" column.
movies_sorted = movies.sort_values("product_col", ascending=False)
movies = movies_sorted.reset_index()

In [125]:
# Discard the helper "index" column created by reset_index, then display the result.
movies = movies.drop(columns='index')
movies

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,genres,averageRating,numVotes,product_col
0,tt0111161,movie,The Shawshank Redemption,0,1994,\N,Drama,9.3,2307975,21464167.5
1,tt0468569,movie,The Dark Knight,0,2008,\N,"Action,Crime,Drama",9.0,2270954,20438586.0
2,tt1375666,movie,Inception,0,2010,\N,"Action,Adventure,Sci-Fi",8.8,2033048,17890822.4
3,tt0137523,movie,Fight Club,0,1999,\N,Drama,8.8,1828694,16092507.2
4,tt0944947,tvSeries,Game of Thrones,0,2011,2019,"Action,Adventure,Drama",9.3,1728924,16078993.2
...,...,...,...,...,...,...,...,...,...,...
49995,tt0063231,movie,"Live a Little, Love a Little",0,1968,\N,"Comedy,Musical,Romance",5.9,1461,8619.9
49996,tt0455135,movie,The Eye 3,0,2005,\N,"Comedy,Horror",4.7,1834,8619.8
49997,tt2182427,tvSeries,Bering Sea Gold,0,2012,\N,Reality-TV,6.6,1306,8619.6
49998,tt0062113,movie,Peppermint Frappé,0,1967,\N,"Drama,Thriller",7.1,1214,8619.4


In [126]:
# Persist the merged table. Relative path: the file is written to the current working
# directory (read_data leaves that at feather_data).
movies.to_feather("movies")

## The final dataframe is stored as `movies` in the feather data

In [129]:
# Reload the merged table through the shared helper, which switches to the
# feather_data directory and reads the named feather file.
movies = read_data("movies")

/storage/Recsys IMDb DATA/feather_data
