## Data Preprocessing to get Final Data

In [1]:
import ast
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Read the two raw CSV files and report the size of each table
credits_df = pd.read_csv("credits.csv")
titles_df = pd.read_csv("titles.csv")

credits_rows, credits_cols = credits_df.shape
titles_rows, titles_cols = titles_df.shape
print(f"The credits data has {credits_rows} rows and {credits_cols} columns")
print(f"The titles data has {titles_rows} rows and {titles_cols} columns")

The credits data has 77213 rows and 5 columns
The titles data has 5806 rows and 15 columns


In [3]:
# Peek at five randomly chosen credit rows
credits_df.sample(n=5)

Unnamed: 0,person_id,id,name,character,role
18366,21334,tm181270,Jodi Lyn O'Keefe,Chelle Ringell,ACTOR
56875,1562362,tm925785,Mauricio Aspe,,ACTOR
41595,1185475,tm411503,Antonia Gentry,,ACTOR
49380,282177,tm460124,Alisha Boe,Nina,ACTOR
42807,870491,tm372061,Hikmathulla,Journalist,ACTOR


In [4]:
# Peek at five randomly chosen title rows
titles_df.sample(n=5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
2604,tm233484,Duck Duck Goose,MOVIE,"After he’s grounded by an injury, a high-flyin...",2018,PG,82,"['comedy', 'family', 'animation']",['CN'],,tt4940416,5.7,3686.0,17.258,6.2
1112,tm149747,The Perfect Dictatorship,MOVIE,"TV MX, the most powerful Mexican Television Co...",2014,,143,"['comedy', 'drama']",['MX'],,tt3970854,7.2,5370.0,10.475,7.3
3233,tm453862,The Call,MOVIE,Connected by phone in the same home but 20 yea...,2020,,112,"['thriller', 'scifi', 'drama', 'crime', 'horror']",['KR'],,tt10530176,7.1,29450.0,34.559,7.6
1971,ts86169,The Innocent Man,SHOW,In a story that gained national attention with...,2018,TV-MA,47,"['documentation', 'crime']",['US'],1.0,tt0914376,7.3,5883.0,7.434,7.0
5668,tm1024062,Nayattu,MOVIE,Three police officers are forced to go on the ...,2021,,124,"['drama', 'thriller', 'crime']",['IN'],,tt11604676,8.1,6780.0,4.151,7.6


## Data Wrangling

Visually assessing the credits data, we can see that the same title id appears on many rows, once for each actor or director credited on that title.

To make the merge with the titles data one-to-one, I will drop the duplicate rows for each id, which keeps only the first name listed for every title.

In [5]:
# Number of credit rows whose title id has already appeared on an earlier row
credits_df.duplicated(subset='id').sum()

71779

In [6]:
# Rows that remain once each title id is kept only once
credits_df.drop_duplicates(subset='id').shape[0]

5434

In [7]:
# One (id, name) pair per title: the first credit row encountered for each id
new_credits_df = (
    credits_df
    .loc[:, ['id', 'name']]
    .drop_duplicates(subset=['id'], keep='first')
)

In [8]:
# Spot-check five random rows of the de-duplicated credits
new_credits_df.sample(n=5)

Unnamed: 0,id,name
38292,tm312767,Katy Townsend
38626,tm314829,Penelope Ann Miller
73707,ts297065,Miryam Lumpini
60743,tm475162,Hong Chau
69182,tm1107225,Clara Galle


In [9]:
# Inner-join the titles with the de-duplicated credits on the shared `id` key;
# titles that have no row in new_credits_df are not carried into final_df
final_df = titles_df.merge(new_credits_df, how='inner', on='id')

In [10]:
# Spot-check five random rows of the merged frame
final_df.sample(n=5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,name
382,tm31189,The Spiderwick Chronicles,MOVIE,Upon moving into the run-down Spiderwick Estat...,2008,PG,96,"['fantasy', 'thriller', 'action', 'drama', 'fa...",['US'],,tt0416236,6.5,94628.0,30.838,6.6,Sarah Bolger
3842,tm470378,ReMastered: Devil at the Crossroads,MOVIE,Robert Johnson was one of the most influential...,2019,,48,"['documentation', 'music']",['US'],,tt9046574,7.0,2691.0,7.171,6.9,Keith Richards
1812,tm266666,Gerald's Game,MOVIE,"When her husband's sex game goes wrong, Jessie...",2017,,103,"['horror', 'thriller', 'drama']",['US'],,tt3748172,6.5,104894.0,19.167,6.4,Carla Gugino
783,tm177958,Pyaar Ka Punchnama,MOVIE,Nishant starts dating Charu while his roommate...,2011,PG-13,149,"['romance', 'comedy', 'drama', 'european']",['IN'],,tt1926313,7.7,21204.0,5.152,7.0,Esen Işık
4637,tm1022103,Outlaws,MOVIE,Introverted Girona student Nacho meets two del...,2021,R,125,"['crime', 'thriller', 'action']",['ES'],,tt11892272,6.9,3722.0,96.602,7.0,Marcos Ruiz


In [11]:
# Report the size of the merged frame
merged_rows, merged_cols = final_df.shape
print(f"The final merged data has {merged_rows} rows and {merged_cols} columns")

The final merged data has 5434 rows and 16 columns


In [12]:
# Show each column's dtype and non-null count to see where values are missing
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5434 entries, 0 to 5433
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5434 non-null   object 
 1   title                 5433 non-null   object 
 2   type                  5434 non-null   object 
 3   description           5424 non-null   object 
 4   release_year          5434 non-null   int64  
 5   age_certification     2951 non-null   object 
 6   runtime               5434 non-null   int64  
 7   genres                5434 non-null   object 
 8   production_countries  5434 non-null   object 
 9   seasons               1776 non-null   float64
 10  imdb_id               5024 non-null   object 
 11  imdb_score            4963 non-null   float64
 12  imdb_votes            4949 non-null   float64
 13  tmdb_popularity       5432 non-null   float64
 14  tmdb_score            5259 non-null   float64
 15  name                 

##changing series to dataframe and saving in folder

# Count how often each genre / production country occurs and save the tallies.
# The raw columns hold stringified Python lists (e.g. "['comedy', 'drama']"), so
# each cell is parsed with ast.literal_eval, which only accepts literals and,
# unlike eval, cannot execute code embedded in the CSV text.
# The count column is named 'counts' explicitly and the index name is cleared,
# so the CSV header does not depend on how the installed pandas version labels
# the result of value_counts().
gen_series = final_df['genres'].apply(ast.literal_eval).explode().value_counts()
gen_series.rename('counts').rename_axis(None).to_frame().to_csv('gen.csv')

pdt_series = final_df['production_countries'].apply(ast.literal_eval).explode().value_counts()
pdt_series.rename('counts').rename_axis(None).to_frame().to_csv('pdt.csv')


In [13]:
# Checking for nulls: build a summary with one row per column that has at least
# one missing value, giving the null count and its share of all rows (rounded
# to a whole percent).
# Computed in one vectorised pass so that null_df always exists (it is simply
# empty when nothing is missing) and no builtin name such as `dict` is rebound.
null_counts = final_df.isnull().sum()
null_counts = null_counts[null_counts > 0]

null_df = pd.DataFrame({
    'column': null_counts.index,
    'null_count': null_counts.to_numpy(),
    'null_percent(%)': (null_counts * 100 / len(final_df)).round().astype(int).to_numpy(),
})

null_df

Unnamed: 0,column,null_count,null_percent(%)
0,title,1,0
1,description,10,0
2,age_certification,2483,46
3,seasons,3658,67
4,imdb_id,410,8
5,imdb_score,471,9
6,imdb_votes,485,9
7,tmdb_popularity,2,0
8,tmdb_score,175,3


In this section, we will be reviewing the columns with missing values with the end goal being to remove nulls from the dataset


In [14]:
# How many of the merged rows are movies
len(final_df.query("type == 'MOVIE'"))

3658

In [15]:
# Source of the nulls in `seasons`: percentage of movie rows with no season count
movie_rows = final_df.query("type == 'MOVIE'")
movie_rows['seasons'].isnull().sum() * 100 / movie_rows.shape[0]

100.0

Every movie in the dataset has a null in the seasons column, which is expected because movies are not split into seasons.

We will be filling missing values with 0 instead of dropping them.

In [16]:
# A missing season count is recorded as 0 rather than dropping the row
seasons_missing = final_df['seasons'].isna()
final_df['seasons'] = final_df['seasons'].mask(seasons_missing, 0)

In [17]:
# Titles with no age certification are labelled "NR" (Not Rated)
certification_missing = final_df['age_certification'].isna()
final_df['age_certification'] = final_df['age_certification'].mask(certification_missing, "NR")

In [18]:
# Remove the IMDb identifier column (no later cell shown here reads it).
# Non-inplace with errors='ignore' so that re-running this cell is a no-op
# instead of raising KeyError once the column is already gone.
final_df = final_df.drop(columns=['imdb_id'], errors='ignore')

Filling missing values in the numerical columns with the column mean (rounded to 2 decimal places)

In [19]:
# Replace missing scores/votes/popularity with that column's mean, rounded to 2 d.p.
cols = ['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']
fill_values = {name: final_df[name].mean().round(2) for name in cols}
final_df[cols] = final_df[cols].fillna(value=fill_values)

Filling the remaining missing values with empty string

In [20]:
# Whatever is still missing after the steps above becomes an empty string
final_df = final_df.fillna(value='')

In [21]:
# Count rows that are exact duplicates of an earlier row across all columns
final_df.duplicated().sum()

0

In [22]:
# Report the size of the frame after cleaning
cleaned_rows, cleaned_cols = final_df.shape
print(f"The final cleaned data has {cleaned_rows} rows and {cleaned_cols} columns")

The final cleaned data has 5434 rows and 15 columns


In [23]:
# cleaning the genres and production countries columns
def clean(col, df=None):
    """Flatten a stringified-list column into plain space-separated text, in place.

    Every value is converted to ``str`` and all square brackets, single quotes
    and commas are removed, then surrounding whitespace is stripped, e.g.
    ``"['comedy', 'family']"`` becomes ``"comedy family"``. An empty list
    (``"[]"``) becomes an empty string instead of leaving the brackets in the
    data. Values that are already clean are left unchanged, so the function
    can safely be applied more than once.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    df : pandas.DataFrame, optional
        Frame whose column is overwritten. Defaults to the notebook-level
        ``final_df`` when omitted.

    Returns
    -------
    None
        The column is replaced on the target frame.
    """
    target = final_df if df is None else df
    target[col] = (
        target[col]
        .astype(str)
        .str.replace(r"[\[\]',]", "", regex=True)
        .str.strip()
    )

In [24]:
# Flatten both list-like text columns
for list_column in ('genres', 'production_countries'):
    clean(list_column)

In [25]:
# Spot-check five random rows after cleaning
final_df.sample(n=5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,name
2739,tm285712,Jen Kirkman: Just Keep Livin'?,MOVIE,Incisive comic Jen Kirkman gets real about wom...,2017,NR,69,comedy,US,0.0,6.7,1281.0,3.985,6.6,Jen Kirkman
930,tm165521,Redemption,MOVIE,Homeless and on the run from a military court ...,2013,R,100,thriller action european,US GB,0.0,6.53,24675.55,59.493,6.2,Jason Statham
307,tm94606,Let Me In,MOVIE,A bullied young boy befriends a young female v...,2010,R,116,horror drama fantasy thriller european,GB US,0.0,7.1,120208.0,24.49,6.8,Kodi Smit-McPhee
3382,tm948128,Audrey,MOVIE,"An intimate look at Audrey Hepburn's life, wit...",2020,PG,100,documentation history,CH US,0.0,7.1,1867.0,6.316,7.1,Audrey Hepburn
3728,tm502151,A Remarkable Tale,MOVIE,A town at the edge of the missing take despera...,2019,PG,93,comedy drama european,ES,0.0,5.3,1159.0,5.253,5.7,Carmen Machi


In [26]:
# Summary statistics for the numeric columns of the cleaned frame
final_df.describe()

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
count,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0,5434.0
mean,2015.929886,79.869157,0.730217,6.528051,24675.55,23.344594,6.792427
std,7.410901,38.979547,1.894051,1.101597,85309.78,70.400573,1.119633
min,1953.0,0.0,0.0,1.5,5.0,0.6,1.0
25%,2015.0,46.0,0.0,5.9,716.0,3.33125,6.1
50%,2018.0,87.0,0.0,6.53,3320.5,7.861,6.8
75%,2020.0,106.0,1.0,7.3,21508.5,18.57125,7.5
max,2022.0,251.0,42.0,9.5,2268288.0,1823.374,10.0


In [27]:
# Work on an independent copy so preparing the recommender system leaves final_df untouched
model_df = final_df.copy(deep=True)


In [28]:
# Spot-check five random rows of the modelling copy
model_df.sample(n=5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,name
3692,ts250800,Kissing Game,SHOW,"At a high school in a rural, isolated ranching...",2020,NR,40,drama,BR,1.0,6.53,24675.55,45.216,7.1,Iza Moreira
3721,tm319139,Saaho,MOVIE,A battle for power ensues as warring gangters ...,2019,R,171,thriller action crime drama romance,IN,0.0,5.5,18554.0,5.566,6.2,Prabhas
1212,ts42687,PJ Masks,SHOW,"Connor, Greg and Amaya are normal kids by day,...",2015,TV-G,15,animation family fantasy scifi action european,GB,5.0,5.4,1214.0,54.603,4.9,Jacob Ewaniuk
2880,tm445258,Marriage Palace,MOVIE,Marriage Palace is a comedy movie which takes ...,2018,NR,102,comedy,XX,0.0,6.2,180.0,1.4,7.0,Sharry Maan
2923,tm450512,Pinky Memsaab,MOVIE,The lives of a gullible maid; a beautiful soci...,2018,NR,124,drama family,PK,0.0,6.3,521.0,1.235,9.0,Hajra Yamin


In [29]:
# Apply title case to every title.
# NOTE(review): str.title() also capitalises the letter after an apostrophe
# (e.g. "Python'S"); left as-is in case later title lookups rely on it - confirm.
model_df['title'] = model_df['title'].map(str.title)

In [30]:
# Build one text field per title by joining the descriptive columns with single spaces
tag_parts = ['description', 'production_countries', 'type', 'age_certification', 'name']

model_df['tags'] = model_df['genres'].str.cat(
    [model_df[part] for part in tag_parts],
    sep=" ",
)

In [1]:
# NOTE(review): the execution counter restarts at 1 here, so this cell appears to
# come from a fresh kernel/session; pandas is therefore imported again.
import pandas as pd
# Load the saved dataset and order the rows alphabetically by title.
# NOTE(review): no cell shown above writes 'final_data.csv' - confirm where the
# file is produced (presumably from model_df) before relying on Restart & Run All.
df=pd.read_csv('final_data.csv').sort_values(by=['title'])

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,name,tags
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,crime drama,US,0.0,8.30,795222.00,27.612,8.20,Robert De Niro,crime drama A mentally unstable Vietnam War ve...
1,tm127384,Monty Python And The Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,comedy fantasy,GB,0.0,8.20,530877.00,18.216,7.80,Graham Chapman,"comedy fantasy King Arthur, accompanied by his..."
2,tm70993,Life Of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,comedy,GB,0.0,8.00,392419.00,17.505,7.80,Graham Chapman,comedy Brian Cohen is an average young Jewish ...
3,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,horror,US,0.0,8.10,391942.00,95.337,7.70,Ellen Burstyn,horror 12-year-old Regan MacNeil begins to ada...
4,ts22164,Monty Python'S Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,comedy european,GB,4.0,8.80,72895.00,12.919,8.30,Graham Chapman,comedy european A British sketch comedy series...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5429,tm1040816,Momshies! Your Soul Is Mine,MOVIE,Three women with totally different lives accid...,2021,NR,108,comedy,PH,0.0,5.80,26.00,4.112,6.79,Jolina Magdangal,comedy Three women with totally different live...
5430,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,NR,100,romance drama,NG,0.0,6.90,39.00,0.966,6.79,Richard Mofe-Damijo,romance drama A beautiful love story that can ...
5431,tm1108171,Edis Starlight,MOVIE,Rising star Edis's career journey with ups and...,2021,NR,74,music documentation,[],0.0,6.53,24675.55,1.036,8.50,Edis Görgülü,music documentation Rising star Edis's career ...
5432,tm1045018,Clash,MOVIE,A man from Nigeria returns to his family in Ca...,2021,NR,88,family drama,NG CA,0.0,6.50,32.00,0.709,6.79,Omoni Oboli,family drama A man from Nigeria returns to his...
