Importing Necessary Dependencies

---

In [1]:
import os
# Switch the working directory so that the project-relative paths used below
# ("./data/...") resolve; the `utils` import below is placed after this call,
# presumably because it relies on the new working directory.
# NOTE(review): the path is relative, so re-running this cell in the same
# kernel climbs another two levels. Restart the kernel before re-running.
os.chdir("../..")  # go up two levels to the project root (presumably from notebooks/<subdir>/)

import pandas as pd  # noqa: E402
# NOTE(review): the module name is spelled "langauge_code" here; it presumably
# matches the file name on disk, so verify before "correcting" the spelling.
from utils.langauge_code import get_language_name  # noqa: E402

In [2]:
# Destination CSV for the preprocessed dataset; written by the final cell of
# this notebook. Relative to the working directory set in the first cell.
SAVE_DATA_PATH = "./data/processed/preprocessed_tmdb_data.csv"

Loading our data

---

In [3]:
# Directory holding the raw inputs (note the trailing slash) and the TMDB
# export that lives inside it.
BASE_DATA_PATH = "./data/raw/"
TMDB_DATA_PATH = f"{BASE_DATA_PATH}tmdb-data.csv"

In [4]:
# Read the raw TMDB export into a DataFrame and preview its first five rows.
data = pd.read_csv(TMDB_DATA_PATH)
print(data.head(n=5))

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

    original_title                                           overview  \
0        Inception 

Analysis

---

In [5]:
# Show one randomly chosen record vertically (one field per line) so every
# column is visible. No seed is set, so the row differs between runs.
data.sample(n=1).transpose()

Unnamed: 0,150444
id,926245
title,Full dekning
vote_average,7.667
vote_count,3
status,Released
release_date,2022-02-25
revenue,0
runtime,102
adult,False
backdrop_path,/Ak8IcmAXdMVmWTTw7tFN2adOvr1.jpg


In [6]:
# Report the dimensions of the raw dataset.
rows, cols = data.shape
print(f"Number of rows: {rows}\nNumber of columns: {cols}")

Number of rows: 1269226
Number of columns: 24


In [7]:
# Summarize each column's dtype and non-null count.
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 1269226 entries, 0 to 1269225
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1269226 non-null  int64  
 1   title                 1269213 non-null  str    
 2   vote_average          1269226 non-null  float64
 3   vote_count            1269226 non-null  int64  
 4   status                1269226 non-null  str    
 5   release_date          1023988 non-null  str    
 6   revenue               1269226 non-null  int64  
 7   runtime               1269226 non-null  int64  
 8   adult                 1269226 non-null  bool   
 9   backdrop_path         326879 non-null   str    
 10  budget                1269226 non-null  int64  
 11  homepage              132450 non-null   str    
 12  imdb_id               634125 non-null   str    
 13  original_language     1269226 non-null  str    
 14  original_title        1269213 non-null  str  

In [8]:
# Build the missing-value mask once and derive both statistics from it:
# the absolute number of nulls per column and their share of all rows.
null_mask = data.isnull()
null_count = null_mask.sum()
null_percentage = null_mask.mean() * 100

# One row per column, ordered from most to least incomplete.
null_summary = pd.DataFrame(
    {"Null Count": null_count, "Null Percentage (%)": null_percentage}
)
null_summary = null_summary.sort_values(by="Null Percentage (%)", ascending=False)
null_summary

Unnamed: 0,Null Count,Null Percentage (%)
homepage,1136776,89.564506
tagline,1091671,86.010766
backdrop_path,942347,74.2458
keywords,941129,74.149836
production_companies,712128,56.107265
imdb_id,635101,50.038449
production_countries,590100,46.492902
spoken_languages,566801,44.657216
genres,534685,42.126855
poster_path,425530,33.526732


Feature Selection

---

In [9]:
# List the available column names before selecting which ones to drop.
data.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='str')

In [10]:
# Columns excluded from the processed dataset.
drop_cols = [
    "homepage",
    "backdrop_path",
    "spoken_languages",
    "tagline",
    "genres",
    "production_countries",
]

# Remove them and preview the narrower frame.
# NOTE: re-running this cell after the columns are gone raises a KeyError.
data = data.drop(drop_cols, axis="columns")
print(data.head(n=5))

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult     budget    imdb_id original_language  \
0   825532764      148  False  160000000  tt1375666                en   
1   701729206      169  False  165000000  tt0816692                en   
2  1004558444      152  False  185000000  tt0468569                en   
3  2923706026      162  False  237000000  tt0499549                en   
4  1518815515      143  False  220000000  tt0848228                en   

    original_title                                           overview  \
0        Ince

Data Cleaning

---

In [11]:
# Keep only fully populated rows and renumber the index from zero.
# NOTE: a row is removed if ANY remaining column is null; per the recorded
# output this keeps 141,430 of the original 1,269,226 rows.
data = data.dropna().reset_index(drop=True)

rows, cols = data.shape
print(
    "After dropping missing values:",
    f"Number of rows: {rows}",
    f"Number of columns: {cols}",
    sep="\n",
)

After dropping missing values:
Number of rows: 141430
Number of columns: 18


In [12]:
# Count fully identical rows once, then remove them if any exist.
duplicate_count = data.duplicated().sum()
if duplicate_count != 0:
    print(f"Duplicate rows found : {duplicate_count}")
    # Keep the first occurrence of each row and renumber the index.
    data = data.drop_duplicates().reset_index(drop=True)
    print("Duplicate rows removed.")
else:
    print("No duplicate rows found.")

Duplicate rows found : 4
Duplicate rows removed.


In [13]:
# Report the dimensions of the frame once duplicates are gone.
rows, cols = data.shape
print(
    "After removing duplicates:",
    f"Number of rows: {rows}",
    f"Number of columns: {cols}",
    sep="\n",
)

After removing duplicates:
Number of rows: 141426
Number of columns: 18


Data Transformation

---

In [14]:
# Preview the cleaned frame before deriving new columns from it.
print(data.head())

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult     budget    imdb_id original_language  \
0   825532764      148  False  160000000  tt1375666                en   
1   701729206      169  False  165000000  tt0816692                en   
2  1004558444      152  False  185000000  tt0468569                en   
3  2923706026      162  False  237000000  tt0499549                en   
4  1518815515      143  False  220000000  tt0848228                en   

    original_title                                           overview  \
0        Ince

In [15]:
# Derive the final identifier and language columns in one pass:
#   * movie_id          -> "<tmdb id>_<imdb id>" (the integer id is cast to
#                          str so it can be concatenated)
#   * original_language -> each value passed through get_language_name
# The two source id columns are dropped once they are folded into movie_id.
# NOTE: re-running this cell raises a KeyError because "id" and "imdb_id"
# no longer exist afterwards.
data = data.assign(
    movie_id=data["id"].astype(str) + "_" + data["imdb_id"],
    original_language=data["original_language"].apply(get_language_name),
).drop(columns=["id", "imdb_id"])
print(data.head())

             title  vote_average  vote_count    status release_date  \
0        Inception         8.364       34495  Released   2010-07-15   
1     Interstellar         8.417       32571  Released   2014-11-05   
2  The Dark Knight         8.512       30619  Released   2008-07-16   
3           Avatar         7.573       29815  Released   2009-12-15   
4     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult     budget original_language   original_title  \
0   825532764      148  False  160000000           English        Inception   
1   701729206      169  False  165000000           English     Interstellar   
2  1004558444      152  False  185000000           English  The Dark Knight   
3  2923706026      162  False  237000000           English           Avatar   
4  1518815515      143  False  220000000           English     The Avengers   

                                            overview  popularity  \
0  Cobb, a skilled thief who c

Save the preprocessed data

---

In [16]:
# Persist the preprocessed frame as CSV, without the positional index.
# `to_csv` does not create missing directories, so make sure the target
# folder exists first (e.g. on a fresh checkout where ./data/processed/ has
# not been created yet). The guard skips this when the path has no
# directory component, since os.makedirs("") would raise.
output_dir = os.path.dirname(SAVE_DATA_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
data.to_csv(SAVE_DATA_PATH, index=False)