In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [25]:
df = pd.read_csv('/content/IMDbMovies-Clean.csv')
df.head()

Unnamed: 0,Title,Summary,Director,Writer,Main Genres,Motion Picture Rating,Release Year,Runtime (Minutes),Rating (Out of 10),Number of Ratings (in thousands),Budget (in millions),Gross in US & Canada (in millions),Gross worldwide (in millions),Opening Weekend in US & Canada,Gross Opening Weekend (in millions)
0,Napoleon,An epic that details the checkered rise and fa...,Ridley Scott,David Scarpa,"Action,Adventure,Biography",R,2023.0,158.0,6.7,38.0,,37.514,84.968,11.26.2023,20.639
1,The Hunger Games: The Ballad of Songbirds & Sn...,Coriolanus Snow mentors and develops feelings ...,Francis Lawrence,"Michael Lesslie,Michael Arndt,Suzanne Collins","Action,Adventure,Drama",PG-13,2023.0,157.0,7.2,37.0,100.0,105.043,191.729,11.19.2023,44.607
2,The Killer,"After a fateful near-miss, an assassin battles...",David Fincher,"Andrew Kevin Walker,Luc Jacamon,Alexis Nolent","Action,Adventure,Crime",R,2023.0,118.0,6.8,117.0,,,0.421,,
3,Leo,A 74-year-old lizard named Leo and his turtle ...,"David Wachtenheim,Robert Smigel,Robert Marianetti","Paul Sado,Robert Smigel,Adam Sandler","Animation,Comedy,Family",PG,2023.0,102.0,7.0,10.0,,,,,
4,Thanksgiving,"After a Black Friday riot ends in tragedy, a m...",Eli Roth,"Eli Roth,Jeff Rendell","Horror,Mystery,Thriller",R,2023.0,106.0,7.0,9.1,,25.409,29.667,11.19.2023,10.306


In [26]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9083 entries, 0 to 9082
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Title                                9083 non-null   object 
 1   Summary                              9083 non-null   object 
 2   Director                             9052 non-null   object 
 3   Writer                               8759 non-null   object 
 4   Main Genres                          9076 non-null   object 
 5   Motion Picture Rating                8285 non-null   object 
 6   Release Year                         9076 non-null   float64
 7   Runtime (Minutes)                    8918 non-null   float64
 8   Rating (Out of 10)                   8813 non-null   float64
 9   Number of Ratings (in thousands)     8813 non-null   float64
 10  Budget (in millions)                 5879 non-null   float64
 11  Gross in US & Canada (in milli

(9083, 15)

we will now drop the columns after main genre as all those columns are not relevant for our project.

In [27]:

columns_to_drop = ['Motion Picture Rating', 'Release Year', 'Runtime (Minutes)', 'Rating (Out of 10)', 'Number of Ratings (in thousands)','Budget (in millions)','Gross in US & Canada (in millions)', 'Gross worldwide (in millions)', 'Opening Weekend in US & Canada','Gross Opening Weekend (in millions)']
df = df.drop(columns=columns_to_drop, axis=1)

print("Remaining columns:\n", df.columns)


Remaining columns:
 Index(['Title', 'Summary', 'Director', 'Writer', 'Main Genres'], dtype='object')


data has a lot of null values, so depending on the column type, we need to apply data imputation to fill them, since we can't drop all of them.

Data imputation plan :

* We can replace the string columns by an empty string, so that when we concatenate multiple columns it gets ingnored and we don't have to drop the whole row.

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9083 entries, 0 to 9082
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        9083 non-null   object
 1   Summary      9083 non-null   object
 2   Director     9052 non-null   object
 3   Writer       8759 non-null   object
 4   Main Genres  9076 non-null   object
dtypes: object(5)
memory usage: 354.9+ KB


In [29]:
print(df.isnull().sum())


Title            0
Summary          0
Director        31
Writer         324
Main Genres      7
dtype: int64


In [30]:
df['Writer'] = df['Writer'].fillna('')
print(df['Writer'].isnull().sum())

df['Director'] = df['Director'].fillna('')
print(df['Director'].isnull().sum())


df['Main Genres'] = df['Main Genres'].fillna('')
print(df['Main Genres'].isnull().sum())

print(df.isnull().sum())


0
0
0
Title          0
Summary        0
Director       0
Writer         0
Main Genres    0
dtype: int64


Data is processed and all the null values are removed from the dataset.

We will now perform text-vectorization using NLP techniques such as BOW or TF-IDF.

### Cleaned Data

In [31]:
data = df.copy()

In [32]:
data['metadata'] = data['Summary']+ " " + data['Writer'] + " " + data['Director'] + " " + data['Main Genres']

In [33]:
data.drop(['Summary', "Writer", "Director", "Main Genres"], axis=1, inplace=True)

In [34]:
data.loc[0].metadata

'An epic that details the checkered rise and fall of French Emperor Napoleon Bonaparte and his relentless journey to power through the prism of his addictive, volatile relationship with his wife, Josephine. David Scarpa Ridley Scott Action,Adventure,Biography'

In [35]:
data.head()

Unnamed: 0,Title,metadata
0,Napoleon,An epic that details the checkered rise and fa...
1,The Hunger Games: The Ballad of Songbirds & Sn...,Coriolanus Snow mentors and develops feelings ...
2,The Killer,"After a fateful near-miss, an assassin battles..."
3,Leo,A 74-year-old lizard named Leo and his turtle ...
4,Thanksgiving,"After a Black Friday riot ends in tragedy, a m..."


### Data cleaning, NLP pipeline (lemmatization, tokenization)

In [36]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text_with_regex(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])

    return text


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
data['metadata'] = data['metadata'].apply(preprocess_text_with_regex)

In [38]:
data['Title'] = data['Title'].apply(lambda x: x.lower())

In [39]:
data.head()

Unnamed: 0,Title,metadata
0,napoleon,epic detail checkered rise fall french emperor...
1,the hunger games: the ballad of songbirds & sn...,coriolanus snow mentor develops feeling female...
2,the killer,fateful nearmiss assassin battle employer inte...
3,leo,yearold lizard named leo turtle friend decide ...
4,thanksgiving,black friday riot end tragedy mysterious thank...


In [40]:
data.to_csv('final_data.csv')