In [1]:
import pandas as pd
import re

In [2]:
# Read the raw IMDB movies CSV into a DataFrame.
movies_df = pd.read_csv(filepath_or_buffer="/content/imdb_movies.csv")

In [3]:
# Inspecting the Dataset: preview the first five rows of the raw frame.
movies_df.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [4]:
# Dimensions of the frame as (rows, columns).
movies_df.shape

(10178, 12)

In [5]:
# Per-column dtype and non-null count summary.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10178 entries, 0 to 10177
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   names       10178 non-null  object 
 1   date_x      10178 non-null  object 
 2   score       10178 non-null  float64
 3   genre       10093 non-null  object 
 4   overview    10178 non-null  object 
 5   crew        10122 non-null  object 
 6   orig_title  10178 non-null  object 
 7   status      10178 non-null  object 
 8   orig_lang   10178 non-null  object 
 9   budget_x    10178 non-null  float64
 10  revenue     10178 non-null  float64
 11  country     10178 non-null  object 
dtypes: float64(3), object(9)
memory usage: 954.3+ KB


In [6]:
# List the original column labels (before they are renamed).
movies_df.columns

Index(['names', 'date_x', 'score', 'genre', 'overview', 'crew', 'orig_title',
       'status', 'orig_lang', 'budget_x', 'revenue', 'country'],
      dtype='object')

In [7]:
# Give every column a descriptive, capitalised label.
# Keyword names are the original labels; values are the new labels.
movies_df = movies_df.rename(
    columns=dict(
        names="Movies_name",
        date_x="Release_Date",
        score="User_Rating",
        genre="Genre",
        overview="Overview",
        crew="Crew",
        orig_title="Original_Title",
        status="Status",
        orig_lang="Original_Language",
        budget_x="Budget",
        revenue="Revenue",
        country="Country_Code",
    )
)

In [8]:
# List the column labels of the table to confirm the rename took effect.
movies_df.columns

Index(['Movies_name', 'Release_Date', 'User_Rating', 'Genre', 'Overview',
       'Crew', 'Original_Title', 'Status', 'Original_Language', 'Budget',
       'Revenue', 'Country_Code'],
      dtype='object')

In [9]:
# Count the missing values in each column.
movies_df.isna().sum()

Unnamed: 0,0
Movies_name,0
Release_Date,0
User_Rating,0
Genre,85
Overview,0
Crew,56
Original_Title,0
Status,0
Original_Language,0
Budget,0


In [10]:
# Cleaning Genre Column: list the distinct raw values to spot formatting problems.
movies_df['Genre'].unique()

array(['Drama,\xa0Action', 'Science Fiction,\xa0Adventure,\xa0Action',
       'Animation,\xa0Adventure,\xa0Family,\xa0Fantasy,\xa0Comedy', ...,
       'Adventure,\xa0Comedy,\xa0Family,\xa0Science Fiction,\xa0Action',
       'Mystery,\xa0Thriller,\xa0Comedy',
       'Action,\xa0Adventure,\xa0Science Fiction,\xa0Thriller,\xa0Horror'],
      dtype=object)

In [11]:
# Remove every non-breaking space (\xa0) from the Genre strings.
# regex=False spells out that the pattern is a literal character, not a regex.
movies_df["Genre"] = movies_df["Genre"].str.replace("\xa0", "", regex=False)

In [12]:
# Re-check the distinct Genre values after removing the \xa0 characters.
movies_df['Genre'].unique()

array(['Drama,Action', 'Science Fiction,Adventure,Action',
       'Animation,Adventure,Family,Fantasy,Comedy', ...,
       'Adventure,Comedy,Family,Science Fiction,Action',
       'Mystery,Thriller,Comedy',
       'Action,Adventure,Science Fiction,Thriller,Horror'], dtype=object)

In [13]:
# Show the rows whose "Genre" value is missing (boolean-mask selection).
movies_df[movies_df["Genre"].isna()]

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Overview,Crew,Original_Title,Status,Original_Language,Budget,Revenue,Country_Code
305,Housewife Sex Slaves: Hatano Yui,01/09/2015,0.0,,We don't have an overview translated in Englis...,"Yui Hatano,",人妻性奴隷 波多野結衣,Released,Japanese,167540000.0,1.752700e+08,JP
1174,Beauty Rope Cosmetology,12/02/1983,10.0,,Miki is the daughter of an affluent family. Sh...,"Miki Takakura, Miki, Maya Ito, Rena, Ren Osugi...",団鬼六　美女縄化粧,Released,Japanese,201940000.0,3.815731e+07,JP
1561,Reclaim,07/29/2022,20.0,,She is a good woman living a fulfilling life. ...,,Reclaim,Released,Chinese,12001040.0,3.813901e+07,US
1762,Ancient Chinese Whorehouse,09/15/1994,50.0,,Madam Five and carpenter Kong work together ma...,"Kent Cheng, Kong, Yvonne Yung Hung, Miss Ng, S...",青樓十二房,Released,Cantonese,163600000.0,8.126672e+08,HK
1776,Porno document: Toruko tokkyû bin,02/26/1982,100.0,,Pinku from 1982.,"Jun Miho, , Rumi Kagawa, , Miyuki Oka, , Kayok...",ポルノドキュメント　トルコ特急便,Released,Japanese,201000000.0,1.569324e+09,JP
...,...,...,...,...,...,...,...,...,...,...,...,...
9626,Euphoria,11/29/2022,0.0,,Artist and filmmaker Julian Rosefeldt creates ...,"Giancarlo Esposito, Taxi Driver, Virginia Newc...",Euphoria,Released,English,167540000.0,1.752700e+08,US
9733,Fanatic,04/06/2023,0.0,,Charlie and Gerald reunite their failed 2000's...,"Matt Cook, , Caroline Rhea, , Shannon Dang, , ...",Fanatic,Released,English,167540000.0,1.752700e+08,US
10011,Perfumed Garden,06/03/2000,53.0,,"Imagine a world of pleasure, where passion is ...","Ivan Baccarat, Michael, Amy Lindsay, Lisa, Raj...",Perfumed Garden,Released,English,159000000.0,2.792784e+08,ID
10025,The Girl and the Wooden Horse Torture,12/03/1982,50.0,,Nami is a masochistic high school student who ...,"Serina Nishikawa, Nami Tsuchiya, Waka Oda, , A...",団鬼六　少女木馬責め,Released,Japanese,163600000.0,8.126672e+08,JP


In [14]:
# Put the placeholder "Not Available" into every row where "Genre" is missing.
movies_df.loc[movies_df["Genre"].isna(), "Genre"] = "Not Available"

In [15]:
# Recount missing values per column after filling "Genre".
movies_df.isna().sum()

Unnamed: 0,0
Movies_name,0
Release_Date,0
User_Rating,0
Genre,0
Overview,0
Crew,56
Original_Title,0
Status,0
Original_Language,0
Budget,0


In [16]:
# Inspect the dtype of each column.
movies_df.dtypes

Unnamed: 0,0
Movies_name,object
Release_Date,object
User_Rating,float64
Genre,object
Overview,object
Crew,object
Original_Title,object
Status,object
Original_Language,object
Budget,float64


In [17]:
# Removing Irrelevant Columns
# Drop "Overview" and "Original_Title", which are treated as irrelevant here.
# Reassigning with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable: a second run no longer raises KeyError for columns already gone.
movies_df = movies_df.drop(columns=["Overview", "Original_Title"], errors="ignore")

In [18]:
# Preview the first rows after dropping the irrelevant columns.
movies_df.head()

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Crew,Status,Original_Language,Budget,Revenue,Country_Code
0,Creed III,03/02/2023,73.0,"Drama,Action","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction,Adventure,Action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation,Adventure,Family,Fantasy,Comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation,Comedy,Family,Adventure,Fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor...",Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Released,English,77000000.0,340942000.0,US


In [19]:
# Clean the column "Crew"
# Start by listing the distinct values of "Crew" to see their format.
movies_df["Crew"].unique()

array(["Michael B. Jordan, Adonis Creed, Tessa Thompson, Bianca Taylor, Jonathan Majors, Damien Anderson, Wood Harris, Tony 'Little Duke' Evers, Phylicia Rashād, Mary Anne Creed, Mila Davis-Kent, Amara Creed, Florian Munteanu, Viktor Drago, José Benavidez Jr., Felix Chavez, Selenis Leyva, Laura Chavez",
       "Sam Worthington, Jake Sully, Zoe Saldaña, Neytiri, Sigourney Weaver, Kiri / Dr. Grace Augustine, Stephen Lang, Colonel Miles Quaritch, Kate Winslet, Ronal, Cliff Curtis, Tonowari, Joel David Moore, Norm Spellman, CCH Pounder, Mo'at, Edie Falco, General Frances Ardmore",
       'Chris Pratt, Mario (voice), Anya Taylor-Joy, Princess Peach (voice), Charlie Day, Luigi (voice), Jack Black, Bowser (voice), Keegan-Michael Key, Toad (voice), Seth Rogen, Donkey Kong (voice), Fred Armisen, Cranky Kong (voice), Kevin Michael Richardson, Kamek (voice), Sebastian Maniscalco, Spike (voice)',
       ...,
       "Sean Connery, Bartholomew 'Barley' Scott Blair, Michelle Pfeiffer, Katya Orlova, R

In [20]:
# How many rows have no "Crew" value?
movies_df["Crew"].isna().sum()

np.int64(56)

In [21]:
# Replace missing "Crew" entries with an empty string; other columns are untouched
# because the fill value is given per column.
movies_df = movies_df.fillna({"Crew": ""})

In [22]:
# Confirm that no missing "Crew" values remain.
movies_df["Crew"].isna().sum()

np.int64(0)

In [23]:
# Preview the first rows after filling the missing "Crew" values.
movies_df.head()

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Crew,Status,Original_Language,Budget,Revenue,Country_Code
0,Creed III,03/02/2023,73.0,"Drama,Action","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction,Adventure,Action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation,Adventure,Family,Fantasy,Comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation,Comedy,Family,Adventure,Fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor...",Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Released,English,77000000.0,340942000.0,US


In [24]:
# Confirm the (rows, columns) shape after the column drop.
movies_df.shape

(10178, 10)

In [25]:
# Final per-column count of missing values.
movies_df.isna().sum()

Unnamed: 0,0
Movies_name,0
Release_Date,0
User_Rating,0
Genre,0
Crew,0
Status,0
Original_Language,0
Budget,0
Revenue,0
Country_Code,0


In [26]:
# Check the dtype of every column before converting any of them.
movies_df.dtypes

Unnamed: 0,0
Movies_name,object
Release_Date,object
User_Rating,float64
Genre,object
Crew,object
Status,object
Original_Language,object
Budget,float64
Revenue,float64
Country_Code,object


In [27]:
# Converting the column "User_Rating" to an integer dtype
# First list the distinct values of "User_Rating" in movies_df so they can be
# inspected before the cast.
movies_df["User_Rating"].unique()

array([ 73.,  78.,  76.,  70.,  61.,  66.,  80.,  83.,  59.,  58.,  55.,
        65.,  72.,  62.,  64.,  57.,  69.,  74.,  53.,  71.,  63.,  60.,
        54.,  52.,  81.,  68.,  79.,  67.,  47.,  82.,  43.,  56.,  75.,
         0.,  77.,  84.,  49.,  46.,  30., 100.,  48.,  86.,  85.,  50.,
        87.,  37.,  34.,  20.,  38.,  35.,  36.,  44.,  51.,  45.,  40.,
        27.,  10.,  22.,  90.,  42.,  25.,  92.,  33.,  28.,  29.,  41.,
        15.,  39.,  32.,  16.,  17.,  23.,  91.,  93.,  95.,  13.,  26.,
        98.,  18.])

In [28]:
# Check the dtype of "User_Rating" before the conversion.
movies_df["User_Rating"].dtype

dtype('float64')

In [29]:
# Cast "User_Rating" to int64; the per-column mapping leaves other columns as they are.
movies_df = movies_df.astype({"User_Rating": "int64"})

In [30]:
# Check the dtype of "User_Rating" again to confirm the conversion.
movies_df["User_Rating"].dtype

dtype('int64')

In [31]:
# Whitespace cleanup for Crew, step 1: cast the column to str so the
# .str accessor can be applied to every cell.
movies_df = movies_df.astype({'Crew': str})

# Show the resulting dtypes.
movies_df.dtypes

Unnamed: 0,0
Movies_name,object
Release_Date,object
User_Rating,int64
Genre,object
Crew,object
Status,object
Original_Language,object
Budget,float64
Revenue,float64
Country_Code,object


In [32]:
# Normalise whitespace in Crew in one pass: trim both ends of each cell, then
# squeeze every run of whitespace characters down to a single space.
movies_df['Crew'] = (
    movies_df['Crew']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

In [33]:
# Preview the first rows after the Crew whitespace cleanup.
movies_df.head()

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Crew,Status,Original_Language,Budget,Revenue,Country_Code
0,Creed III,03/02/2023,73,"Drama,Action","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78,"Science Fiction,Adventure,Action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76,"Animation,Adventure,Family,Fantasy,Comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70,"Animation,Comedy,Family,Adventure,Fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor...",Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61,Action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Released,English,77000000.0,340942000.0,US


In [34]:
# Lowercase every Genre string; assign() replaces the existing column in place
# of position, so the column order is unchanged.
movies_df = movies_df.assign(Genre=movies_df['Genre'].str.lower())

# Show the first rows of the result.
movies_df.head()

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Crew,Status,Original_Language,Budget,Revenue,Country_Code
0,Creed III,03/02/2023,73,"drama,action","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78,"science fiction,adventure,action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76,"animation,adventure,family,fantasy,comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70,"animation,comedy,family,adventure,fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor...",Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61,action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Released,English,77000000.0,340942000.0,US


In [35]:
# Write the cleaned frame to a CSV file, leaving the row index out of the file.
movies_df.to_csv(path_or_buf='IMBD-Movies-Cleaned.csv', index=False)

In [36]:
# Load and read the cleaned data to verify the export.
# - Use the same relative filename the export cell passed to to_csv, so the read
#   resolves against the same working directory as the write instead of a
#   hard-coded /content path.
# - keep_default_na=False: missing Crew values were replaced with "" earlier;
#   pandas' default NA parsing would turn those empty fields back into NaN.
df = pd.read_csv("IMBD-Movies-Cleaned.csv", keep_default_na=False)
df.head()

Unnamed: 0,Movies_name,Release_Date,User_Rating,Genre,Crew,Status,Original_Language,Budget,Revenue,Country_Code
0,Creed III,03/02/2023,73,"drama,action","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78,"science fiction,adventure,action","Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76,"animation,adventure,family,fantasy,comedy","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70,"animation,comedy,family,adventure,fantasy","Óscar Barberán, Thut (voice), Ana Esther Albor...",Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61,action,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Released,English,77000000.0,340942000.0,US
