# Data Preparation and Cleaning


In [1]:
# Import libraries: pandas (as pd) and numpy (as np)
import pandas as pd
import numpy as np

import warnings
# No category or module filter is passed, so this silences EVERY warning for the rest of the session.
# NOTE(review): that includes pandas warnings about chained assignment — consider narrowing the filter.
warnings.filterwarnings('ignore') # original intent: hide warnings caused by library version mismatches

In [3]:
# Load the dataset from 'imdb.csv' (a path relative to the current working directory)
IMDB_df = pd.read_csv('imdb.csv')
# Show the loaded frame as the cell output
IMDB_df

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6173,The Human Centipede II (Full Sequence),2011,3.8,37492,Horror,91,Film,Not Rated,-,Severe,Severe,Severe,Mild,Severe
6174,Double Indemnity,1944,8.3,150448,"Crime, Drama, Film-Noir",107,Film,Passed,-,,Mild,,Mild,Mild
6175,Before the Devil Knows You're Dead,2007,7.3,100668,"Crime, Drama, Thriller",117,Film,R,-,Severe,Moderate,Severe,Severe,Severe
6176,Queen Bees,2021,6.0,887,"Comedy, Drama, Romance",100,Film,PG-13,-,,,Mild,Moderate,


In [4]:
# (rows, columns) of the frame as loaded
IMDB_df.shape

(6178, 14)

In [5]:
# Preview the first five rows
IMDB_df.head(n=5)

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate


In [6]:
# Column labels of the frame
IMDB_df.columns

Index(['Name', 'Date', 'Rate', 'Votes', 'Genre', 'Duration', 'Type',
       'Certificate', 'Episodes', 'Nudity', 'Violence', 'Profanity', 'Alcohol',
       'Frightening'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes (info() prints its report instead of returning a value)
IMDB_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6178 entries, 0 to 6177
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         6178 non-null   object
 1   Date         6178 non-null   int64 
 2   Rate         6178 non-null   object
 3   Votes        6178 non-null   object
 4   Genre        6178 non-null   object
 5   Duration     6178 non-null   object
 6   Type         6178 non-null   object
 7   Certificate  6178 non-null   object
 8   Episodes     6178 non-null   object
 9   Nudity       6178 non-null   object
 10  Violence     6178 non-null   object
 11  Profanity    6178 non-null   object
 12  Alcohol      6178 non-null   object
 13  Frightening  6178 non-null   object
dtypes: int64(1), object(13)
memory usage: 675.8+ KB


In [8]:
# Summary statistics for the numeric columns: count, mean, std, min,
# the 25%/50%/75% percentiles and max.
# With no `include` argument, describe() summarises numeric dtypes only.
IMDB_df.describe()

Unnamed: 0,Date
count,6178.0
mean,2005.952574
std,15.910248
min,1922.0
25%,1998.0
50%,2011.0
75%,2019.0
max,2023.0


In [9]:
# Summary of the object (text) columns: count, unique, top and freq.
# `top` is the most common value; `freq` is how many times that value occurs.
IMDB_df.describe(include='object')

Unnamed: 0,Name,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
count,6178,6178.0,6178,6178,6178,6178,6178,6178,6178,6178,6178,6178,6178
unique,4820,77.0,4802,377,203,2,23,284,5,5,5,5,5
top,King Kong,7.3,No Votes,Comedy,60,Film,R,-,Mild,Moderate,Mild,Mild,Moderate
freq,5,273.0,185,268,352,4446,1885,4446,2292,1814,2077,3257,1969


In [10]:
# Boolean mask: True for rows whose Name is exactly 'King Kong'
IMDB_df['Name'] == 'King Kong'

0       False
1       False
2       False
3       False
4       False
        ...  
6173    False
6174    False
6175    False
6176    False
6177    False
Name: Name, Length: 6178, dtype: bool

In [14]:
# Pull out every row titled 'King Kong' to inspect the repeated entries
KingKong_data = IMDB_df[IMDB_df['Name'].eq('King Kong')]
KingKong_data

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
2084,King Kong,2005,7.2,407876,"Action, Adventure, Drama",187,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
4207,King Kong,1976,5.9,31536,"Adventure, Horror",134,Film,PG,-,Mild,Moderate,Mild,Mild,Moderate
4951,King Kong,1933,7.9,82705,"Adventure, Horror, Sci-Fi",100,Film,Passed,-,Mild,Moderate,,Mild,Moderate
5357,King Kong,1976,5.9,31536,"Adventure, Horror",134,Film,PG,-,Mild,Moderate,Mild,Mild,Moderate
6101,King Kong,1933,7.9,82705,"Adventure, Horror, Sci-Fi",100,Film,Passed,-,Mild,Moderate,,Mild,Moderate


In [15]:
# Drop rows that exactly repeat an earlier row: all columns are compared and
# the first occurrence is kept. The surviving rows keep their original index labels.
IMDB_df = IMDB_df.drop_duplicates(subset=None, keep='first')
IMDB_df

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5023,The Human Centipede II (Full Sequence),2011,3.8,37492,Horror,91,Film,Not Rated,-,Severe,Severe,Severe,Mild,Severe
5024,Double Indemnity,1944,8.3,150448,"Crime, Drama, Film-Noir",107,Film,Passed,-,,Mild,,Mild,Mild
5025,Before the Devil Knows You're Dead,2007,7.3,100668,"Crime, Drama, Thriller",117,Film,R,-,Severe,Moderate,Severe,Severe,Severe
5026,Queen Bees,2021,6.0,887,"Comedy, Drama, Romance",100,Film,PG-13,-,,,Mild,Moderate,


In [16]:
# Re-select the 'King Kong' rows from the current frame to confirm the exact repeats are gone
KingKong_data = IMDB_df[IMDB_df['Name'].eq('King Kong')]
KingKong_data

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
2084,King Kong,2005,7.2,407876,"Action, Adventure, Drama",187,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
4207,King Kong,1976,5.9,31536,"Adventure, Horror",134,Film,PG,-,Mild,Moderate,Mild,Mild,Moderate
4951,King Kong,1933,7.9,82705,"Adventure, Horror, Sci-Fi",100,Film,Passed,-,Mild,Moderate,,Mild,Moderate


In [17]:
# Sanity check: list rows still flagged as full-row duplicates (all columns compared)
duplicateRows = IMDB_df.loc[IMDB_df.duplicated(keep='first')]
duplicateRows

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening


In [18]:
# Rows whose Name already appeared in an earlier row; only Name is compared,
# so the other columns of these rows may differ from the first occurrence.
duplicate_Names = IMDB_df.loc[IMDB_df.duplicated(subset='Name', keep='first')]
duplicate_Names

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
62,Dune,1984,6.4,142157,"Action, Adventure, Sci-Fi",137,Film,PG-13,-,Mild,Moderate,,Mild,Moderate
167,Ghosts,2021,8.2,8577,"Comedy, Fantasy",30,Series,TV-14,19,Mild,Mild,Mild,Mild,
171,Halloween,2018,6.5,132934,"Crime, Horror, Thriller",106,Film,R,-,Mild,Severe,Moderate,Mild,Severe
176,Scream,1996,7.3,304350,"Horror, Mystery",111,Film,R,-,Mild,Severe,Moderate,Mild,Severe
213,The Addams Family,1991,6.9,143815,"Comedy, Fantasy",99,Film,PG-13,-,Mild,Moderate,Mild,Mild,Mild
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,Nikita,1990,7.3,69949,"Action, Thriller",117,Film,R,-,Mild,Severe,Moderate,Mild,Moderate
4939,Night of the Living Dead,1990,6.9,40684,Horror,92,Film,R,-,Mild,Severe,Moderate,Mild,Moderate
4951,King Kong,1933,7.9,82705,"Adventure, Horror, Sci-Fi",100,Film,Passed,-,Mild,Moderate,,Mild,Moderate
4963,The Dukes of Hazzard,2005,5.1,77398,Comedy,104,Film,PG-13,-,Moderate,Mild,Moderate,Mild,


In [19]:
# Flag the columns that hold at least one NaN/None entry.
# Only real missing values count here; text placeholders inside object
# columns (e.g. the string 'No Votes') are not detected by this check.
IMDB_df.isnull().any()

Name           False
Date           False
Rate           False
Votes          False
Genre          False
Duration       False
Type           False
Certificate    False
Episodes       False
Nudity         False
Violence       False
Profanity      False
Alcohol        False
Frightening    False
dtype: bool

In [21]:
# Current (rows, columns) of IMDB_df
IMDB_df.shape

(5028, 14)

In [22]:
# Remove every ',' character from the Votes strings so the column can later be
# parsed as a number.
# The whole column is rewritten in one vectorised assignment. This avoids
# chained indexing (IMDB_df['Votes'][i] = ...), which may write to a temporary
# copy, and it does not require the index labels to be exactly 0..n-1.
# regex=False treats ',' as a literal character rather than a pattern.
IMDB_df['Votes'] = IMDB_df['Votes'].str.replace(',', '', regex=False)

IMDB_df['Votes']

0       107163
1        64375
2        27145
3        30443
4        84636
         ...  
5023     37492
5024    150448
5025    100668
5026       887
5027    203578
Name: Votes, Length: 5028, dtype: object

In [23]:
# Parse the three text columns as numbers. errors='coerce' turns any value
# that cannot be parsed into NaN instead of raising; downcast="integer" asks
# pandas for the smallest integer dtype when the parsed values allow it.
for numeric_col in ('Rate', 'Votes', 'Duration'):
    IMDB_df[numeric_col] = pd.to_numeric(IMDB_df[numeric_col], errors='coerce', downcast="integer")

In [25]:
# Check the dtypes and non-null counts after the numeric conversion (info() prints its report)
IMDB_df.info()
# Then show the frame itself as the cell output
IMDB_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5028 entries, 0 to 5027
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         5028 non-null   object 
 1   Date         5028 non-null   int64  
 2   Rate         4875 non-null   float64
 3   Votes        4875 non-null   float64
 4   Genre        5028 non-null   object 
 5   Duration     4780 non-null   float64
 6   Type         5028 non-null   object 
 7   Certificate  5028 non-null   object 
 8   Episodes     5028 non-null   object 
 9   Nudity       5028 non-null   object 
 10  Violence     5028 non-null   object 
 11  Profanity    5028 non-null   object 
 12  Alcohol      5028 non-null   object 
 13  Frightening  5028 non-null   object 
dtypes: float64(3), int64(1), object(10)
memory usage: 718.3+ KB


Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163.0,"Action, Adventure, Thriller",163.0,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375.0,"Crime, Drama, Thriller",90.0,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145.0,"Crime, Drama",120.0,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443.0,"Action, Adventure, Sci-Fi",97.0,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636.0,"Action, Adventure, Drama",155.0,Film,PG-13,-,,Moderate,,Mild,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5023,The Human Centipede II (Full Sequence),2011,3.8,37492.0,Horror,91.0,Film,Not Rated,-,Severe,Severe,Severe,Mild,Severe
5024,Double Indemnity,1944,8.3,150448.0,"Crime, Drama, Film-Noir",107.0,Film,Passed,-,,Mild,,Mild,Mild
5025,Before the Devil Knows You're Dead,2007,7.3,100668.0,"Crime, Drama, Thriller",117.0,Film,R,-,Severe,Moderate,Severe,Severe,Severe
5026,Queen Bees,2021,6.0,887.0,"Comedy, Drama, Romance",100.0,Film,PG-13,-,,,Mild,Moderate,


In [26]:
# Count the NaN entries in each column
IMDB_df.isna().sum()

Name             0
Date             0
Rate           153
Votes          153
Genre            0
Duration       248
Type             0
Certificate      0
Episodes         0
Nudity           0
Violence         0
Profanity        0
Alcohol          0
Frightening      0
dtype: int64

In [27]:
# Replace the NaN entries in the numeric columns with 0.
# NOTE(review): these zeros are ordinary values from here on, so later summary
# statistics (mean, min, quartiles) include them — confirm this is intended.
for numeric_col in ('Rate', 'Votes', 'Duration'):
    IMDB_df[numeric_col] = IMDB_df[numeric_col].fillna(0)

In [28]:
# Re-count the missing values per column
IMDB_df.isna().sum()

Name           0
Date           0
Rate           0
Votes          0
Genre          0
Duration       0
Type           0
Certificate    0
Episodes       0
Nudity         0
Violence       0
Profanity      0
Alcohol        0
Frightening    0
dtype: int64

In [29]:
# Numeric summary of the current frame; Rate, Votes and Duration are numeric by now.
# NOTE(review): entries that were filled with 0 are part of these statistics.
IMDB_df.describe()

Unnamed: 0,Date,Rate,Votes,Duration
count,5028.0,5028.0,5028.0,5028.0
mean,2006.159905,6.718894,132737.3,90.015115
std,15.789378,1.590262,206885.0,45.909037
min,1922.0,0.0,0.0,0.0
25%,1998.0,6.2,13352.75,60.0
50%,2011.0,7.0,60114.0,98.0
75%,2019.0,7.7,166921.0,116.0
max,2023.0,9.7,2474122.0,900.0


In [30]:
# Summary (count, unique, top, freq) of the columns that are still object-typed
IMDB_df.describe(include = 'object')

Unnamed: 0,Name,Genre,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
count,5028,5028,5028,5028,5028,5028,5028,5028,5028,5028
unique,4820,377,2,23,284,5,5,5,5,5
top,The Equalizer,Comedy,Film,R,-,Mild,Moderate,Mild,Mild,Moderate
freq,3,210,3586,1545,3586,1909,1509,1720,2694,1635
