In [103]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

In [104]:
df = pd.read_csv('IMDB_top250.csv')

In [105]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Genre,Year,Duration,Average_Rating,Num_of_Rating
0,0,1. The Shawshank Redemption,Drama,1994,2h 22m,9.3,(3.1M)
1,1,2. The Godfather,Drama,1972,2h 55m,9.2,(2.2M)
2,2,3. The Dark Knight,Thriller,2008,2h 32m,9.1,(3.1M)
3,3,4. The Godfather Part II,Drama,1974,3h 22m,9.0,(1.4M)
4,4,5. 12 Angry Men,Drama,1957,1h 36m,9.0,(946K)


### Data Cleaning

In [106]:
# Drop the Extra Column

df = df.drop(columns= 'Unnamed: 0')

In [107]:
# Rename column for better understanding

df.rename(columns= {'Num_of_Rating':'Num_of_Rating(in K)'},inplace = True)

In [108]:
df.head()

Unnamed: 0,Name,Genre,Year,Duration,Average_Rating,Num_of_Rating(in K)
0,1. The Shawshank Redemption,Drama,1994,2h 22m,9.3,(3.1M)
1,2. The Godfather,Drama,1972,2h 55m,9.2,(2.2M)
2,3. The Dark Knight,Thriller,2008,2h 32m,9.1,(3.1M)
3,4. The Godfather Part II,Drama,1974,3h 22m,9.0,(1.4M)
4,5. 12 Angry Men,Drama,1957,1h 36m,9.0,(946K)


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 250 non-null    object 
 1   Genre                250 non-null    object 
 2   Year                 250 non-null    int64  
 3   Duration             250 non-null    object 
 4   Average_Rating       250 non-null    float64
 5   Num_of_Rating(in K)  250 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 11.8+ KB


In [110]:
# Cleaning the Name Column
df['Name'] = df['Name'].str.split(' ').str.slice(1).str.join(' ').str.strip()

In [111]:
# Cleaning the Duration Columns
# -> First Replace all Characters which white space
# -> Then convert the hour and minute into decimal format
df['Duration'] = df['Duration'].replace({' ':'.','h':'','m':''},regex=True).apply(lambda x: '0.'+ x if len(x) <= 2 else x).astype('float32')

In [112]:
# Cleaning the Num of Rating Column

df['Num_of_Rating(in K)'] = df['Num_of_Rating(in K)'].str.slice(2,-1).apply(lambda x: float(x[:-1])*1000 if x[-1] == 'M' else x[:-1]).astype('int')

In [113]:
df.head()

Unnamed: 0,Name,Genre,Year,Duration,Average_Rating,Num_of_Rating(in K)
0,The Shawshank Redemption,Drama,1994,2.22,9.3,3100
1,The Godfather,Drama,1972,2.55,9.2,2200
2,The Dark Knight,Thriller,2008,2.32,9.1,3100
3,The Godfather Part II,Drama,1974,3.22,9.0,1400
4,12 Angry Men,Drama,1957,1.36,9.0,946


In [114]:
# Created a copy of Data for Further Analysis

df2 = df.copy()

### Encoding

In [115]:
le = LabelEncoder()

In [116]:
df2[['Name','Genre']] = df2[['Name','Genre']].apply(le.fit_transform)

### Normalization

In [117]:
scaler = MinMaxScaler()

In [118]:
scaled = scaler.fit_transform(df2)

In [119]:
scaled_df = pd.DataFrame(scaled,columns=df2.columns)

In [120]:
scaled_df.head()

Unnamed: 0,Name,Genre,Year,Duration,Average_Rating,Num_of_Rating(in K)
0,0.855422,0.217391,0.708738,0.340067,1.0,1.0
1,0.746988,0.217391,0.495146,0.395623,0.923077,0.707222
2,0.714859,0.913043,0.84466,0.356902,0.846154,1.0
3,0.751004,0.217391,0.514563,0.508418,0.769231,0.446975
4,0.0,0.217391,0.349515,0.195286,0.769231,0.299284


In [121]:
df.to_csv('IMDB_Top250_Cleaned.csv',index=False)