In [39]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, KBinsDiscretizer
import pandas as pd

In [40]:
# Load the IMDb Top 250 CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a path relative to the project.
csv_path = r"C:\Users\rhyth\Desktop\data_science\data_transformation\data\imdb_top_250.csv"
df = pd.read_csv(csv_path)

In [41]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Rank,Title,Year,Rating,Runtime
0,1,The Shawshank Redemption,1994,9.3,142
1,2,The Godfather,1972,9.2,175
2,3,The Dark Knight,2008,9.0,152
3,4,The Godfather Part II,1974,9.0,202
4,5,12 Angry Men,1957,9.0,96


In [42]:
# Summarize the columns: dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Rank     250 non-null    int64  
 1   Title    250 non-null    object 
 2   Year     250 non-null    int64  
 3   Rating   250 non-null    float64
 4   Runtime  250 non-null    int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 9.9+ KB


In [44]:
# 1️⃣ Data Type Conversion: Ensure correct data types
# Cast each listed column to its intended dtype: Year as integer, Runtime as float.
column_dtypes = {'Year': int, 'Runtime': float}
for column_name, target_dtype in column_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

# Show the resulting dtype of every column.
print("\nAfter Data Type Conversion:", df.dtypes, sep="\n")


After Data Type Conversion:
Rank         int64
Title       object
Year         int64
Rating     float64
Runtime    float64
dtype: object


In [45]:
# Re-check the first rows after the dtype conversion (Runtime is now float).
df.head()

Unnamed: 0,Rank,Title,Year,Rating,Runtime
0,1,The Shawshank Redemption,1994,9.3,142.0
1,2,The Godfather,1972,9.2,175.0
2,3,The Dark Knight,2008,9.0,152.0
3,4,The Godfather Part II,1974,9.0,202.0
4,5,12 Angry Men,1957,9.0,96.0


In [None]:
# Common type conversions (notes only; kept as comments so this cell is valid
# Python and does not raise when the notebook is run top to bottom):
#   int    -> float
#   float  -> int
#   float  -> double
#   double -> float
#   int    -> string
#   string -> int

In [46]:
# String -> int conversion that succeeds: '23' is a valid base-10 integer literal.
a = '23'
b = int(a)

In [47]:
# Display the converted integer value.
b

23

In [48]:
# String -> int conversion that fails: 'sir' is not a numeric literal, so int()
# raises ValueError. The error is the point of this cell, so it is caught and
# displayed instead of aborting a top-to-bottom run of the notebook.
a = 'sir'
try:
    b = int(a)
except ValueError as err:
    # `b` is not reassigned when the conversion fails.
    print(f"{type(err).__name__}: {err}")

ValueError: invalid literal for int() with base 10: 'sir'

In [49]:
# 2️⃣ Normalization of Rating and Runtime
# Min-max scale the two source columns and store the results in new *_Norm
# columns; the source columns themselves are left untouched.
source_columns = ['Rating', 'Runtime']
normalized_columns = ['Rating_Norm', 'Runtime_Norm']

norm_scaler = MinMaxScaler()
norm_scaler.fit(df[source_columns])
df[normalized_columns] = norm_scaler.transform(df[source_columns])

# Print the original values followed by their scaled counterparts.
for heading, columns in (
    ("\nBefore Normalization:", source_columns),
    ("\nAfter Normalization:", normalized_columns),
):
    print(heading)
    print(df[columns])



Before Normalization:
     Rating  Runtime
0       9.3    142.0
1       9.2    175.0
2       9.0    152.0
3       9.0    202.0
4       9.0     96.0
..      ...      ...
245     8.0    142.0
246     8.0     92.0
247     8.0    132.0
248     8.0     89.0
249     8.0    138.0

[250 rows x 2 columns]

After Normalization:
     Rating_Norm  Runtime_Norm
0       1.000000      0.363296
1       0.923077      0.486891
2       0.769231      0.400749
3       0.769231      0.588015
4       0.769231      0.191011
..           ...           ...
245     0.000000      0.363296
246     0.000000      0.176030
247     0.000000      0.325843
248     0.000000      0.164794
249     0.000000      0.348315

[250 rows x 2 columns]


In [50]:
# 3️⃣ Standardization of Rating and Runtime
# Standardize the two source columns with StandardScaler and store the results
# in new *_Std columns.
columns_to_standardize = ['Rating', 'Runtime']
standardized_columns = ['Rating_Std', 'Runtime_Std']

std_scaler = StandardScaler()
std_scaler.fit(df[columns_to_standardize])
df[standardized_columns] = std_scaler.transform(df[columns_to_standardize])

print("\nAfter Standardization:", df[standardized_columns], sep="\n")


After Standardization:
     Rating_Std  Runtime_Std
0      4.099278     0.409137
1      3.707677     1.456415
2      2.924476     0.726494
3      2.924476     2.313278
4      2.924476    -1.050705
..          ...          ...
245   -0.991534     0.409137
246   -0.991534    -1.177648
247   -0.991534     0.091780
248   -0.991534    -1.272855
249   -0.991534     0.282194

[250 rows x 2 columns]


In [51]:

# 4️⃣ Discretization (binning) of Runtime into 3 categories: Short, Medium, Long
# Bin Runtime into three ordinal codes using the 'uniform' strategy; the codes
# are written to a new Runtime_Binned column.
runtime_frame = df[['Runtime']]
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
discretizer.fit(runtime_frame)
df['Runtime_Binned'] = discretizer.transform(runtime_frame)

In [52]:
# Optionally map bins to labels
# Translate the float bin codes in Runtime_Binned into readable category names.
bin_labels = dict(zip((0.0, 1.0, 2.0), ('Short', 'Medium', 'Long')))
runtime_labels = df['Runtime_Binned'].map(bin_labels)
df['Runtime_Binned_Label'] = runtime_labels

print("\nAfter Discretization (Binning Runtime):")
print(df[['Runtime', 'Runtime_Binned', 'Runtime_Binned_Label']])


After Discretization (Binning Runtime):
     Runtime  Runtime_Binned Runtime_Binned_Label
0      142.0             1.0               Medium
1      175.0             1.0               Medium
2      152.0             1.0               Medium
3      202.0             1.0               Medium
4       96.0             0.0                Short
..       ...             ...                  ...
245    142.0             1.0               Medium
246     92.0             0.0                Short
247    132.0             0.0                Short
248     89.0             0.0                Short
249    138.0             1.0               Medium

[250 rows x 3 columns]


In [53]:
# Display the frame with all derived columns (the rendered output elides the middle rows).
df

Unnamed: 0,Rank,Title,Year,Rating,Runtime,Rating_Norm,Runtime_Norm,Rating_Std,Runtime_Std,Runtime_Binned,Runtime_Binned_Label
0,1,The Shawshank Redemption,1994,9.3,142.0,1.000000,0.363296,4.099278,0.409137,1.0,Medium
1,2,The Godfather,1972,9.2,175.0,0.923077,0.486891,3.707677,1.456415,1.0,Medium
2,3,The Dark Knight,2008,9.0,152.0,0.769231,0.400749,2.924476,0.726494,1.0,Medium
3,4,The Godfather Part II,1974,9.0,202.0,0.769231,0.588015,2.924476,2.313278,1.0,Medium
4,5,12 Angry Men,1957,9.0,96.0,0.769231,0.191011,2.924476,-1.050705,0.0,Short
...,...,...,...,...,...,...,...,...,...,...,...
245,246,Dersu Uzala,1975,8.0,142.0,0.000000,0.363296,-0.991534,0.409137,1.0,Medium
246,247,Monsters Inc.,2001,8.0,92.0,0.000000,0.176030,-0.991534,-1.177648,0.0,Short
247,248,Miracle in Cell No. 7,2019,8.0,132.0,0.000000,0.325843,-0.991534,0.091780,0.0,Short
248,249,Children of Heaven,1997,8.0,89.0,0.000000,0.164794,-0.991534,-1.272855,0.0,Short
