In [1]:
import pandas as pd
import numpy as np #mathematical library

In [2]:
# Load the raw supershops dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='supershops.csv')

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


## Handle Null values

In [4]:
# Count the missing values in each column.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [5]:
# Fill the missing 'Transport' value(s) with the column mean (Series.mean()
# skips NaN by default), then re-count the nulls to confirm none remain.
df['Transport'] = df['Transport'].fillna(value=df['Transport'].mean())
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [6]:
# One independent copy of the cleaned frame per scaling technique, so every
# demonstration starts from the same unscaled values:
#   df_mms -> single-column MinMaxScaler demo
#   df_n   -> normalization (loop)
#   df_s   -> standardization
#   df_ft  -> log transformation
#   df_rs  -> robust scaling
#   df_mas -> max-abs scaling
df_mms, df_n, df_s, df_ft, df_rs, df_mas = (df.copy() for _ in range(6))

Feature-scaling techniques demonstrated in this notebook:

1. Normalization
2. Standardization
3. Log Transformation
4. Robust Scaler
5. Max Absolute Scaler


In [7]:
# Import the scikit-learn preprocessing transformers used in the sections below
from sklearn.preprocessing import MinMaxScaler,StandardScaler,FunctionTransformer,RobustScaler,MaxAbsScaler

# Normalization

In [8]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [9]:
# Min-max normalization; feature_range=(0, 1) is the default and is spelled
# out here only for readability.
MMS = MinMaxScaler(feature_range=(0, 1))
# fit() only learns the scaling parameters (the column's min and max) and
# returns the fitted scaler itself, so df_ms refers to the same object as
# MMS -- it is not a DataFrame despite its name.
df_ms = MMS.fit(X=df[['Marketing Spend']])

In [10]:
# Display the object returned by MMS.fit() in the previous cell.
df_ms

In [11]:
# Apply the scaler fitted on 'Marketing Spend' and store the result in a new
# column so the raw and normalized values can be compared side by side.
df_mms['Marketing Spend New'] = MMS.transform(df[['Marketing Spend']]).ravel()
df_mms.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.692617
1,162597.7,151377.59,443898.53,Ctg,191792.06,0.983359
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.927985
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.873136
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.859438


# Perform Normalization using a loop

In [12]:
# Min-max normalize every float64 feature column; 'Profit' is excluded
# (presumably the prediction target) and keeps its original scale.
#
# Each column gets its own fresh MinMaxScaler. Calling fit_transform on the
# shared MMS would overwrite the parameters it learned from 'Marketing Spend'
# in In [9], so re-running In [11] afterwards would scale that column with a
# different column's min/max.
for col in df.select_dtypes(include='float64').columns:
    if col == 'Profit':
        continue
    df_n[[col]] = MinMaxScaler().fit_transform(df[[col]])
df_n.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.651744,1.0,Dhaka,192261.83
1,0.983359,0.761972,0.940893,Ctg,191792.06
2,0.927985,0.379579,0.864664,Rangpur,191050.39
3,0.873136,0.512998,0.812235,Dhaka,182901.99
4,0.859438,0.305328,0.776136,Rangpur,166187.94


# Standardization

In [13]:
SS = StandardScaler()
# Standardize every float64 feature column; 'Profit' is left unscaled.
# SS is re-fitted on each column in turn, so after the loop it holds the
# statistics of the last feature column only.
for col in df.select_dtypes(include='float64').columns:
    if col != 'Profit':
        df_s[[col]] = SS.fit_transform(df[[col]])
# A fixed random_state makes the preview show the same rows on every
# Restart & Run All; the value 42 itself is arbitrary.
df_s.sample(5, random_state=42)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
10,0.620398,-0.387599,0.117949,Rangpur,146121.95
13,0.402078,0.510179,0.318413,Ctg,134307.35
49,-1.622362,-0.157226,-1.451276,Ctg,14681.4
27,-0.035519,0.235069,1.175734,Dhaka,105008.31
23,-0.136201,-0.562211,0.762804,Rangpur,108733.99


# Log Transformation

In [14]:
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html

In [15]:
# np.log1p(x) computes log(1 + x), so a zero input maps to 0 instead of the
# -inf that plain np.log would produce.
FT = FunctionTransformer(func=np.log1p)

In [16]:
# Log-transform a single column first as a worked example.
df_ft[['Transport']] = FT.fit_transform(df[['Transport']])

In [17]:
# Preview the result: only 'Transport' is on the log scale at this point.
df_ft.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,13.064279,Dhaka,192261.83
1,162597.7,151377.59,13.003354,Ctg,191792.06
2,153441.51,101145.55,12.918864,Rangpur,191050.39
3,144372.41,118671.85,12.856314,Dhaka,182901.99
4,142107.34,91391.77,12.810851,Rangpur,166187.94


# Perform Log Transformation using a loop

In [18]:
# Log-transform every float64 feature column; 'Profit' is left untouched.
# Values are always read from the untransformed df, so 'Transport' (already
# transformed in In [16]) is recomputed rather than log-transformed twice.
# Only column names are needed, so iterate over them directly instead of
# unpacking an unused Series from df.items().
for col in df.select_dtypes(include='float64').columns:
    if col != 'Profit':
        df_ft[[col]] = FT.fit_transform(df[[col]])
df_ft.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,11.648545,11.826997,13.064279,Dhaka,192261.83
1,11.99904,11.927539,13.003354,Ctg,191792.06
2,11.941081,11.524326,12.918864,Rangpur,191050.39
3,11.880158,11.684126,12.856314,Dhaka,182901.99
4,11.864345,11.422922,12.810851,Rangpur,166187.94


# Robust Scaler

In [19]:
# RobustScaler with default settings; per the scikit-learn docs it centers
# each feature on its median and scales by the interquartile range, which
# makes it less sensitive to outliers than mean/std scaling.
RS = RobustScaler()

In [20]:
# Robust-scale every float64 feature column; 'Profit' is left unscaled.
# Only column names are needed, so iterate over them directly instead of
# unpacking an unused Series from df.items().
for col in df.select_dtypes(include='float64').columns:
    if col != 'Profit':
        df_rs[[col]] = RS.fit_transform(df[[col]])
df_rs.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.67253,0.345355,1.561661,Dhaka,192261.83
1,1.452113,0.697565,1.392082,Ctg,191792.06
2,1.303634,-0.52429,1.173378,Rangpur,191050.39
3,1.156567,-0.097977,1.022959,Dhaka,182901.99
4,1.119836,-0.761543,0.919389,Rangpur,166187.94


# Max Absolute Scaler

In [21]:
# MaxAbsScaler divides each feature by its maximum absolute value, so the
# scaled values fall in [-1, 1] and the data is not shifted or centered.
MAS = MaxAbsScaler()

In [22]:
# Max-abs scale every float64 feature column; 'Profit' is left unscaled.
# Only column names are needed, so iterate over them directly instead of
# unpacking an unused Series from df.items().
for col in df.select_dtypes(include='float64').columns:
    if col != 'Profit':
        df_mas[[col]] = MAS.fit_transform(df[[col]])
df_mas.head(5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0.692617,0.749527,1.0,Dhaka,192261.83
1,0.983359,0.828805,0.940893,Ctg,191792.06
2,0.927985,0.553781,0.864664,Rangpur,191050.39
3,0.873136,0.649738,0.812235,Dhaka,182901.99
4,0.859438,0.500378,0.776136,Rangpur,166187.94
