# Supershops data

In [68]:
import pandas as pd 
import warnings
warnings.filterwarnings('ignore') 

In [69]:
df1 = pd.read_csv('supershops.csv') 

In [70]:
df1.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [71]:
x =  df1.drop('Profit', axis=1)

In [72]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,Dhaka
1,162597.7,151377.59,443898.53,Ctg
2,153441.51,101145.55,407934.54,Rangpur
3,144372.41,118671.85,383199.62,Dhaka
4,142107.34,91391.77,366168.42,Rangpur


# Measure of Central Tendency

In [73]:
df1.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [74]:
df1[19:20]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
19,86419.7,153514.11,,Dhaka,122776.86


In [75]:
df1.Transport.mean() # the mean is pulled toward extreme values, so it suits columns without outliers

215331.73244897963

In [76]:
df1.Transport.median() # the median is robust to extreme values, so prefer it when outliers are present

214634.81

In [77]:
df1.Transport.max()

471784.1

In [78]:
df1.Transport.min()

0.0

In [79]:
# Impute the missing Transport entry with the column mean
# (pandas skips NaN values when it computes the mean).
transport_mean = df1['Transport'].mean()
df1['Transport'] = df1['Transport'].fillna(transport_mean)

In [80]:
df1.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [81]:
df1[19:20]

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
19,86419.7,153514.11,215331.732449,Dhaka,122776.86


# Label Encoder

In [82]:
def label_encoder(df1, col):
    """Encode the values of ``df1[col]`` as integers in order of first appearance.

    Parameters
    ----------
    df1 : object supporting ``df1[col]`` (e.g. a pandas DataFrame or a dict)
        Container holding the column to encode.
    col : hashable
        Key of the column whose values are encoded.

    Returns
    -------
    tuple of (list, dict)
        The integer code for every value, in iteration order, and the
        mapping from each distinct value to its code.
    """
    codes_by_value = {}
    # setdefault stores the next unused code the first time a value is seen
    # and returns the already stored code on every later occurrence.
    codes = [
        codes_by_value.setdefault(value, len(codes_by_value))
        for value in df1[col]
    ]
    return codes, codes_by_value

# Replace the textual Area column of x with its first-appearance integer codes;
# label_map keeps the value -> code mapping for later lookup.
encoded_value, label_map = label_encoder(df1=x, col='Area')
x['Area'] = encoded_value

In [83]:
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,0
1,162597.7,151377.59,443898.53,1
2,153441.51,101145.55,407934.54,2
3,144372.41,118671.85,383199.62,0
4,142107.34,91391.77,366168.42,2


In [84]:
from sklearn.preprocessing import LabelEncoder

In [85]:
le = LabelEncoder()

In [86]:
df2 = df1.copy()

In [87]:
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [88]:
# LabelEncoder works on a 1-D array of labels, so pass the Area column as a
# Series. A one-column DataFrame (df2[['Area']]) is 2-D and makes scikit-learn
# emit a DataConversionWarning, which the notebook's global
# warnings.filterwarnings('ignore') would silently hide.
df2['Area'] = le.fit_transform(df2['Area'])

In [89]:
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


Using For Loop

In [90]:
df3 = pd.read_csv('supershops.csv') 
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [91]:
# Collect the names of all object-dtype (string) columns of df3.
cat_cols = df3.select_dtypes(include=['object']).columns

# Overwrite each of those columns with its label-encoded values. The shared
# encoder `le` is refit on every pass, so after the loop it only holds the
# classes of the last column processed.
for column in cat_cols:
    df3[column] = le.fit_transform(df3[column])

df3.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# One Hot Encoder

In [92]:
df4 = df1.copy()

In [93]:
# One-hot encode the categorical column(s) of df4; drop_first=True omits the
# first category so the remaining dummy columns are not redundant. The result
# is only displayed here and is not assigned, so df4 itself stays unchanged.
pd.get_dummies(df4, prefix='area', drop_first=True).head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,area_Dhaka,area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


Using For Loop

In [94]:
df5 = pd.read_csv('supershops.csv') 
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [96]:
from sklearn.preprocessing import OneHotEncoder

# Names of all object-dtype (string) columns to be one-hot encoded.
cat_cols = df5.select_dtypes(include=['object']).columns

# NOTE(review): `sparse=False` requests a dense array; newer scikit-learn
# releases renamed this argument to `sparse_output` — confirm against the
# installed version before upgrading.
hot_encoder = OneHotEncoder(sparse=False)

# Accumulate into df6 so that every categorical column is replaced by its dummy
# columns. Rebuilding df6 from df5 on each pass would keep only the encoding of
# the last column processed.
df6 = df5.copy()
for column in cat_cols:
    encoded_data = hot_encoder.fit_transform(df5[[column]])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=[f"{column}_{category}" for category in hot_encoder.categories_[0]],
        index=df5.index,  # share df5's index so join() lines the rows up
    )
    df6 = df6.drop(column, axis=1).join(encoded_df)

# Drop one dummy column so that 'Ctg' acts as the reference category.
df6 = df6.drop('Area_Ctg', axis=1)
df6.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1.0,0.0
1,162597.7,151377.59,443898.53,191792.06,0.0,0.0
2,153441.51,101145.55,407934.54,191050.39,0.0,1.0
3,144372.41,118671.85,383199.62,182901.99,1.0,0.0
4,142107.34,91391.77,366168.42,166187.94,0.0,1.0


# Ordinal Encoder

In [115]:
df7 = df1.copy()
df7.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [116]:
from sklearn.preprocessing import OrdinalEncoder

In [117]:
# Fix the category order explicitly, so that Ctg -> 0, Dhaka -> 1, Rangpur -> 2.
ordinal = OrdinalEncoder(categories = [['Ctg', 'Dhaka', 'Rangpur']])

In [118]:
# OrdinalEncoder expects 2-D input (samples x features), hence the double
# brackets around 'Area'; the resulting codes are stored as floats.
df7.Area = ordinal.fit_transform(df7[['Area']])

In [119]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1.0,192261.83
1,162597.7,151377.59,443898.53,0.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,1.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


Using For Loop

In [113]:
df8 = pd.read_csv('supershops.csv') 
df8.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [114]:
# Names of all object-dtype (string) columns of df8.
cat_cols = df8.select_dtypes(include=['object']).columns

# No `categories` argument is given, so the encoder determines the categories
# from the data each time it is fitted.
ordinal_encoder = OrdinalEncoder()

# Overwrite each categorical column with its ordinal codes (floats). The
# column is selected with double brackets because the encoder expects 2-D input.
for column in cat_cols:
    df8[column] = ordinal_encoder.fit_transform(df8[[column]])

df8.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1.0,192261.83
1,162597.7,151377.59,443898.53,0.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,1.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94
