In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the supershop dataset from disk and preview its first rows.
df = pd.read_csv("supershop.csv")
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
# Count the missing values in each column.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

# Handle NaN

In [4]:
# Mean of the Transport column; used further down to fill the missing entry.
mean = df["Transport"].mean()

In [5]:
# Sanity-check the mean by hand: total divided by the number of non-missing values.
# count() skips NaN, so the divisor no longer has to be hard-coded as 49
# (50 rows minus the single missing Transport entry).
df.Transport.sum() / df.Transport.count()

215331.7324489796

In [6]:
# Length of the Transport column (this counts the missing entry as well).
df["Transport"].shape

(50,)

In [7]:
# Element-wise mask of the frame: True marks a missing cell at that row index.
df.isna()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [8]:
# Fill the missing Transport entry with the column mean, then re-count the nulls
# to confirm nothing is missing any more.
df["Transport"] = df["Transport"].fillna(mean)
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [9]:
# Median of Transport, taken after the fillna step, so the filled-in value is included.
median = df["Transport"].median()

In [10]:
# Display the Transport median computed in the previous cell.
median

214983.2712244898

# Manual Encoding (using `replace`, without an encoder class)

In [11]:
# List the distinct area names in order of first appearance.
df["Area"].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [12]:
# Hand-assign an integer code to every area name, using nothing but Series.replace.
df["Area"] = df["Area"].replace({'Dhaka': 3, 'Ctg': 2, 'Rangpur': 1})

In [13]:
# Confirm that Area now holds the integer codes assigned by the replace call.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,3,192261.83
1,162597.7,151377.59,443898.53,2,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,3,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


# Label Encoder

In [14]:
# Reload the raw CSV so this section starts from the original string-valued Area column.
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# Create the label encoder; the same instance is reused by the encoding cells that follow.
le = LabelEncoder()

In [17]:
# Swap the area names for integer labels (here Ctg -> 0, Dhaka -> 1, Rangpur -> 2).
df["Area"] = le.fit_transform(df["Area"])

In [18]:
# Inspect the frame after label-encoding Area.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Label Encoder with Loop

In [19]:
# Reload the raw CSV so the loop below works on the unencoded data.
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [20]:
# Label-encode every non-numeric column and leave numeric columns untouched.
#
# The numeric test uses pandas' dtype predicate. Comparing a dtype against the
# abstract `np.number` depends on a deprecated implicit conversion in NumPy
# (it emits a DeprecationWarning) and does not reliably recognise numeric
# columns such as integers, which would then be label-encoded by mistake.
for column in df.columns:
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    # `le` is refitted on each column, so afterwards it only remembers the
    # classes of the last column it encoded.
    df[column] = le.fit_transform(df[column])

  if df[column].dtype == np.number:


# One Hot Encoder

In [21]:
# Reload the raw CSV so the one-hot section starts from the original Area strings.
df = pd.read_csv('supershop.csv')

In [22]:
# Preview the freshly loaded frame.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [23]:
# Build one indicator column per distinct area name.
dummy = pd.get_dummies(df.Area)

In [24]:
# Preview the indicator columns built from Area.
dummy.head()

Unnamed: 0,Ctg,Dhaka,Rangpur
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [25]:
# The string Area column is superseded by the indicator columns, so remove it.
df = df.drop(columns='Area')

In [26]:
# Verify that the Area column has been removed.
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [27]:
# Attach the indicator columns to the right of the remaining columns.
df2 = pd.concat([df, dummy], axis="columns")

In [28]:
# Preview the combined frame: the original numeric columns plus the indicator columns.
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


In [29]:
# Feature matrix: every column of df2 except the Profit target.
x = df2.drop(columns='Profit')
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Ctg,Dhaka,Rangpur
0,114523.61,136897.8,471784.1,0,1,0
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,0,1
3,144372.41,118671.85,383199.62,0,1,0
4,142107.34,91391.77,366168.42,0,0,1


In [30]:
# Target vector: the Profit column of df2.
y = df2["Profit"]
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

# Ordinal Encoding 

In [31]:
# Reload the raw CSV so the ordinal-encoding section starts from the original Area strings.
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [32]:
# List the distinct area names to decide on an explicit category order.
df["Area"].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [33]:
# Explicit category order handed to the ordinal encoder below; position in this
# list becomes the code (Dhaka -> 0, Ctg -> 1, Rangpur -> 2).
cities = ['Dhaka', 'Ctg', 'Rangpur']

In [34]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
# Ordinal encoder with a single feature whose category order is taken from `cities`.
ordinal = OrdinalEncoder(categories=[cities])

In [36]:
# Double brackets select Area as a one-column DataFrame (2-D input for the encoder);
# the result is an array of float codes.
encoding = ordinal.fit_transform(df[['Area']])

In [37]:
# Wrap the encoded array in a one-column DataFrame named Area so it can be
# concatenated back onto the main frame later.
new_cities = pd.DataFrame(encoding, columns=['Area'])

In [38]:
# Preview the ordinal-encoded Area column.
new_cities.head()

Unnamed: 0,Area
0,0.0
1,1.0
2,2.0
3,0.0
4,2.0


In [39]:
# Remove the string Area column; its encoded replacement lives in new_cities.
df = df.drop(columns='Area')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [40]:
# Attach the encoded Area column to the right of the remaining columns.
df2 = pd.concat([df, new_cities], axis="columns")
df2.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area
0,114523.61,136897.8,471784.1,192261.83,0.0
1,162597.7,151377.59,443898.53,191792.06,1.0
2,153441.51,101145.55,407934.54,191050.39,2.0
3,144372.41,118671.85,383199.62,182901.99,0.0
4,142107.34,91391.77,366168.42,166187.94,2.0


In [41]:
# Feature matrix: every column of df2 except the Profit target.
x = df2.drop(columns='Profit')
x.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,114523.61,136897.8,471784.1,0.0
1,162597.7,151377.59,443898.53,1.0
2,153441.51,101145.55,407934.54,2.0
3,144372.41,118671.85,383199.62,0.0
4,142107.34,91391.77,366168.42,2.0


In [42]:
# Take the target from df2, the same frame the features `x` come from, so that
# x and y always share one source and stay row-aligned (consistent with the
# one-hot section, which also uses df2.Profit).
y = df2.Profit
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

# Hashing Encoder

In [43]:
# Reload the raw CSV so the hashing-encoder section starts from the original Area strings.
df = pd.read_csv('supershop.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [44]:
!pip install category_encoder

ERROR: Could not find a version that satisfies the requirement category_encoder
ERROR: No matching distribution found for category_encoder


In [45]:
import category_encoders as ce

In [46]:
# Hashing encoder for the Area column, configured to produce three output columns.
encoder = ce.HashingEncoder(cols='Area', n_components=3)

In [47]:
# Fit the hashing encoder on Area and display the resulting hashed columns.
encoder.fit_transform(df.Area)

Unnamed: 0,col_0,col_1,col_2
0,0,1,0
1,0,0,1
2,1,0,0
3,0,1,0
4,1,0,0
5,0,1,0
6,0,0,1
7,1,0,0
8,0,1,0
9,0,0,1
