# Agora dataset

In [1]:
import pandas as pd

In [2]:
# Load the Agora data from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='agora.csv')
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [3]:
# Count missing values per column to see what needs imputing.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [4]:
# How many rows fall into each Area category.
df['Area'].value_counts()

Dhaka      17
Ctg        17
Rangpur    16
Name: Area, dtype: int64

In [5]:
# Average Transport cost; used in the next cell to fill the missing entry.
mean = df['Transport'].mean()
mean

215331.7324489796

In [6]:
# Replace the missing Transport value with the column mean computed above.
df['Transport'] = df['Transport'].fillna(value=mean)

In [7]:
# Re-check missing values: every column should now report zero.
df.isna().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [8]:
# (rows, columns) of the Agora frame after imputation.
df.shape

(50, 5)

# Encoding with Replace Function

In [9]:
# List the distinct Area labels that need a numeric code.
df['Area'].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [10]:
# Map each Area label to an integer code (alphabetical order of the labels).
df['Area'] = df['Area'].replace({'Ctg': 0, 'Dhaka': 1, 'Rangpur': 2})

In [11]:
# Confirm the Area column now holds the integer codes.
df['Area'].head(n=5)

0    1
1    0
2    2
3    1
4    2
Name: Area, dtype: int64

In [12]:
# Full-frame preview with the encoded Area column in place.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Encoding with Label Encoder

In [13]:
# Start again from the raw file so this section does not depend on the
# encoding applied in the previous one.
df = pd.read_csv('agora.csv')
# Re-reading the CSV brings back the missing Transport value, so repeat the
# mean imputation before encoding.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [14]:
# LabelEncoder instance used in the next cell to turn Area strings into
# integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; OrdinalEncoder is the feature-column counterpart.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [15]:
# Fit the encoder on Area and overwrite the column with the resulting codes,
# then preview the encoded values.
df['Area'] = le.fit_transform(df['Area'])
df['Area'].head(n=5)

0    1
1    0
2    2
3    1
4    2
Name: Area, dtype: int32

In [16]:
# Full-frame preview after label encoding.
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


# Encoding with One Hot Encoder

In [17]:
# Start again from the raw file so Area is back to its string labels.
df = pd.read_csv('agora.csv')
# Re-reading the CSV brings back the missing Transport value, so repeat the
# mean imputation; otherwise the NaN is carried into the one-hot frames below.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [18]:
# Row count per Area label; each label becomes its own indicator column below.
df['Area'].value_counts()

Dhaka      17
Ctg        17
Rangpur    16
Name: Area, dtype: int64

In [19]:
# One-hot encode Area into one indicator column per label.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns boolean (True/False) columns.
one_hot_encoder = pd.get_dummies(df, columns=['Area'], dtype=int)
one_hot_encoder.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Ctg,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,0,1,0
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,0,1
3,144372.41,118671.85,383199.62,182901.99,0,1,0
4,142107.34,91391.77,366168.42,166187.94,0,0,1


In [20]:
# One-hot encode Area but drop the first indicator column, so the dropped
# label is represented by all remaining indicators being 0.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns boolean (True/False) columns.
drop_encoder = pd.get_dummies(df, columns=['Area'], drop_first=True, dtype=int)
drop_encoder.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


# Encoding with Ordinal Encoder

In [21]:
# Start again from the raw file so Area is back to its string labels.
df = pd.read_csv('agora.csv')
# Re-reading the CSV brings back the missing Transport value, so repeat the
# mean imputation before encoding.
df['Transport'] = df['Transport'].fillna(df['Transport'].mean())
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [22]:
# Distinct Area labels, used to write out the category order below.
df['Area'].unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [23]:
# Explicit label order handed to OrdinalEncoder in the next cell; a label's
# position in this list becomes its code.
category = ['Ctg', 'Dhaka', 'Rangpur']

In [24]:
# OrdinalEncoder configured with the explicit category order defined above
# (one list per encoded column, hence the outer list).
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(categories=[category])

In [25]:
# The encoder expects a 2-D input, so pass Area as a one-column frame; the
# codes written back into the column are floats.
df['Area'] = oe.fit_transform(df[['Area']])
df.head(n=5)

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1.0,192261.83
1,162597.7,151377.59,443898.53,0.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,1.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


# Insurance dataset

In [26]:
import pandas as pd

# Load the insurance data and preview its first five rows.
df2 = pd.read_csv(filepath_or_buffer='insurance.csv')
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [27]:
# (rows, columns) of the insurance frame.
df2.shape

(1338, 7)

# Encoding with Replace Function

In [28]:
# Distinct labels in the sex column.
df2['sex'].unique()

array(['female', 'male'], dtype=object)

In [29]:
# Encode sex by direct substitution: male -> 1, female -> 2.
df2['sex'] = df2['sex'].replace({'male': 1, 'female': 2})
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,2,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [30]:
# Distinct labels in the smoker column.
df2['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [31]:
# Encode smoker by direct substitution: no -> 0, yes -> 1.
df2['smoker'] = df2['smoker'].replace({'no': 0, 'yes': 1})
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,2,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [32]:
# Distinct labels in the region column.
df2['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [33]:
# Encode region by direct substitution, numbering the labels alphabetically.
df2['region'] = df2['region'].replace(
    {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}
)
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,2,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# Encoding with Label Encoder

In [34]:
# Reload the raw insurance data so the categorical columns are strings again.
df2 = pd.read_csv(filepath_or_buffer='insurance.csv')
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
# LabelEncoder instance used by the encoding loop in the next cell.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [36]:
# Fit a separate encoder per column and keep each one, keyed by column name.
# Reusing a single LabelEncoder would refit it on every pass, so only the last
# column's label-to-code mapping would remain available afterwards (e.g. for
# inverse_transform).
label_encoders = {}
for col in ['sex', 'smoker', 'region']:
    label_encoders[col] = LabelEncoder()
    df2[col] = label_encoders[col].fit_transform(df2[col])

df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# Encoding with One Hot Encoder

In [37]:
import pandas as pd

# Reload the raw insurance data so the categorical columns are strings again.
df2 = pd.read_csv(filepath_or_buffer='insurance.csv')
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [38]:
# One-hot encode the three categorical columns, dropping the first indicator
# of each so the dropped label is represented by all-zero indicators.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# returns boolean (True/False) columns.
one_hot_encoder = pd.get_dummies(
    df2,
    columns=['sex', 'smoker', 'region'],
    drop_first=True,
    dtype=int,
)
one_hot_encoder.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Encoding with Ordinal Encoder

In [39]:
import pandas as pd

# Reload the raw insurance data so the categorical columns are strings again.
df2 = pd.read_csv(filepath_or_buffer='insurance.csv')
df2.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [40]:
# OrdinalEncoder with no explicit category order supplied; the encoder
# determines the categories from the data when it is fitted.
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()

In [41]:
# Encode all three categorical columns in one call and write the resulting
# float codes back into the same columns; preview ten rows.
df2[['sex', 'smoker', 'region']] = oe.fit_transform(
    df2[['sex', 'smoker', 'region']]
)
df2.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,1.0,3.0,16884.924
1,18,1.0,33.77,1,0.0,2.0,1725.5523
2,28,1.0,33.0,3,0.0,2.0,4449.462
3,33,1.0,22.705,0,0.0,1.0,21984.47061
4,32,1.0,28.88,0,0.0,1.0,3866.8552
5,31,0.0,25.74,0,0.0,2.0,3756.6216
6,46,0.0,33.44,1,0.0,2.0,8240.5896
7,37,0.0,27.74,3,0.0,1.0,7281.5056
8,37,1.0,29.83,2,0.0,0.0,6406.4107
9,60,0.0,25.84,0,0.0,1.0,28923.13692
