In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Supershop.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,1,162597.7,151377.59,443898.53,Ctg,191792.06
2,2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Measure Of Tendancy

In [4]:
df.isnull().sum()

Unnamed: 0         0
Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

In [5]:
df[18:20] # to see the NUN value.

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
18,18,91749.16,114175.79,294919.57,Rangpur,124266.9
19,19,86419.7,153514.11,,Dhaka,122776.86


In [6]:
m = df.Transport.mean() # not for outlier
m

215331.7324489796

In [7]:
df.Transport.median() # for outlier

214634.81

In [8]:
df.Transport = df.Transport.fillna(m)

In [9]:
df.isnull().sum()

Unnamed: 0         0
Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

In [10]:
df[18: 20]

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
18,18,91749.16,114175.79,294919.57,Rangpur,124266.9
19,19,86419.7,153514.11,215331.732449,Dhaka,122776.86


# Label Encoding

In [11]:
x = df.drop('Profit', axis = 1)

In [12]:
x.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0,114523.61,136897.8,471784.1,Dhaka
1,1,162597.7,151377.59,443898.53,Ctg
2,2,153441.51,101145.55,407934.54,Rangpur
3,3,144372.41,118671.85,383199.62,Dhaka
4,4,142107.34,91391.77,366168.42,Rangpur


In [13]:
def label_encoded(df, col_name):
    
    label_map = {}
    encoded_value = []
    
    for instance in df[col_name]:
        # print(instance)
        if instance not in label_map:
            label_map[instance] = len(label_map)
        encoded_value.append(label_map[instance])
        
    return encoded_value, label_map


encoded_value, label_map = label_encoded(x, 'Area')

In [14]:
x.Area = encoded_value

In [15]:
x.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0,114523.61,136897.8,471784.1,0
1,1,162597.7,151377.59,443898.53,1
2,2,153441.51,101145.55,407934.54,2
3,3,144372.41,118671.85,383199.62,0
4,4,142107.34,91391.77,366168.42,2


# Label Encoding Sklearn

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
le = LabelEncoder()

In [18]:
df1 = x.copy()

In [19]:
df1.Area = le.fit_transform(df[['Area']])

In [20]:
df1.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0,114523.61,136897.8,471784.1,1
1,1,162597.7,151377.59,443898.53,0
2,2,153441.51,101145.55,407934.54,2
3,3,144372.41,118671.85,383199.62,1
4,4,142107.34,91391.77,366168.42,2


# One Hot Encdoing Manual

In [21]:
x_1 = df.drop('Profit', axis = 1)

In [22]:
x_1.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0,114523.61,136897.8,471784.1,Dhaka
1,1,162597.7,151377.59,443898.53,Ctg
2,2,153441.51,101145.55,407934.54,Rangpur
3,3,144372.41,118671.85,383199.62,Dhaka
4,4,142107.34,91391.77,366168.42,Rangpur


In [23]:
def one_hot_encoded(df, col_name):
    
    u = set(df[col_name]) #To get unique catagories
    #print(u)
    
    one_hot = {}
    
    for catagory in u:
        
        encoded_value = []
        
        for x in df[col_name]:
            
            if x == catagory:
                encoded_value.append(1)
            else:
                encoded_value.append(0)
        
        one_hot[f'{col_name}_{catagory}'] = encoded_value 
        
    # need to create a data frame for one hot encoded dict
    one_hot_df = pd.DataFrame(one_hot)
        
    return one_hot_df
    
encoded_value = one_hot_encoded(x_1, 'Area')

In [24]:
result = pd.concat([x_1, encoded_value], axis=1)

In [25]:
result.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Area_Rangpur,Area_Ctg,Area_Dhaka
0,0,114523.61,136897.8,471784.1,Dhaka,0,0,1
1,1,162597.7,151377.59,443898.53,Ctg,0,1,0
2,2,153441.51,101145.55,407934.54,Rangpur,1,0,0
3,3,144372.41,118671.85,383199.62,Dhaka,0,0,1
4,4,142107.34,91391.77,366168.42,Rangpur,1,0,0


# One Hot Encoding Pandas

In [26]:
df2 = df.copy()

In [27]:
pd.get_dummies(df2, drop_first=True, dtype=int)

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Profit,Area_Dhaka,Area_Rangpur
0,0,114523.61,136897.8,471784.1,192261.83,1,0
1,1,162597.7,151377.59,443898.53,191792.06,0,0
2,2,153441.51,101145.55,407934.54,191050.39,0,1
3,3,144372.41,118671.85,383199.62,182901.99,1,0
4,4,142107.34,91391.77,366168.42,166187.94,0,1
5,5,131876.9,99814.71,362861.36,156991.12,1,0
6,6,134615.46,147198.87,127716.82,156122.51,0,0
7,7,130298.13,145530.06,323876.68,155752.6,0,1
8,8,120542.52,148718.95,311613.29,152211.77,1,0
9,9,123334.88,108679.17,304981.62,149759.96,0,0


In [28]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,1,162597.7,151377.59,443898.53,Ctg,191792.06
2,2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,4,142107.34,91391.77,366168.42,Rangpur,166187.94


# Ordinal Manual

In [29]:
df_3 = df.copy()

In [30]:
df_3.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,1,162597.7,151377.59,443898.53,Ctg,191792.06
2,2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [31]:
df_3.drop('Profit', axis=1)

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area
0,0,114523.61,136897.8,471784.1,Dhaka
1,1,162597.7,151377.59,443898.53,Ctg
2,2,153441.51,101145.55,407934.54,Rangpur
3,3,144372.41,118671.85,383199.62,Dhaka
4,4,142107.34,91391.77,366168.42,Rangpur
5,5,131876.9,99814.71,362861.36,Dhaka
6,6,134615.46,147198.87,127716.82,Ctg
7,7,130298.13,145530.06,323876.68,Rangpur
8,8,120542.52,148718.95,311613.29,Dhaka
9,9,123334.88,108679.17,304981.62,Ctg


In [32]:
def ordinal_encoded(df, col_name, order):
    
    df[f'encoded_{col_name}'] = df[col_name].map(order)
    
    return df

ordinal_order = {'Ctg': 0, 'Dhaka': 1, 'Rangpur': 2}

ordnial = ordinal_encoded(df_3, 'Area', ordinal_order)


In [33]:
ordnial.head()

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,encoded_Area
0,0,114523.61,136897.8,471784.1,Dhaka,192261.83,1
1,1,162597.7,151377.59,443898.53,Ctg,191792.06,0
2,2,153441.51,101145.55,407934.54,Rangpur,191050.39,2
3,3,144372.41,118671.85,383199.62,Dhaka,182901.99,1
4,4,142107.34,91391.77,366168.42,Rangpur,166187.94,2


# Ordinal Encoding Sklearn

In [34]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
ol = OrdinalEncoder(categories = [["Dhaka", "Ctg", "Rangpur"]])

In [36]:
df3 = df.copy()

In [37]:
ordinal = ol.fit_transform(df[['Area']])

In [38]:
ordinal

array([[0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [1.],
       [2.],
       [1.],
       [2.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [2.],
       [0.],
       [1.],
       [2.],
       [0.],
       [2.],
       [0.],
       [2.],
       [0.],
       [1.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [0.],
       [1.],
       [1.],
       [2.],
       [1.],
       [0.],
       [1.],
       [0.],
       [2.],
       [1.],
       [0.],
       [1.]])

In [39]:
df3.Area = ordinal

In [40]:
df3

Unnamed: 0.1,Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,0,114523.61,136897.8,471784.1,0.0,192261.83
1,1,162597.7,151377.59,443898.53,1.0,191792.06
2,2,153441.51,101145.55,407934.54,2.0,191050.39
3,3,144372.41,118671.85,383199.62,0.0,182901.99
4,4,142107.34,91391.77,366168.42,2.0,166187.94
5,5,131876.9,99814.71,362861.36,0.0,156991.12
6,6,134615.46,147198.87,127716.82,1.0,156122.51
7,7,130298.13,145530.06,323876.68,2.0,155752.6
8,8,120542.52,148718.95,311613.29,0.0,152211.77
9,9,123334.88,108679.17,304981.62,1.0,149759.96
