In [1]:
import numpy as np
import pandas as pd

### Replacing placeholder values like '-' with NaN

In [2]:
# Toy frame in which '-' stands in for a missing value; written row by row
# so the literal reads like the table it displays.
data = pd.DataFrame(
    [
        [1, '-', 45],
        [2, 10, '-'],
        ['-', 20, 52],
    ],
    columns=['A', 'B', 'C'],
)
data

Unnamed: 0,A,B,C
0,1,-,45
1,2,10,-
2,-,20,52


In [3]:
# Swap the '-' placeholder for NaN so pandas treats it as a missing value.
# replace() returns a new frame (`data` itself is left unchanged here).
# astype(float) makes the numeric dtype explicit instead of relying on
# replace() silently downcasting object columns, which pandas has deprecated.
data.replace('-', np.nan).astype(float)

Unnamed: 0,A,B,C
0,1.0,,45.0
1,2.0,10.0,
2,,20.0,52.0


In [4]:
# Load the preprocessing example dataset and preview its first five rows
data = pd.read_csv(filepath_or_buffer='Data/preprocessingData.csv')
data.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Count the missing entries in each column (sum of the boolean null mask down the rows)
data.isnull().sum(axis=0)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [6]:
# Extraction of the independent variables (X) and the dependent variable (y).
# .copy() gives X its own data: later cells assign into X with .iloc, and doing
# that on a bare slice of `data` is the chained-assignment pattern that raises
# SettingWithCopyWarning.
X = data.iloc[:, :3].copy()   # first three columns: Country, Age, Salary
y = data.iloc[:, 3]           # fourth column: Purchased

print('Independent Variables:\n', X.head())
print('Dependent Variable:\n', y.head())

Independent Variables:
    Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
Dependent Variable:
 0     No
1    Yes
2     No
3     No
4    Yes
Name: Purchased, dtype: object


In [7]:
# Fill the missing Age / Salary entries (columns 1 and 2 of X)
from sklearn.impute import SimpleImputer

# Create the imputer with its default settings and learn the fill values
# from the two numeric columns in one step (fit returns the imputer itself).
imputer = SimpleImputer().fit(X.iloc[:, 1:3])

# Write the imputed values back over the same two columns
X.iloc[:, 1:3] = imputer.transform(X.iloc[:, 1:3])

X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [8]:
# Label Encoding: map each country name in the first column to an integer code
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

# Assigning through X[<column>] replaces the whole column, so it takes the
# integer dtype of the codes. Writing with X.iloc[:, 0] = ... instead sets the
# values into the existing text column, leaving the codes in a non-numeric column.
X[X.columns[0]] = label.fit_transform(X.iloc[:, 0])

X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


In [9]:
# One-hot encode the country codes held in column 0
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# remainder='passthrough' keeps the remaining columns as they are,
# appended after the one-hot columns
ct = ColumnTransformer(
    transformers=[('Country', OneHotEncoder(), [0])],
    remainder='passthrough',
)

X = ct.fit_transform(X)
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

In [10]:
# Avoiding the dummy variable trap: drop the first one-hot column (column 0),
# since it is implied by the remaining dummy columns.
# NOTE: this cell is not idempotent -- X is overwritten with a narrower array,
# so running the cell again removes one more column each time.
X = X[:, 1:]
X

array([[0.00000000e+00, 0.00000000e+00, 4.40000000e+01, 7.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 2.70000000e+01, 4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 3.00000000e+01, 5.40000000e+04],
       [0.00000000e+00, 1.00000000e+00, 3.80000000e+01, 6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 4.00000000e+01, 6.37777778e+04],
       [0.00000000e+00, 0.00000000e+00, 3.50000000e+01, 5.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 3.87777778e+01, 5.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 4.80000000e+01, 7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 5.00000000e+01, 8.30000000e+04],
       [0.00000000e+00, 0.00000000e+00, 3.70000000e+01, 6.70000000e+04]])

In [11]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [12]:
# Feature scaling
from sklearn.preprocessing import StandardScaler
st = StandardScaler()

# The right-hand side is evaluated left to right: the scaler is fitted on the
# training rows first, then the already-fitted scaler is applied to the test rows.
X_train, X_test = (
    st.fit_transform(X_train),
    st.transform(X_test),
)

X_train

array([[-0.57735027, -0.57735027, -0.80667524, -0.72132045],
       [-0.57735027, -0.57735027,  0.61764507,  0.58171004],
       [-0.57735027, -0.57735027,  1.25067632,  1.23322529],
       [ 1.73205081, -0.57735027, -1.5979643 , -1.09361488],
       [-0.57735027,  1.73205081, -0.3319018 , -0.44209963],
       [-0.57735027,  1.73205081, -0.20881239, -1.27976209],
       [-0.57735027, -0.57735027, -0.49015961,  0.11634201],
       [ 1.73205081, -0.57735027,  1.56719195,  1.60551972]])