In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pylot

In [2]:
# Load the raw dataset from data.csv (relative to the notebook's working directory)
# and display it. The rendered table shows missing entries in both Age and Salary.
data = pd.read_csv("data.csv")
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Boolean mask: True on every row whose Age is missing (isna is an alias of isnull).
data["Age"].isna()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
9    False
Name: Age, dtype: bool

In [4]:
# Feature matrix: every column except the last one, exported as a NumPy array.
# The columns mix strings and floats, so the resulting array has dtype=object.
x = data.iloc[:, :-1].to_numpy()
x


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [5]:
# Target vector: the last column of the frame.
# Series.to_numpy() always returns a NumPy ndarray, whereas Series.values may return a
# pandas ExtensionArray when the column is stored with an extension (e.g. string) dtype.
y = data.iloc[:, -1].to_numpy()
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [6]:
# Alternative (disabled): fill NaN values in all numeric columns with their
# respective column means in a single call.
# data = data.fillna(data.mean(numeric_only=True))
# data


In [7]:
# Mean Age over the rows where Age is present (missing values are skipped, the default).
data["Age"].mean(skipna=True)


np.float64(38.77777777777778)

In [8]:
# Preview only: returns a new Series with missing ages replaced by the column mean.
# The result is not assigned, so `data` itself is left unchanged.
data["Age"].pipe(lambda age: age.fillna(age.mean()))


0    44.000000
1    27.000000
2    30.000000
3    38.000000
4    40.000000
5    35.000000
6    38.777778
7    48.000000
8    50.000000
9    37.000000
Name: Age, dtype: float64

**Numeric data**

**Missing Values**

In [9]:
from sklearn.impute import SimpleImputer

# Replace NaNs in the numeric columns of x (everything after column 0, the country
# string) with the per-column mean; the arguments below are SimpleImputer's defaults.
# NOTE(review): the imputer is fitted on all rows, before the train/test split further
# down, so the held-out rows contribute to the imputed means. For a leak-free
# evaluation, fit the imputer on the training rows only.
im = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, 1:] = im.fit(x[:, 1:]).transform(x[:, 1:])

In [10]:
# Inspect the feature matrix after imputation: the former NaN entries now hold column means.
x


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

**Categorical (string) data: one-hot encoding**

In [11]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (the country string) and pass the remaining columns through
# unchanged; the encoded indicator columns are placed first in the output.
# NOTE: this cell overwrites x, so re-run the cells above before re-running it,
# otherwise the already-encoded array would be encoded a second time.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)
x = ct.fit_transform(x)
x
     

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

**Label encoding: binary target (No → 0, Yes → 1)**

In [12]:

from sklearn.preprocessing import LabelEncoder

# Map the string labels in y to integer codes (classes are sorted, so 'No' -> 0, 'Yes' -> 1).
# The fitted encoder is kept in `le` so codes can be mapped back with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

**Train Test Split**

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the shuffle
# (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Inspect the training features produced by the split.
x_train

array([[1.0, 0.0, 0.0, 35.0, 58000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0]], dtype=object)

**Feature Scaling**

*Standardization*

In [15]:

from sklearn.preprocessing import StandardScaler

# Standardize only the numeric columns (index 3 onward); columns 0-2 are the one-hot
# country indicators and are left out of the scaled arrays.
# The scaler learns mean/std from the training rows only and reuses them for the test rows.
sc = StandardScaler().fit(x_train[:, 3:])
z_train = sc.transform(x_train[:, 3:])
z_test = sc.transform(x_test[:, 3:])

In [16]:
# Standardized test rows, scaled with the statistics learned from the training rows.
z_test

array([[ 2.18271808,  2.30089209],
       [-2.3186283 , -1.79680973]])

In [17]:
# Standardized training rows (numeric columns only).
z_train

array([[-0.7529426 , -0.62603778],
       [ 1.00845381,  1.01304295],
       [ 1.79129666,  1.83258331],
       [-1.73149616, -1.09434656],
       [-0.36152118,  0.42765698],
       [ 0.22561096,  0.05040824],
       [-0.16581046, -0.27480619],
       [-0.01359102, -1.32850095]])

*Normalization (min-max scaling)*

In [19]:
from sklearn.preprocessing import MinMaxScaler

# Rescale only the numeric columns (index 3 onward) using the per-column min and max
# learned from the training rows; the same fitted scaler is then applied to the test rows.
nm = MinMaxScaler().fit(x_train[:, 3:])
v_train = nm.transform(x_train[:, 3:])
v_test = nm.transform(x_test[:, 3:])

In [21]:
# Min-max scaled training rows: each column spans exactly [0, 1].
v_train

array([[0.27777778, 0.22222222],
       [0.77777778, 0.74074074],
       [1.        , 1.        ],
       [0.        , 0.07407407],
       [0.38888889, 0.55555556],
       [0.55555556, 0.43621399],
       [0.44444444, 0.33333333],
       [0.48765432, 0.        ]])

In [20]:
# Min-max scaled test rows. Values can fall outside [0, 1] because the min and max
# were learned from the training rows only.
v_test

array([[ 1.11111111,  1.14814815],
       [-0.16666667, -0.14814815]])