In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the raw dataset from disk and preview its first five rows.
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


### Separate the dependent and independent features

In [4]:
# Split the frame column-wise and keep all rows in both parts.
feature_columns = df.iloc[:, :-1]  # every column except the last one
target_column = df.iloc[:, -1]     # the last column only

# Work with plain NumPy arrays from here on.
X = feature_columns.values
y = target_column.values

In [5]:
# Inspect the feature matrix (object dtype: it mixes the Country strings with numeric columns).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Inspect the target vector taken from the last column ('Yes' / 'No' labels).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Handling missing data

##### If only about 1% of the data is missing, we can simply delete the affected rows

#### If more data is missing, replace each NaN value with the mean of its column; this is the classic way of handling missing data

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Replace every NaN in the numeric columns with the mean of its column.
# The numeric features sit at indexes 1 and 2, hence the slice 1:3 (the upper
# bound is exclusive); column 0 holds the Country strings and is left untouched.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# performed later in this notebook, so the means also reflect future test rows.
numeric_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Encoding Categorical Data

In [9]:
## Encoding independent variables

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country). Each transformer entry is a tuple of
# (name, transformer, column indexes the transformer is applied to).
# remainder='passthrough' carries the remaining columns through unchanged;
# with the default remainder='drop' they would be removed from the output.
# sparse_threshold=0 forces a dense result, so the np.array(...) wrapper below
# can never be handed a SciPy sparse matrix (which it would wrap in a 0-d
# object array instead of converting).
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# np.array guarantees a plain ndarray for the steps that follow.
X = np.array(ct.fit_transform(X))
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [11]:
## Encoding dependent variables

In [12]:
from sklearn.preprocessing import LabelEncoder

# Turn the target's class labels into integer codes.
# With this data 'No' is encoded as 0 and 'Yes' as 1.
le = LabelEncoder()
le.fit(y)            # learn the set of distinct labels
y = le.transform(y)  # replace each label with its integer code
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the data into training and test sets

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state pins the
# shuffle, so the same rows land in each set on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
# Inspect the training labels produced by the split (test_size=0.2 leaves 80% of the rows here).
y_train

array([0, 0, 0, 1, 0, 1, 1, 0])

### Feature Scaling

In [20]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Scale only the numeric columns (index 3 onward); columns 0-2 are the one-hot
# Country indicators and stay as 0/1.
# Standardization, applied per column: x' = (x - mean) / std
# fit_transform learns each column's mean and standard deviation from the
# training rows and then applies that formula to them.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
# The test rows are transformed with the mean and standard deviation learned
# from the training rows; the scaler is deliberately not fitted again here.
X_test[:, 3:] = sc.transform(X_test[:, 3:])


In [21]:
# Inspect the training features: columns 0-2 are the one-hot Country indicators, columns 3-4 the standardized numeric features.
X_train

array([[0.0, 1.0, 0.0, 1.5671919463844557, 1.6055197155811116],
       [0.0, 0.0, 1.0, -0.3319018007069463, -0.4420996318266829],
       [1.0, 0.0, 0.0, 0.6176450728387547, 0.5817100418772143],
       [1.0, 0.0, 0.0, -0.8066752374797969, -0.7213204519277457],
       [0.0, 1.0, 0.0, -1.597964298767881, -1.093614878729163],
       [1.0, 0.0, 0.0, 1.250676321869222, 1.2332252887796944],
       [1.0, 0.0, 0.0, -0.4901596129645631, 0.11634200837544287],
       [0.0, 0.0, 1.0, -0.20881239117324418, -1.2797620921298716]],
      dtype=object)

In [22]:
# Inspect the test features, scaled with the statistics fitted on the training rows.
X_test

array([[0.0, 0.0, 1.0, -2.0727377355407315, -1.6520565189312888],
       [0.0, 1.0, 0.0, -0.015386176191712623, -0.18356183543680957]],
      dtype=object)