In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('/content/data.csv')
df

Unnamed: 0,country,age,salary,purchased
0,france,40,72000,No
1,spain,27,48000,Yes
2,germany,30,54000,No
3,spain,38,61000,No
4,germany,40,Nan,Yes
5,france,35,58000,Yes
6,spain,Nan,52000,No
7,france,48,79000,Yes
8,germany,50,83000,No
9,france,37,67000,Yes


In [5]:
X = df[['country','age ','salary']].values

In [6]:
X# after typing .values we will get array

array([['france', '40', '72000'],
       ['spain', '27', '48000'],
       ['germany', '30', '54000'],
       ['spain', '38', '61000'],
       ['germany', '40', 'Nan'],
       ['france', '35', '58000'],
       ['spain', 'Nan', '52000'],
       ['france', '48', '79000'],
       ['germany', '50', '83000'],
       ['france', '37', '67000']], dtype=object)

In [7]:
y = df[['purchased']].values

In [8]:
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [9]:
from sklearn.impute import SimpleImputer 

In [10]:
imputer = SimpleImputer(missing_values=np.nan,strategy='mean') #we're keeping np.nan bcoz our data in array form

In [11]:
imputer

SimpleImputer()

In [12]:
imputer = imputer.fit(X[:,1:3]) # x[:,1:3] means we're taking all rows for this (:) and we have null values in the columns of age and salary so if we take (1:3) we will include columns of country , age, salary

In [13]:
X[:,1:3] = imputer.transform(X[:,1:3]) #this transform will help you replace the missing values with mean values 

In [14]:
X[:,1:3]

array([[40.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, 63777.77777777778],
       [35.0, 58000.0],
       [38.333333333333336, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [15]:
#encoding categorical data

from sklearn.preprocessing import LabelEncoder#this labelencoder will helps to convert non-numerical labels to numeric labels

label_encoder_X=LabelEncoder()

In [16]:
X[:,0] = label_encoder_X.fit_transform(X[:,0]) # we have to use fit_transform bcoz The fit_transform() method does both 
#fits and transform we have to prefer fit_transform in label_encoder it's best 
# x[:,0] means we're taking all rows and secting the column 'country' = 0th column

In [17]:
X[:,0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

In [18]:
X #now if you see clearly france is 0,spain is 2, germany is 1 

array([[0, 40.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.333333333333336, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [19]:
#after converting this our ml sometimes confuse bcoz france is 0,spain is 2, germany is 1 so it may think spain is greater than 
#france bcoz 0 and 1 so to solve this we use dummy encoding.
#If it was decision tree it will not confuse and we don't need to change this into dummy encoding
from sklearn.preprocessing import OneHotEncoder

onehotencode_X = OneHotEncoder()

In [20]:
onehotencode_X.fit_transform(df.country.values.reshape(-1,1)).toarray()

#        france   germany  spain
# france     1        0       0
#                                    ----->>  This is dummy encoding 
# spain      0        0       1
                                
# germany    0        1       0

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [21]:
label_encoder_y = LabelEncoder()

In [22]:
y = label_encoder_y.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [23]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [24]:
#split the data into training and testing data

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y,test_size = 0.2,random_state = 0)

In [25]:
X_train

array([[1, 40.0, 63777.77777777778],
       [0, 37.0, 67000.0],
       [2, 27.0, 48000.0],
       [2, 38.333333333333336, 52000.0],
       [0, 48.0, 79000.0],
       [2, 38.0, 61000.0],
       [0, 40.0, 72000.0],
       [0, 35.0, 58000.0]], dtype=object)

In [26]:
X_test

array([[1, 30.0, 54000.0],
       [1, 50.0, 83000.0]], dtype=object)

In [27]:
y_train

array([1, 1, 1, 0, 1, 0, 0, 1])

In [28]:
y_test

array([0, 0])

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
sc_x = StandardScaler()

In [31]:
X_train = sc_x.fit_transform(X_train)

In [32]:
X_test = sc_x.transform(X_test)

In [33]:
X_train #we just get the scaled value for X_train in feature scaling or standard scaler

array([[ 0.13483997,  0.3811135 ,  0.12381479],
       [-0.94387981, -0.16768994,  0.46175632],
       [ 1.21355975, -1.99703475, -1.53093341],
       [ 1.21355975,  0.0762227 , -1.11141978],
       [-0.94387981,  1.84458935,  1.7202972 ],
       [ 1.21355975,  0.01524454, -0.16751412],
       [-0.94387981,  0.3811135 ,  0.98614835],
       [-0.94387981, -0.5335589 , -0.48214934]])

In [34]:
X_test

array([[ 0.13483997, -1.44823131, -0.90166297],
       [ 0.13483997,  2.21045831,  2.13981082]])