#import libraries



In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd               #best library for loading data

#import data

In [4]:
# Read the raw CSV into a DataFrame; the path is kept in one place so it is easy to change.
DATA_PATH = '/content/Data.csv'
dataset = pd.read_csv(DATA_PATH)

In [5]:
# Render the loaded DataFrame so its columns and the missing Age/Salary entries can be inspected.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
# Split the frame into a feature matrix X and a target vector y.
# `.iloc` selects by integer position; `.values` turns the selection into a NumPy array.
# Both selections are expressed relative to the last column so they stay consistent
# with each other (the original mixed `:-1` with a hard-coded index 3).
X = dataset.iloc[:, :-1].values  # every column except the last -> independent variables (Country, Age, Salary)
y = dataset.iloc[:, -1].values   # last column -> dependent variable (Purchased)

In [7]:
# Confirm the dimensions of the feature matrix, then show it as the cell output.
print(X.shape)
X

(10, 3)


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
# Show the target vector (the 'Yes'/'No' Purchased labels).
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

#Handling Missing Data

1. Can delete rows with missing data( can have large loss of data)
2. Replace missing data with mean of column ( we will do this)

The `SimpleImputer` class provides basic strategies for imputing missing values.
Missing values can be imputed with a provided constant value, or using the statistics 
(mean, median or most frequent) of each column in which the missing values are located.
This class also allows for different missing values encodings.
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [9]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of the column it sits in.
# 'mean' is already SimpleImputer's default strategy; it is spelled out for readability.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [10]:
# Columns 1 and 2 of X (Age, Salary) are the ones containing NaN; slice(1, 3) selects them.
numeric_cols = slice(1, 3)

# Learn the column means, then write the imputed values back into X.
imputer.fit(X[:, numeric_cols])
X[:, numeric_cols] = imputer.transform(X[:, numeric_cols])

In [11]:
# Inspect X after imputation: the former NaN cells now hold their column means.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Categorical data

Since models can only work with numbers, we have to convert strings to numbers, e.g. yes=1, no=0, or india=1, america=2, germany=3, ...

LabelEncoder encodes labels with a value between 0 and n_classes-1, where n_classes is the number of distinct labels. If a label repeats, it is assigned the same value it was given earlier.


[LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

But if we encode france=0, germany=1, usa=2, our model may assume that usa is better than germany and germany is better than france. We don't want such an ordering, so we use **one-hot encoding** and create dummy variables.

eg 3 categories:

france 1 0 0

germany 0 1 0

usa 0 0 1


**One hot encoding**
This is where the integer encoded variable is removed and a new binary variable is added for each unique integer value. In the country example above, there are 3 categories and therefore 3 binary variables are needed.


In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Turn the country names in column 0 (all rows) into integer codes and store them back in X.
labelencoder_X = LabelEncoder()
country_codes = labelencoder_X.fit_transform(X[:, 0])
X[:, 0] = country_codes

print(X[:, 0])


[0 2 1 2 1 0 2 0 1 0]


In [13]:
# Inspect X after label encoding: column 0 now holds integer country codes.
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [14]:
# One-hot encoding with scikit-learn. OneHotEncoder encodes every column it is given,
# which is convenient when the categorical columns are kept separately from the rest.
# When a column has thousands of distinct labels the encoded matrix becomes very large;
# in that situation keep only the most frequent labels.
country_column = X[:, 0].reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) input

enc = OneHotEncoder()
enc.fit(country_column)                           # learn the set of categories
p = enc.transform(country_column).toarray()       # sparse result -> dense array
p

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [15]:
# X itself is not modified by the encoder above; the one-hot matrix lives only in p.
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [17]:
# One-hot encoding with pandas. Here it is applied to every value of X (flattened to 1-D)
# to show how many dummy columns appear when nothing is restricted to the categorical column.
flattened = X.reshape(-1)
pd.get_dummies(flattened)

Unnamed: 0,0.000000,1.000000,2.000000,27.000000,30.000000,35.000000,37.000000,38.000000,38.777778,40.000000,44.000000,48.000000,50.000000,48000.000000,52000.000000,54000.000000,58000.000000,61000.000000,63777.777778,67000.000000,72000.000000,79000.000000,83000.000000
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


As is visible above, the data size increases drastically if encoding is applied to all columns. [Lecture](https://www.youtube.com/watch?v=6WDFfaYtN6s&t=633s).

So we will apply on only first column

In [18]:
# Collect the distinct integer codes present in the first column of X.
label_table = list({code for code in X[:, 0]})
print(label_table)

[0, 1, 2]


In [39]:
# Show X once more for reference; at this point it still has 3 columns (code, Age, Salary).
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [41]:
# Replace the first column of X with its one-hot (dummy) columns.
# dtype=int is pinned explicitly: since pandas 2.0 get_dummies defaults to bool,
# which would put True/False into the matrix instead of the 1/0 values used here.
# X[:, 0] is already 1-D, so no reshape is needed before encoding.
p = pd.get_dummies(X[:, 0], dtype=int)

# Dummy columns first, then the remaining numeric columns (Age, Salary).
X_2 = np.concatenate([p, X[:, 1:]], axis=1)
X_2

array([[1, 0, 0, 44.0, 72000.0],
       [0, 0, 1, 27.0, 48000.0],
       [0, 1, 0, 30.0, 54000.0],
       [0, 0, 1, 38.0, 61000.0],
       [0, 1, 0, 40.0, 63777.77777777778],
       [1, 0, 0, 35.0, 58000.0],
       [0, 0, 1, 38.77777777777778, 52000.0],
       [1, 0, 0, 48.0, 79000.0],
       [0, 1, 0, 50.0, 83000.0],
       [1, 0, 0, 37.0, 67000.0]], dtype=object)

In [42]:
# Rebind X to the one-hot-encoded matrix (3 dummy columns + Age + Salary).
# NOTE: from here on X has 5 columns, so the earlier cells that index X[:, 0] or X[:, 1:3]
# must not be re-run without first re-creating X from `dataset`.
X=X_2
X

array([[1, 0, 0, 44.0, 72000.0],
       [0, 0, 1, 27.0, 48000.0],
       [0, 1, 0, 30.0, 54000.0],
       [0, 0, 1, 38.0, 61000.0],
       [0, 1, 0, 40.0, 63777.77777777778],
       [1, 0, 0, 35.0, 58000.0],
       [0, 0, 1, 38.77777777777778, 52000.0],
       [1, 0, 0, 48.0, 79000.0],
       [0, 1, 0, 50.0, 83000.0],
       [1, 0, 0, 37.0, 67000.0]], dtype=object)

In [44]:
# y holds only two labels ('Yes' / 'No'), so a plain LabelEncoder is enough here --
# no one-hot encoding is required for a binary target.
print(y)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit(y).transform(y)
print(y)


['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']
[0 1 0 0 1 1 0 1 0 1]


#Splitting dataset into training set and test set

The test set is data on which the model is not trained. The model should perform well on both the training and the test set; that means the model has understood the correlations correctly and can also perform on unseen data.

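A typical split is data = 60% training, 20% validation, 20% test. This notebook has only 10 rows, so below we simply use the first 8 rows for training and the last 2 rows for testing.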

In [53]:
# Manual split: first 8 rows for training, remaining rows for testing.
# Basic slicing returns VIEWS of X, so any later in-place change to X_train / X_test
# (such as writing scaled values back) would also overwrite X. Copy the slices so X
# stays intact and this cell can be re-run to obtain the unscaled split again.
n_train = 8
X_train = X[:n_train, :].copy()
X_test = X[n_train:, :].copy()
print(X_train.shape, X_test.shape)

(8, 5) (2, 5)


In [54]:
# Split the target the same way as the features: first 8 labels train, the rest test.
y_train, y_test = y[:8], y[8:]
print(y_train.shape, y_test.shape)

(8,) (2,)


#Feature Scaling

In our dataset age ranges over (27, 50) and salary ranges over (48000, 83000). If a model treats age as x and salary as y and trains on the basis of distance, the answer will be dominated by salary.

So we rescale all features to the same range,
e.g. scale pixel values (0-255) down to 0-1 by dividing by 255.0.


Data standardization is the process of rescaling the attributes so that they have mean as 0 and variance as 1.

[Know More](https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe)

In [56]:
#scale down salary and age
#here even dummy variables has been scaled down
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train_1=sc_X.fit_transform(X_train)
X_test_1=sc_X.fit_transform(X_test)
print(X_train_1)

[[ 1.29099445 -0.57735027 -0.77459667  0.9919531   1.13089809]
 [-0.77459667 -0.57735027  1.29099445 -1.64177922 -1.33030078]
 [-0.77459667  1.73205081 -0.77459667 -1.17700292 -0.71500106]
 [-0.77459667 -0.57735027  1.29099445  0.06240052  0.00284861]
 [-0.77459667  1.73205081 -0.77459667  0.37225138  0.28770959]
 [ 1.29099445 -0.57735027 -0.77459667 -0.40237577 -0.30480125]
 [-0.77459667 -0.57735027  1.29099445  0.18289808 -0.92010097]
 [ 1.29099445 -0.57735027 -0.77459667  1.61165483  1.84874777]]


In [57]:
# Sometimes we do not want to scale the dummy variables. In that case standardize only
# the numeric columns (index 3 onward: Age, Salary) and write them back in place.
# sc_X is re-fitted here on the two numeric TRAINING columns; the test rows are then
# transformed with those training statistics rather than being fitted separately.
X_train[:, 3:] = sc_X.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc_X.transform(X_test[:, 3:])
print(X_train)

[[1 0 0 0.99195310407169 1.1308980938355184]
 [0 0 1 -1.6417792156327535 -1.3303007804060123]
 [0 1 0 -1.1770029239202047 -0.7150010618456295]
 [0 0 1 0.06240052064659233 0.0028486098081502514]
 [0 1 0 0.3722513817882916 0.28770959062314255]
 [1 0 0 -0.4023757710659565 -0.30480124947204107]
 [0 0 1 0.18289807775725328 -0.9201009680324238]
 [1 0 0 1.6116548263550885 1.8487477654892983]]


In [None]:
# Since this is a classification problem there is no need to scale y;
# in a regression problem, however, we may also need to scale y.