In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [36]:
data=pd.read_csv('../Data/Data.csv')

In [37]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [38]:
data.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

# Split dependent and Independent Variables 
Normally Dependent variables are in the last column of the table

In [40]:
x=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

In [41]:
print(x)
print(y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Fill Missing Values
## we use imputer from sklearn
We use (from sklearn.imputer import SimpleImputer) library.  
Parameters are missing_values=np.nan, strategy=mean/median.   
We use fit method to fit the updated values in that column and use transform method to apply changes

In [42]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan, strategy='mean') #to fill all missing values by mean of that column
imputer.fit(data.iloc[:,1:3])
data.iloc[:,1:3]=imputer.transform(data.iloc[:,1:3])

imputer.fit(x[:,1:3])
x[:,1:3]=imputer.transform(x[:,1:3])


In [8]:
print(data)

   Country        Age        Salary Purchased
0   France  44.000000  72000.000000        No
1    Spain  27.000000  48000.000000       Yes
2  Germany  30.000000  54000.000000        No
3    Spain  38.000000  61000.000000        No
4  Germany  40.000000  63777.777778       Yes
5   France  35.000000  58000.000000       Yes
6    Spain  38.777778  52000.000000        No
7   France  48.000000  79000.000000       Yes
8  Germany  50.000000  83000.000000        No
9   France  37.000000  67000.000000       Yes


In [43]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding categorical data

In [44]:
pd.get_dummies(data) # One method for One-Hot Encoding

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,1,0,0,1,0
1,27.0,48000.0,0,0,1,0,1
2,30.0,54000.0,0,1,0,1,0
3,38.0,61000.0,0,0,1,1,0
4,40.0,63777.777778,0,1,0,0,1
5,35.0,58000.0,1,0,0,0,1
6,38.777778,52000.0,0,0,1,1,0
7,48.0,79000.0,1,0,0,0,1
8,50.0,83000.0,0,1,0,1,0
9,37.0,67000.0,1,0,0,0,1


## the scikit-learn Python machine learning library provides the ColumnTransformer that allows you to selectively apply data transforms to different columns in your dataset.
## Column Transformer is a sciket-learn class used to create and apply separate transformers for numerical and categorical data. 

ColumnTransformer:
parameters:


i) transformer[kind of transformation(eg:encoding),Type pf Encoding, index of Columns to be encoded].                                   
ii) remainder=(passthrough(codename)=> to keep the columns that wont be applied some transformation(onehotencoded)).

We use OnehotEncoder for Categorical Variable(Which has categories)
## A dummy variable is a binary variable that indicates whether a separate categorical variable takes on a specific value.


We do OnehotEncoder when the data is not in binary format

In [45]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
z = np.array(ct.fit_transform(data))
x=np.array(ct.fit_transform(x))

In [46]:
print(z)

[[1.0 0.0 0.0 44.0 72000.0 'No']
 [0.0 0.0 1.0 27.0 48000.0 'Yes']
 [0.0 1.0 0.0 30.0 54000.0 'No']
 [0.0 0.0 1.0 38.0 61000.0 'No']
 [0.0 1.0 0.0 40.0 63777.77777777778 'Yes']
 [1.0 0.0 0.0 35.0 58000.0 'Yes']
 [0.0 0.0 1.0 38.77777777777778 52000.0 'No']
 [1.0 0.0 0.0 48.0 79000.0 'Yes']
 [0.0 1.0 0.0 50.0 83000.0 'No']
 [1.0 0.0 0.0 37.0 67000.0 'Yes']]


In [47]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [48]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,-1])], remainder='passthrough')# Multiple columns
f=np.array(ct.fit_transform(data),dtype=np.str)
f

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  f=np.array(ct.fit_transform(data),dtype=np.str)


array([['1.0', '0.0', '0.0', '1.0', '0.0', '44.0', '72000.0'],
       ['0.0', '0.0', '1.0', '0.0', '1.0', '27.0', '48000.0'],
       ['0.0', '1.0', '0.0', '1.0', '0.0', '30.0', '54000.0'],
       ['0.0', '0.0', '1.0', '1.0', '0.0', '38.0', '61000.0'],
       ['0.0', '1.0', '0.0', '0.0', '1.0', '40.0', '63777.77777777778'],
       ['1.0', '0.0', '0.0', '0.0', '1.0', '35.0', '58000.0'],
       ['0.0', '0.0', '1.0', '1.0', '0.0', '38.77777777777778',
        '52000.0'],
       ['1.0', '0.0', '0.0', '0.0', '1.0', '48.0', '79000.0'],
       ['0.0', '1.0', '0.0', '1.0', '0.0', '50.0', '83000.0'],
       ['1.0', '0.0', '0.0', '0.0', '1.0', '37.0', '67000.0']],
      dtype='<U32')

## If the data is in the form yes/no(0 or 1 format )then we use LabelEncoder
## To convert Strings to 0 and 1 we use LabelEncoder

# The dependent variable is the variable that is being measured or tested in an experiment.

 For example, in a study looking at how tutoring impacts test scores, the dependent variable would be the participants' test scores, since that is what is being measured.

In [49]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y=le.fit_transform(y)

In [50]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Test Train Split

## Training set is used to train the ML Model on existing observations and Test set is used to evaluate the perfoemance of the models on new observations

## Feature Scaling: Scaling all variables(features) to make sure they all take values in same scale to prevent one feature to be dominated which will be neglected by Ml Model(technique to get mean and SD of features)

feature scaling should not be applied before test train split to prevent information leakage on test set

X_Train:MAtrix of features of Training set.(One Hot encoded).                                        
X_Test:Matrix of Features of Test set.                            
Y_train:Dependent Variable of Train Set.                             
Y_Test:Dependent Variable of Tet set.                                  

## train_test_split():
parameters=arrays,test_size,random_state=1(same random factors)


In [51]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2,random_state=1)

In [52]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [53]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [54]:
print(Y_train)

[0 1 0 0 1 1 0 1]


In [55]:
print(Y_test)

[0 1]


## Practice
for test train split firs tsplit the dataset into non categorical and categorical variables

In [56]:
g=data.drop(columns='Purchased',axis=1)
h=data['Purchased']

In [57]:
print(g)
print(h)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000
0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


In [58]:
h=le.fit_transform(h)

In [59]:
print(h)

[0 1 0 0 1 1 0 1 0 1]


In [60]:
g_train,g_test,h_train,h_test=train_test_split(g,h,test_size=0.2,random_state=1)

In [61]:
print(g_train)

   Country        Age        Salary
6    Spain  38.777778  52000.000000
4  Germany  40.000000  63777.777778
0   France  44.000000  72000.000000
3    Spain  38.000000  61000.000000
1    Spain  27.000000  48000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
5   France  35.000000  58000.000000


In [62]:
print(g_test)

   Country   Age   Salary
2  Germany  30.0  54000.0
9   France  37.0  67000.0


In [63]:
print(h_train)

[0 1 0 0 1 1 0 1]


In [64]:
print(h_test)

[0 1]


# Feature Scaling

standardisation= x-mean(x)/standard deviation(x).(all the time).                                                          
normalisation=x-min(x)/max(x)-min(x)(features following normal distribution)

We use feature scaling on columns other than dummy variables or One hot encoded columns

In [65]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train[:,3:]=sc.fit_transform(X_train[:,3:])
X_test[:,3:]=sc.fit_transform(X_test[:,3:])

In [66]:
print(X_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]


In [67]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]
