In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Load the raw CSV into a DataFrame. The path is absolute, so adjust it if the file lives elsewhere.
dataset = pd.read_csv('/content/Data.csv')

In [5]:
# Feature matrix: every column except the last, returned as a 2-D NumPy array.
X = dataset.iloc[:, :-1].to_numpy()
# Target vector: only the last column, returned as a 1-D NumPy array.
# to_numpy() always yields an ndarray, whereas .values may hand back a pandas
# extension array depending on the column dtype.
y = dataset.iloc[:, -1].to_numpy()

In [6]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [14]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [7]:
from sklearn.impute import SimpleImputer

# Replace each NaN with the mean of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 are imputed (column 0 holds the country strings).
# fit_transform learns the column means and fills the missing entries in one call.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [8]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [9]:
print(np.nan)

nan


**ENCODING CATEGORICAL DATA**

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The country column (index 0) has more than two categories, so it is one-hot encoded.
# ColumnTransformer(transformers=[(name, transformer, columns)], remainder=...):
#   remainder='passthrough' keeps the columns that receive no transformation.
#   sparse_threshold=0 forces a dense result; without it a sufficiently sparse
#   output would come back as a SciPy sparse matrix, which np.array() cannot
#   turn into a regular 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Fit the transformer on the feature matrix X and replace X with the encoded
# result; np.array() guarantees a plain NumPy array for the later steps.
X = np.array(ct.fit_transform(X))

In [11]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


**Encoding for our single vector dependent variable**

In [16]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a single 1-D vector, which is exactly what y is.
le = LabelEncoder()
le.fit(y)            # learn the distinct class labels
y = le.transform(y)  # map each label to its integer code; no np.array() wrapper is needed here

In [17]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting the data set into training set and test set

Feature scaling must only be applied after the split.
Why? The test set is supposed to be a brand-new set of observations.
We are not supposed to train our model on the test set, so the scaler must not see it when it is fitted.

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the observations for the test set.
# A fixed random_state makes the shuffle, and therefore the resulting split,
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [19]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [20]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [22]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [21]:
print(y_test)

[0 1]


# FEATURE SCALING
Puts all our features on the same scale, so that no feature dominates the others.

#normalisation
recommended when most features follow a normal distribution

#standardization 
works well all of the time

**We do not need to apply feature scaling to dummy variables**


We do not use the fit method on the test set because we do not want to compute a new scaler for it.
The test set is used for testing, not for training.
The test set needs to be scaled with the same scaler that was fitted on the training set.
We need to apply the scaler the model was trained with, not an entirely new one.

In [23]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Only the columns from index 3 onward are standardized; columns 0-2 are the
# one-hot dummy variables and are left untouched.
numeric_cols = slice(3, None)

# Training rows: learn the mean and standard deviation, then standardize.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])

# Test rows: reuse the statistics learned from the training rows (no refit).
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

In [24]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [25]:
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
