## Data Preprocessing

In [9]:
# Importing the libraries

import numpy as np #library for mathematics
import pandas as pd #import and manage datasets 
import matplotlib.pyplot as plt #visualization
from sklearn import impute # for missing data

'''
LabelEncoder, # transform categorical variables
OneHotEncoder # create dummy variables
StandardScaler # for feature scaling
'''
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from sklearn.compose import ColumnTransformer # apply a transformer to selected columns only

from sklearn.model_selection import train_test_split # Splitting data into train and test sets



In [2]:
# Importing the dataset
# Read Data.csv into a pandas DataFrame. The path is relative, so the file
# must be in the notebook's working directory.
dataset = pd.read_csv('Data.csv')

In [3]:
# DataFrame.iloc[rows, columns] selects by integer position.
#   X -- every row, every column except the last one (`:-1`)
#   y -- every row, only the last column (`-1`)
# `.values` turns each selection into a NumPy array.
X, y = (
    dataset.iloc[:, :-1].values, # independent variable matrix
    dataset.iloc[:, -1].values,  # dependent variable vector
)

In [4]:
# Missing data
# Replace every NaN in columns 1 and 2 (the slice 1:3 stops before index 3)
# with the mean of the values present in the same column.
imputer = impute.SimpleImputer(strategy = 'mean',
                               missing_values = np.nan)

# fit_transform learns the column means and fills the gaps in one call;
# `imputer` stays fitted afterwards, so it can be reused on new data.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [5]:
# Show X after imputation: the gaps in columns 1 and 2 now hold column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
# Encoding categorical data

# Fitted only so the country-name <-> integer-code mapping stays available
# under its original name. It is no longer applied to X, because
# OneHotEncoder accepts string categories directly.
labelEncoderX = LabelEncoder()
labelEncoderX.fit(X[:, 0])

# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22, so the column to encode is selected
# with a ColumnTransformer instead:
#   - column 0 is one-hot encoded (categories in sorted order),
#   - remainder = 'passthrough' keeps the other columns unchanged, placed
#     after the dummy columns,
#   - sparse_threshold = 0 makes the result a dense array, never sparse.
oneHotEncoder = ColumnTransformer(
    transformers = [('country', OneHotEncoder(), [0])],
    remainder = 'passthrough',
    sparse_threshold = 0)

# X is an object array (strings mixed with numbers), so the stacked result is
# object-typed too; cast it to float to get a purely numeric feature matrix.
X = np.asarray(oneHotEncoder.fit_transform(X), dtype = float)

# The target is a single label column, which is what LabelEncoder is meant for.
labelEncoderY = LabelEncoder()
y = labelEncoderY.fit_transform(y)


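**Note:** `OneHotEncoder` can encode string categories directly. If you used a `LabelEncoder` beforehand only to convert the categories to integers, that extra step is no longer needed — you can pass the original column straight to the `OneHotEncoder`.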


In [7]:
# Show X after encoding: the dummy columns come first, then the numeric columns.
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [8]:
# Splitting the dataset into Training and Test set

# test_size = 0.20 holds out 20% of the rows for testing.
# A fixed random_state makes the shuffle reproducible, so every run yields
# the same split (similar to setting a seed).
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size = 0.20,
    random_state = 0,
)

In [10]:
# Feature Scaling
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both sets so the test rows do not influence the fit.
# Note: the scaled arrays (XTrain / XTest) differ from the unscaled ones
# (xTrain / xTest) only by the case of the first letter.
scX = StandardScaler()
scX.fit(xTrain)
XTrain = scX.transform(xTrain)
XTest = scX.transform(xTest)