# Import the libraries
* Numpy : Work with arrays
* Matplotlib : used for plotting
* Pandas : import dataset and create features matrix


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset

In [2]:
data_project = pd.read_csv('Data.csv')

In Python you need to create 2 items for your predictive model:

* Matrix of Features
* Dependent Variable Vector

# Indexing in Python

* Use iloc - which is from the pandas library
* Notation : iloc[begin row : end row -1, being column : end column - 1]
* Python starts rows and columns at 0
* Use -1 to designate last column (row)

# The line below creates the Matrix of Features.

* This is called Features
* Comprised of all rows and all but the last column of the dataframe named "data_project".

In [4]:
Features = data_project.iloc[:, :-1].values

# The line below creates the Dependent Variable Vector:

* Don't denote range in the column selection.  Just want a single column.
* Use -1 to select final column.

In [5]:
Dependent = data_project.iloc[:, -1].values

# Use print function to inspect newly created elements.

In [6]:
print(Features)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [7]:
print(Dependent)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Classes, Objects, and Methods

* Class: The model of something we want to build.
* Object: Instance of the class.
* Method: tool we use on object to complete a specific action.

# Take Care of Missing Data

### In the example below:

* Class: SimpleImputer that is imported from the impute module of the sklearn library.
* Object: This is "imputer" which we create as an instance of the SimpleImputer Class.
* Method: Using 2 - fit method and transform method.
    * Fit: connects the method to the object.
    * Transform: applies the method.
    * Sometimes these are done separately.  Some metods allow this to be done in a single step.

In [11]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy = 'mean')
imputer.fit(Features[:,1:3])
Features[:, 1:3] =imputer.transform(Features[:,1:3])

# Categorical Data

* Anytime you call a class - you need parenthesese.
* Anything in the parenthesese are arguments.
* In class ColumnTransformer below - actually only 2 arguments:  transformers and remainder. 
    * Transformers:  specify 3 things - kind of transformation ; what kind of encoding ; index of columns to encode.
    * Remainder:  passthrough tells you to keep columns you don't apply transformation (One Hot Encoding) to.
* Need output as np.array - for ML algorithms.
* Apply fit_transform (1 step).

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[0])], remainder = 'passthrough')
Features = np.array(ct.fit_transform(Features))   

In [17]:
print(Features) 

[[0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 37.0 67000.0]]


# MY ONE HOT ENCODING DOES NOT LOOK CORRECT - MULTIPLE 1 VALUES?!?!?  RE-RUN!!

# Encode the dependent variable.
* Change YES/NO to 1/0

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Dependent = le.fit_transform(Dependent)

In [21]:
print(Dependent)

[0 1 0 0 1 1 0 1 0 1]


# SPLIT INTO TEST AND TRAIN SETS
* Apply feature scaling AFTER data split - do not want leakage.

In [22]:
from sklearn.model_selection import train_test_split
Features_train, Features_test, Dependent_train, Dependent_test = train_test_split(Features, Dependent, test_size = 0.2, random_state = 1)


In [23]:
print(Features_train)

[[1.0 0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 1.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 1.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 0.0 35.0 58000.0]]


# Feature Scaling
* Some models - will be dominated by largest magnitude factors. 
* Course will discuss which models require/recommend this.
* 2 types of Feature Scaling:
    * Standardisation : Converts change to standard deviations.
    * Normalisation : Converts change to proportion of variable range.