# Data prep Excersise

## Importing Libraries

In [45]:
import numpy as np # helps working with complex data structures, e-g- arrays
import matplotlib.pyplot as plt # plotting data
import pandas as pd # data frames

## Importing Data

In [46]:
dataset = pd.read_csv('Data - Copy.csv') # Todo: rename text string to your data file.
# dependent variable, or the 'outcome' (y) is often the LAST column.
# independent variables, or the 'features' (x) are the rest of the columns.
X = dataset.iloc[:, :-1].values # locate indexes = iloc, ':' range, all rows
# python include, lower bounds in ranges, but exclude upper bounds.
# skip 'values' to just print the data
y = dataset.iloc[:, -1].values # this will only get the last column

In [47]:
print(dataset)
print(type(X))


    Country   Age   Salary PurchasedProduct
0   Finland  42.0  45000.0              yes
1   Finland  45.0  39000.0              yes
2   Finland  30.0  30000.0              yes
3   Finland  34.0  41000.0              yes
4    Sweden  49.0  48000.0              yes
5    Sweden  40.0  38000.0              yes
6    Sweden  30.0  31000.0               no
7    Sweden  35.0  35000.0               no
8    Sweden  31.0  40000.0              yes
9    Sweden  35.0  37000.0               no
10   Sweden  36.0  35000.0               no
11   Sweden  36.0      NaN               no
12   Norway  45.0  46000.0              yes
13   Norway  39.0  33000.0               no
14   Norway  33.0  31000.0               no
15   Norway  32.0  42000.0              yes
16   Norway  37.0  35000.0               no
17   Norway  34.0  36000.0               no
18   Norway   NaN  39000.0               no
<class 'numpy.ndarray'>


In [48]:
# All from Finland buys.
# All customers over 40 buys.
# All customers with salary over 40 buys.

## Handle missing Data

In [49]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

In [50]:
print(X)
print(type(X))

[['Finland' 42.0 45000.0]
 ['Finland' 45.0 39000.0]
 ['Finland' 30.0 30000.0]
 ['Finland' 34.0 41000.0]
 ['Sweden' 49.0 48000.0]
 ['Sweden' 40.0 38000.0]
 ['Sweden' 30.0 31000.0]
 ['Sweden' 35.0 35000.0]
 ['Sweden' 31.0 40000.0]
 ['Sweden' 35.0 37000.0]
 ['Sweden' 36.0 35000.0]
 ['Sweden' 36.0 37833.333333333336]
 ['Norway' 45.0 46000.0]
 ['Norway' 39.0 33000.0]
 ['Norway' 33.0 31000.0]
 ['Norway' 32.0 42000.0]
 ['Norway' 37.0 35000.0]
 ['Norway' 34.0 36000.0]
 ['Norway' 36.833333333333336 39000.0]]
<class 'numpy.ndarray'>


# Encoding Categorical Data

## Encoding the Independent Variable (x)

In [51]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# [0] : index 0, first column.
# passthrough meaning we will leave the other columns untouched.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

X = np.array(ct.fit_transform(X))

In [52]:
print(X)

[[1.0 0.0 0.0 42.0 45000.0]
 [1.0 0.0 0.0 45.0 39000.0]
 [1.0 0.0 0.0 30.0 30000.0]
 [1.0 0.0 0.0 34.0 41000.0]
 [0.0 0.0 1.0 49.0 48000.0]
 [0.0 0.0 1.0 40.0 38000.0]
 [0.0 0.0 1.0 30.0 31000.0]
 [0.0 0.0 1.0 35.0 35000.0]
 [0.0 0.0 1.0 31.0 40000.0]
 [0.0 0.0 1.0 35.0 37000.0]
 [0.0 0.0 1.0 36.0 35000.0]
 [0.0 0.0 1.0 36.0 37833.333333333336]
 [0.0 1.0 0.0 45.0 46000.0]
 [0.0 1.0 0.0 39.0 33000.0]
 [0.0 1.0 0.0 33.0 31000.0]
 [0.0 1.0 0.0 32.0 42000.0]
 [0.0 1.0 0.0 37.0 35000.0]
 [0.0 1.0 0.0 34.0 36000.0]
 [0.0 1.0 0.0 36.833333333333336 39000.0]]


## Encoding the Dependent Variable (y)

In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y) # a np.array is not needed here.

In [54]:
print(y)

[1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 0]


## Splitting the Dataset into Training and Test datasets


Splitting the data means that the training set will be used for training our algorithm(s). 
The test set will be the set we are using for evaluation of the model we created from the training set.

Note! 
Feature scaling should be applied after the split.
The reason why feature scaling will happen after the split, is that the test set should be considered as totally new data.
This new data should not be "biased" by that manipulation.
Feature scaling often includes getting e.g. the 'mean' value of something, if the test data set is included in this, 
it will make the training biased. Once again, the test set should be viewed as totally new data that arrives!

When information from the test set is interfering with the training set, it is called **information leakage**!

Note! Feature scaling is ALWAYS after the split!!!

In [55]:
# The library will split the data into 4 sets! x, x_test, y and y_test. 
# Split is betwenn test/data and dependent/independent variables.
from sklearn.model_selection import train_test_split

# Split 80% training obeservation data, 20% will be test.
# random_state ensure that the split will be the same each time we run it, to re-try our model during development.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 1)


In [56]:
print(X_train)

[[1.0 0.0 0.0 30.0 30000.0]
 [0.0 1.0 0.0 33.0 31000.0]
 [0.0 0.0 1.0 49.0 48000.0]
 [0.0 1.0 0.0 37.0 35000.0]
 [0.0 0.0 1.0 35.0 35000.0]
 [1.0 0.0 0.0 45.0 39000.0]
 [0.0 1.0 0.0 39.0 33000.0]
 [1.0 0.0 0.0 42.0 45000.0]
 [0.0 1.0 0.0 36.833333333333336 39000.0]
 [0.0 1.0 0.0 34.0 36000.0]
 [0.0 0.0 1.0 35.0 37000.0]
 [0.0 0.0 1.0 31.0 40000.0]
 [0.0 1.0 0.0 45.0 46000.0]
 [0.0 0.0 1.0 36.0 37833.333333333336]
 [0.0 0.0 1.0 40.0 38000.0]]


In [57]:
print(X_test)

[[1.0 0.0 0.0 34.0 41000.0]
 [0.0 1.0 0.0 32.0 42000.0]
 [0.0 0.0 1.0 30.0 31000.0]
 [0.0 0.0 1.0 36.0 35000.0]]


In [58]:
print(y_train)

[1 0 1 0 0 1 0 1 0 0 0 1 1 0 1]


In [59]:
print(y_test)

[1 1 0 0]


# Feature Scaling
- Avoid biases in the scale of numbers. e.g. selling house, square meter vs prize. One feature migh be too dominant because of its large or low number.
- Main two feature scaling techniques: Standardisation or Normalistation
- Standardisation: all values minus the mean of the values, divided by the standard deviation.
    - Results in values between -3 and +3.
- Normalisation: all values minus the min value in the data set divided by the differense between max and min value.
    - Results: all values between 0 and 1
- Normalisation vs. Standardisation
    - Normalisation is the recommended one if your data is normally distributed.
    - Standardisation will always work.
    - Normalisation is only better on certain data sets.
    - Stick with standardisation, it is more pragmatic.
- The feature scaling will be APPLIED to the test set, but we are NOT allowed to use that data during training. This is extremely important.


## Standardisation
$$X_{stand} = \cfrac{x-mean(x)}{standard deviation(x)}$$
## Normalisation
$$X_{norm} = \cfrac{x-min(x)}{max(x)-min(x)}$$

## Standard Scaling
- Standard Scaling, pragmatic choice that will always work.
- Do we need to apply the featur scaling to the dummy variables in the data? - answer is NO.
    - The reason is that e.g. HotEncoding is already making the values 'standardized'. They are already in the same range and so, not biased. If applied, it will only make things worse actually!

In [60]:
from sklearn.preprocessing import StandardScaler

# Will automatically apply the correct function.
sc = StandardScaler() 
# fit and transform will create the scale and use it to transform the data.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:]) 
# Here, we use the same scaler which is already fitted.Only transform is used on the test data.
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [61]:
print(X_train)

[[1.0 0.0 0.0 -1.4880221018481172 -1.587177132003664]
 [0.0 1.0 0.0 -0.9197534066585952 -1.3885040556749717]
 [0.0 0.0 1.0 2.111012967685522 1.9889382419127968]
 [0.0 1.0 0.0 -0.1620618130725658 -0.5938117503602027]
 [0.0 0.0 1.0 -0.5409076098655805 -0.5938117503602027]
 [1.0 0.0 0.0 1.353321374099493 0.2008805549545663]
 [0.0 1.0 0.0 0.21678398372044888 -0.9911579030175873]
 [1.0 0.0 0.0 0.7850526789099709 1.39291901292672]
 [0.0 1.0 0.0 -0.1936322961386499 0.2008805549545663]
 [0.0 1.0 0.0 -0.7303305082620878 -0.3951386740315105]
 [0.0 0.0 1.0 -0.5409076098655805 -0.19646559770281824]
 [0.0 0.0 1.0 -1.2985992034516098 0.39955363128325855]
 [0.0 1.0 0.0 1.353321374099493 1.591592089255412]
 [0.0 0.0 1.0 -0.3514847114690731 -0.030904700762240857]
 [0.0 0.0 1.0 0.4062068821169562 0.002207478625874037]]


In [62]:
print(X_test)

[[1.0 0.0 0.0 -0.7303305082620878 0.5982267076119508]
 [0.0 1.0 0.0 -1.1091763050551025 0.7968997839406431]
 [0.0 0.0 1.0 -1.4880221018481172 -1.3885040556749717]
 [0.0 0.0 1.0 -0.3514847114690731 -0.5938117503602027]]


## Training the Logistic Regression model on the Training set


In [63]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

## Predicting a new result

In [64]:
# Manually add new value
new_value = np.array([[1,0,0,42,45000]])
print(new_value)
print(type(new_value))
# Scale value
new_value[:, 3:] = sc.transform(new_value[:, 3:]) # Note that transform are being used here.
print(new_value)
print(type(new_value))
# predict the value
pred_new_value = classifier.predict(new_value)
print(pred_new_value)
print(type(pred_new_value))

[[    1     0     0    42 45000]]
<class 'numpy.ndarray'>
[[1 0 0 0 1]]
<class 'numpy.ndarray'>
[1]
<class 'numpy.ndarray'>


## Predicting the Test set results

In [65]:
y_pred = classifier.predict(X_test)
print("Prediction, Actual")
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

Prediction, Actual
[[1 1]
 [0 1]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [66]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[2 0]
 [1 1]]


0.75

|         | Pred. Neg. | Pred. Pos. |
| --------| ---------- | ---------- |
| Act. Neg. | **2** | 0 |
| Act. Pos. | 1 | **1** |