### Preprocessing data of customer purchase decisions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Preprocessing our data for our algorithm to learn from and be able to predict if future customers would make a purchase based on our independent variables 


- We will train our IV/inputs/features (country/age and salary) to be able to predict the dependent variable/output/label(purchased).

In [2]:
#importing dataset

customer_data = pd.read_csv('customers_data.csv')
customer_data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Segregating IV and DV

In [3]:
# another way of indexing
# a = customer_data.loc[:,:'Salary'].values
# a
# b = customer_data.loc[:,'Purchased']>values
# b

In [4]:
#x, independent variable and y, dependent variables
# in preprocessing we do not need the colum names that is why we use .values to just call out our values.
x = customer_data.iloc[:,:-1].values
y = customer_data.iloc[:,3].values

In [5]:
#calling out variables
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
#calling out variables
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Data Preprocessing

#### a. Handling missing data values

In [7]:
#estimating the values of all missing values, NANs in the numerical feature
#we know country feature doesn't have a missing value so no need to include it
from sklearn.impute import SimpleImputer

imputed_values = SimpleImputer(missing_values=np.nan, strategy = 'mean')
imputed_values = imputed_values.fit(x[:,1:3])
x[:,1:3] = imputed_values.transform(x[:,1:3]) #missing values get transformed here

#x
#if you want to see it a a dataframe
#pd.DataFrame(x)

### Encoding our categorical data

In [8]:
#encoding categorical features[country & purchased] using LabelEncoder and OneHotEncoder
#labelencoder is used for ordered encoding meaning it assumes the last input as greaater than the previous
##Onehotencoder just uses binary encoding does not assume an order

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

#we transform the country column using onehotencoder because we do not want our values to be ranked. 
column_transf = make_column_transformer((OneHotEncoder(),[0]), remainder = 'passthrough')
x = column_transf.fit_transform(x)


#we transform the purchased column ,our dependendent since we want it to be ranked
LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

In [9]:
pd.DataFrame(x)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0


In [10]:
pd.DataFrame(y)

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,1
5,1
6,0
7,1
8,0
9,1


### Splitting training set and test set


#### Note: For very large datasets, 80/20% to 90/10% as training set/test set should be fine however, for small dimensional datasets, you might want to use something like 60/40% to 70/30% as training set/test set

In [11]:
#spliting our data to trainin set and test set

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 0)

In [12]:
#pd.DataFrame(x_train)
pd.DataFrame(x_test)
#pd.DataFrame(y_train)
#pd.DataFrame(y_train)

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,30.0,54000.0
1,0.0,1.0,0.0,50.0,83000.0
2,0.0,1.0,0.0,40.0,63777.777778


### Feature Scaling


- Scaling all features to use 1 normal scale. A common scale is -1 and 1.
- We can either use the Standardisation or Normalisation for the feature scaling

- SCaling helps your machine model to learn at a faster rate as compared to learning from and computing the raw data which had some values so huge, others so small. Putting a gap between all values

In [24]:
#The fit is working on the scalar that is why we did not call fit transform into the x_test

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

In [25]:
#checking output
pd.DataFrame(x_test) #now, values are within the range

Unnamed: 0,0,1,2,3,4
0,0.0,1.0,0.0,30.0,54000.0
1,0.0,1.0,0.0,50.0,83000.0
2,0.0,1.0,0.0,40.0,63777.777778
