# Day - 12
# Date - 06 June 2019
# Outcomes - 
>* understanding Data Munging
>* Most machine learning algorithms require numerical input, so categorical features must be transformed into numerical features before these algorithms can be used.

### Step 1 : Processing the data before testing it

#### importing the package and reading the data

In [1]:
import pandas as pd

# Load the purchase records from 'purchase_data.csv' (path is relative to the
# notebook's working directory).
data_frame = pd.read_csv('purchase_data.csv')
# Bare expression on the last line so the notebook renders the frame.
data_frame

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,,Yes


#### dealing with missing values 

In [2]:
# Fill the missing values in the numeric columns only. Interpolation has no
# meaning for text columns such as 'Country' / 'Purchased', and calling
# interpolate() on a frame that still contains them is deprecated in recent
# pandas releases.
purchase = data_frame.copy()  # leave the raw frame untouched
numeric_columns = purchase.select_dtypes(include='number').columns
# Default method is linear: an interior gap becomes the midpoint of its row
# neighbours, and a trailing gap repeats the last valid value.
# NOTE(review): the filled values therefore depend on row order - confirm this
# is acceptable for this data set.
purchase[numeric_columns] = purchase[numeric_columns].interpolate()
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,37.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,59500.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,41.5,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,83000.0,Yes


#### converting the categorical (text) columns into numeric codes

In [3]:
# converting 'purchase' data set into numeric format

from sklearn.preprocessing import LabelEncoder

# NOTE: plain assignment binds a second name to the SAME DataFrame (no copy),
# so the encoding below also rewrites `purchase`. The later cells that read
# `purchase` rely on seeing the encoded values, so the alias is kept as is.
final_purchase = purchase

# Fit one encoder per text column and keep every fitted encoder, so that each
# learned text -> integer mapping (classes_) stays available for
# inverse_transform. Refitting a single encoder on the second column would
# discard the mapping learned for 'Country'.
label_encoders = {}
for column in ('Country', 'Purchased'):
    label_encoders[column] = LabelEncoder()
    # replace the existing text data with the new encoded data
    final_purchase[column] = label_encoders[column].fit_transform(final_purchase[column])

# Kept for backward compatibility: as before, `label_encoder` ends up being
# the encoder fitted on 'Purchased'.
label_encoder = label_encoders['Purchased']

final_purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,37.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,59500.0,1
5,0,35.0,58000.0,1
6,2,41.5,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,83000.0,1


#### The problem here is that label encoding puts different integers in the same column, so a model may interpret them as ordered (0 < 1 < 2), even though the countries have no natural order. To overcome this problem, we use one-hot encoding (OneHotEncoder), which gives each category its own 0/1 column.

In [4]:
# encoding the whole data set: one-hot encode 'Country', keep the rest as is

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder's `categorical_features` argument was deprecated and later
# removed from scikit-learn; selecting the columns to encode is now the job of
# ColumnTransformer. The previous blanket warnings.simplefilter('ignore') is
# dropped because it hid exactly that deprecation warning.
hot_encoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(categories='auto'), ['Country'])],
    # Append the untouched columns (Age, Salary, Purchased) after the
    # indicator columns, matching the previous column layout.
    remainder='passthrough',
    # Always return a dense NumPy array (previously obtained via .toarray()).
    sparse_threshold=0,
)
encoded_purchase = hot_encoder.fit_transform(purchase)

In [5]:
# Show the encoded matrix: the one-hot indicator columns for 'Country' come
# first, followed by the remaining columns of `purchase`.
encoded_purchase

array([[1.00e+00, 0.00e+00, 0.00e+00, 4.40e+01, 7.20e+04, 0.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 3.70e+01, 4.80e+04, 1.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 3.00e+01, 5.40e+04, 0.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 3.80e+01, 6.10e+04, 0.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 4.00e+01, 5.95e+04, 1.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 3.50e+01, 5.80e+04, 1.00e+00],
       [0.00e+00, 0.00e+00, 1.00e+00, 4.15e+01, 5.20e+04, 0.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 4.80e+01, 7.90e+04, 1.00e+00],
       [0.00e+00, 1.00e+00, 0.00e+00, 5.00e+01, 8.30e+04, 0.00e+00],
       [1.00e+00, 0.00e+00, 0.00e+00, 3.70e+01, 8.30e+04, 1.00e+00]])

In [6]:
# Wrap the array in a DataFrame for a tabular view. No column names are
# supplied, so the columns get the default integer labels.
pd.DataFrame(encoded_purchase)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.0,0.0,44.0,72000.0,0.0
1,0.0,0.0,1.0,37.0,48000.0,1.0
2,0.0,1.0,0.0,30.0,54000.0,0.0
3,0.0,0.0,1.0,38.0,61000.0,0.0
4,0.0,1.0,0.0,40.0,59500.0,1.0
5,1.0,0.0,0.0,35.0,58000.0,1.0
6,0.0,0.0,1.0,41.5,52000.0,0.0
7,1.0,0.0,0.0,48.0,79000.0,1.0
8,0.0,1.0,0.0,50.0,83000.0,0.0
9,1.0,0.0,0.0,37.0,83000.0,1.0


#  

### Step 2 : Splitting the data - train set and test set

In [7]:
# encoded data set
# `purchase` already holds the label-encoded values here: `final_purchase` was
# bound to the same DataFrame object, so encoding it in place changed
# `purchase` as well.
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,37.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,59500.0,1
5,0,35.0,58000.0,1
6,2,41.5,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,83000.0,1


In [8]:
# getting input variables: every column except the target. The target is
# dropped by name (mirroring how the output variable is selected by name), so
# the selection no longer depends on the number or order of the columns.
inputs = purchase.drop(columns='Purchased')
inputs

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,37.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,59500.0
5,0,35.0,58000.0
6,2,41.5,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,83000.0


In [9]:
# getting output variables: the label-encoded 'Purchased' column, selected by
# name (a single column, so the result is a Series rather than a DataFrame)
output = purchase['Purchased']
output

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int64

In [10]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the split is reproducible.
from sklearn.model_selection import train_test_split

splits = train_test_split(inputs, output, test_size=0.2, random_state=0)
input_train, input_test, output_train, output_test = splits

# Shapes first (one per line, then a blank line) ...
print(*(part.shape for part in splits), sep='\n', end='\n\n')
# ... then the four pieces themselves, separated by blank lines.
print(*splits, sep='\n\n')

(8, 3)
(2, 3)
(8,)
(2,)

   Country   Age   Salary
4        1  40.0  59500.0
9        0  37.0  83000.0
1        2  37.0  48000.0
6        2  41.5  52000.0
7        0  48.0  79000.0
3        2  38.0  61000.0
0        0  44.0  72000.0
5        0  35.0  58000.0

   Country   Age   Salary
2        1  30.0  54000.0
8        1  50.0  83000.0

4    1
9    1
1    1
6    0
7    1
3    0
0    0
5    1
Name: Purchased, dtype: int64

2    0
8    0
Name: Purchased, dtype: int64


### Scaling of data set

In [11]:
# Show the label-encoded frame again before it is scaled.
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,37.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,59500.0,1
5,0,35.0,58000.0,1
6,2,41.5,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,83000.0,1


In [12]:
import warnings
from sklearn.preprocessing import scale

# Install the filter BEFORE calling scale(): a filter added afterwards cannot
# silence warnings raised by that call.
warnings.simplefilter('ignore')

# Standardize every column of `purchase`.
# NOTE(review): this also rescales the label-encoded 'Country' column and the
# 'Purchased' target - confirm that is intended; usually only the numeric
# input features are scaled.
scale_purchase = scale(purchase)
scale_purchase

array([[-1.08347268,  0.68841558,  0.56314916, -1.        ],
       [ 1.32424438, -0.5315614 , -1.35395437,  1.        ],
       [ 0.12038585, -1.75153838, -0.87467848, -1.        ],
       [ 1.32424438, -0.35727897, -0.31552329, -1.        ],
       [ 0.12038585, -0.00871412, -0.43534226,  1.        ],
       [-1.08347268, -0.88012625, -0.55516123,  1.        ],
       [ 1.32424438,  0.25270952, -1.03443711, -1.        ],
       [-1.08347268,  1.38554529,  1.12230436,  1.        ],
       [ 0.12038585,  1.73411014,  1.44182161, -1.        ],
       [-1.08347268, -0.5315614 ,  1.44182161,  1.        ]])