# Data Preprocessing in Python

### Importing libraries

In [14]:
import pandas as pd
import dtale
import numpy


#### Importing 2 datasets

In [15]:
# Load the main dataset from the CSV file and display it in full.
dataset = pd.read_csv('DataPreprocessing.csv')
print(dataset)

# Second, hand-written dataset using the same four column names.
dataset2 = pd.DataFrame(
    [
        ['France', 21.00, 125000.00, 'Yes'],
        ['Ireland', 49.00, 80000.00, 'Yes'],
        ['Germany', 32.0, 34000.00, 'No'],
    ],
    columns=['Region', 'Age', 'Income', 'Online Shopper'],
)

# Outer join keyed on 'Income': rows from both frames are kept.
# NOTE(review): the two frames share every column name, so the non-key columns
# come back suffixed (Region_x / Region_y, ...) -- confirm that a row-wise
# pd.concat was not what was intended here.
merged_dataset = dataset.merge(dataset2, how='outer', on='Income')


   Region   Age   Income Online Shopper
0   India  49.0  86400.0             No
1  Brazil  32.0  57600.0            Yes
2     USA  35.0  64800.0             No
3  Brazil  43.0  73200.0             No
4     USA  45.0      NaN            Yes
5   India  40.0  69600.0            Yes
6  Brazil   NaN  62400.0             No
7   India  53.0  94800.0            Yes
8     USA  55.0  99600.0             No
9   India  42.0  80400.0            Yes


### Splitting the dataset into 2 parts

In [16]:
# Position of the last column; it is split off from the other columns.
last_col = dataset.shape[1] - 1

X = dataset.iloc[:, :last_col].values  # every column except the last one
Y = dataset.iloc[:, last_col].values   # only the last column

# dtale.show(X)  # optional interactive view, left disabled
print(X)

[['India' 49.0 86400.0]
 ['Brazil' 32.0 57600.0]
 ['USA' 35.0 64800.0]
 ['Brazil' 43.0 73200.0]
 ['USA' 45.0 nan]
 ['India' 40.0 69600.0]
 ['Brazil' nan 62400.0]
 ['India' 53.0 94800.0]
 ['USA' 55.0 99600.0]
 ['India' 42.0 80400.0]]


## Data cleaning

### Missing data

In [17]:
from sklearn.impute import SimpleImputer  # handles missing values
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Imputer that replaces each NaN with the mean of its column.
without_missing = SimpleImputer(strategy='mean', missing_values=numpy.nan)

# Column 0 holds strings, so only columns 1.. are imputed. fit_transform learns
# the column means and fills the gaps in a single call; the result is written
# back into the same slice of X.
# NOTE(review): the means are learned from all rows, i.e. before the
# train/test split done further down -- confirm this is acceptable.
X[:, 1:] = without_missing.fit_transform(X[:, 1:])

# dtale.show(X)  # optional interactive view, left disabled
print(X)

[['India' 49.0 86400.0]
 ['Brazil' 32.0 57600.0]
 ['USA' 35.0 64800.0]
 ['Brazil' 43.0 73200.0]
 ['USA' 45.0 76533.33333333333]
 ['India' 40.0 69600.0]
 ['Brazil' 43.77777777777778 62400.0]
 ['India' 53.0 94800.0]
 ['USA' 55.0 99600.0]
 ['India' 42.0 80400.0]]


In [18]:
#dtale.show(merged_dataset)

### Encoding categorical data
Difference between Label and OneHot encoding: \
                                      - Label encoding assigns an integer to each category (simple, but the model could misinterpret the numbers as an order or priority between categories)\
                                      - OneHot encoding creates one binary column per category (no hierarchy between categories, but a lot of columns if there are a lot of categories to encode)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X_encoder = OneHotEncoder()

# One transformation step, given as (name, transformer, columns):
#   'encoder' -> label identifying the step
#   X_encoder -> the one-hot encoder applied by the step
#   [0]       -> only the first column (the country name) is encoded
encoding_steps = [('encoder', X_encoder, [0])]

# remainder='passthrough' keeps the other columns unchanged in the output.
transformer = ColumnTransformer(
    transformers=encoding_steps,
    remainder='passthrough',
)

X = transformer.fit_transform(X)
print(X)


[[0.0 1.0 0.0 49.0 86400.0]
 [1.0 0.0 0.0 32.0 57600.0]
 [0.0 0.0 1.0 35.0 64800.0]
 [1.0 0.0 0.0 43.0 73200.0]
 [0.0 0.0 1.0 45.0 76533.33333333333]
 [0.0 1.0 0.0 40.0 69600.0]
 [1.0 0.0 0.0 43.77777777777778 62400.0]
 [0.0 1.0 0.0 53.0 94800.0]
 [0.0 0.0 1.0 55.0 99600.0]
 [0.0 1.0 0.0 42.0 80400.0]]


It is not necessary to use OneHotEncoder for Y because it only contains two values (Yes or No); a simple label encoding is enough.

In [20]:
# Map the labels in Y to integers: learn the classes first, then encode.
Y_encoder = LabelEncoder()
Y = Y_encoder.fit(Y).transform(Y)
print(Y)

[0 1 0 0 1 1 0 1 0 1]


### Splitting into training set and test set
We now split the dataset into a training set and a test set.
With `test_size=0.3`, the training set keeps 70% of the rows (7 here) to fit the model, and the test set holds the remaining 30% (3 rows, not 3 columns) to evaluate it on unseen samples.

In [21]:
# Hold out 30% of the rows for testing; random_state=0 makes the shuffle
# reproducible from run to run.
split_parts = train_test_split(X, Y, random_state=0, test_size=0.3)
X_train, X_test, Y_train, Y_test = split_parts
print(X_train)

[[0.0 1.0 0.0 42.0 80400.0]
 [1.0 0.0 0.0 32.0 57600.0]
 [1.0 0.0 0.0 43.77777777777778 62400.0]
 [0.0 1.0 0.0 53.0 94800.0]
 [1.0 0.0 0.0 43.0 73200.0]
 [0.0 1.0 0.0 49.0 86400.0]
 [0.0 1.0 0.0 40.0 69600.0]]


### Feature Scaling
To put all columns on the same scale (for instance, Income is much larger than Age, so the model could misinterpret it and give more importance to the Income column).

In [22]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted scaler to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Single print call; sep="\n" reproduces the blank lines between the two parts.
print("X_train :", X_train, "\n", "X_test :", X_test, sep="\n")

X_train :
[[-0.8660254   0.8660254   0.         -0.2029809   0.44897083]
 [ 1.15470054 -1.15470054  0.         -1.82168936 -1.41706417]
 [ 1.15470054 -1.15470054  0.          0.08478949 -1.0242147 ]
 [-0.8660254   0.8660254   0.          1.5775984   1.62751925]
 [ 1.15470054 -1.15470054  0.         -0.04111006 -0.14030338]
 [-0.8660254   0.8660254   0.          0.93011502  0.94003267]
 [-0.8660254   0.8660254   0.         -0.52672259 -0.43494049]]


X_test :
[[-0.8660254  -1.15470054  1.         -1.33607682 -0.82778996]
 [-0.8660254  -1.15470054  1.          1.90134009  2.02036872]
 [-0.8660254  -1.15470054  1.          0.28263164  0.13250875]]


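### Preparing data for an LSTM (deep learning) model
Each 2D array is reshaped to 3D with shape `(samples, 1, features)`, i.e. every row becomes a sequence containing a single time step.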

In [25]:
# Insert a middle axis of length 1: (rows, cols) -> (rows, 1, cols).
train_rows, train_cols = X_train.shape[0], X_train.shape[1]
test_rows, test_cols = X_test.shape[0], X_test.shape[1]

X_train_lstm = numpy.reshape(X_train, (train_rows, 1, train_cols))
X_test_lstm = numpy.reshape(X_test, (test_rows, 1, test_cols))
