# Data Preprocessing Tools

### Libraries

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Import data set with pandas

In [2]:
# Read the raw CSV into a DataFrame.
data1_csv_path = 'C:/Users/Sebas/Desktop/DataSets/DataSetsPython/data1.csv'
dataset = pd.read_csv(data1_csv_path)
# Every column except the last one holds the independent variables ...
feature_frame = dataset.iloc[:, :-1]
# ... and the last column is the dependent variable.
target_series = dataset.iloc[:, -1]
x = feature_frame.values
y = target_series.values
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


The `iloc` indexer stands for "integer location" (position-based indexing), and ":" indicates a range. A range includes its lower bound but excludes its upper bound. The index "-1" refers to the last column: `[:, :-1]` selects every column except the last one for x, while `[:, -1]` selects only the last column for y.

### Solution for Missing Data

In [3]:
# Count the missing entries in each column (isna marks them, sum adds them up per column).
missing_mask = dataset.isna()
missing_values = missing_mask.sum()
print(missing_values)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [4]:
from sklearn.impute import SimpleImputer
# Mean imputation: every NaN is replaced by the mean of its own column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns 1 and 2 of x are Age and Salary; learn their means and fill the gaps in one step.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Instead of deleting the rows that contain missing values, we use an imputer object that replaces each missing value with the mean of its column.

### Categorical Data

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column 0 and pass the remaining columns through unchanged.
country_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encoder', country_encoder, [0])],
    remainder='passthrough',
)
encoded_features = ct.fit_transform(x)
x = np.array(encoded_features)

Independent variable: First, we encode the 3 countries as categorical data. The "Country" column was transformed into 3 new columns; each one takes the value 1 for its respective country and 0 otherwise. 
Useful to directly transform a multi-category label. 

In [6]:
# The one-hot columns come first (France, Germany, Spain), followed by Age and Salary.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [7]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct labels first, then map each label to its integer code.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

Dependent variable: We replaced the "Yes/No" column with a "1/0" column. 
Useful to directly encode a binary outcome from a two-class label.

In [8]:
# Encoded labels: 'No' -> 0, 'Yes' -> 1.
print(y)

[0 1 0 0 1 1 0 1 0 1]


Let's take another example.

In [9]:
# Importing the necessary libraries
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pandas as pd 
import numpy as np 
# Read the Titanic CSV into a DataFrame and show it as the cell output
titanic_csv_path = 'C:/Users/Sebas/Desktop/DataSets/DataSetsPython/titanic.csv'
titanic_dataset = pd.read_csv(titanic_csv_path)
titanic_dataset


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
# Identify the categorical data
categorical_features = ['Sex', 'Embarked', 'Pclass']
# Keep the target out of the feature matrix: with remainder='passthrough' every
# column that is not one-hot encoded is copied into X as-is, so passing the whole
# frame would leak 'Survived' into the features.
target_column = 'Survived'
titanic_features = titanic_dataset.drop(columns=[target_column])
# Implement an instance of the ColumnTransformer class
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_features)],
    remainder='passthrough',
)
# Apply the fit_transform method on the feature columns only and convert the
# output into a NumPy array
# NOTE(review): PassengerId, Name, Ticket and Cabin are still passed through
# unchanged; decide whether they should be dropped or encoded before modelling.
X = np.array(ct.fit_transform(titanic_features))
# Use LabelEncoder to encode the binary dependent variable
le = LabelEncoder()
Y = le.fit_transform(titanic_dataset[target_column])
# Print the updated matrix of features
print(X)

[[0.0 1.0 0.0 ... 'A/5 21171' 7.25 nan]
 [1.0 0.0 1.0 ... 'PC 17599' 71.2833 'C85']
 [1.0 0.0 0.0 ... 'STON/O2. 3101282' 7.925 nan]
 ...
 [1.0 0.0 0.0 ... 'W./C. 6607' 23.45 nan]
 [0.0 1.0 1.0 ... '111369' 30.0 'C148']
 [0.0 1.0 0.0 ... '370376' 7.75 nan]]


In [11]:
# Show the encoded 'Survived' target vector of the Titanic data.
print(Y)

[0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0
 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 1 0 0 0 0 0 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 0
 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0
 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 1
 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 1
 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 1 0 1 0 0 1 0 0 0 

# Splitting Data

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts
# Show the four resulting arrays as the cell output.
X_train, X_test, y_train, y_test

(array([[0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
        [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
        [1.0, 0.0, 0.0, 44.0, 72000.0],
        [0.0, 0.0, 1.0, 38.0, 61000.0],
        [0.0, 0.0, 1.0, 27.0, 48000.0],
        [1.0, 0.0, 0.0, 48.0, 79000.0],
        [0.0, 1.0, 0.0, 50.0, 83000.0],
        [1.0, 0.0, 0.0, 35.0, 58000.0]], dtype=object),
 array([[0.0, 1.0, 0.0, 30.0, 54000.0],
        [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object),
 array([0, 1, 0, 0, 1, 1, 0, 1]),
 array([0, 1]))

In [13]:
# Training features: 8 of the 10 rows (test_size=0.2).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [14]:
# Test features: the 2 held-out rows.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [15]:
# Labels for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [16]:
# Labels for the test rows.
print(y_test)

[0 1]


# Feature Scaling

In [17]:
from sklearn.preprocessing import StandardScaler
# Columns 0-2 are the one-hot country flags; only the columns from index 3 on are scaled.
numeric_cols = slice(3, None)
sc = StandardScaler()
# Learn mean/std from the training rows only, then apply that same scaling to the test rows.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

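We reused the scaler that was fitted on the training features to transform the test features. We must not fit a new scaler on the test set: the test data would then be scaled with a different mean and standard deviation than the data the model was trained on, so the two sets would no longer be on the same scale.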

Also, it is worth mentioning that we didn't apply feature scaling to the dependent variable because it already had the values 0 or 1.

In [18]:
# The last two columns (Age, Salary) are now standardized; the one-hot columns are untouched.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [19]:
# Test rows scaled with the mean/std learned from the training set.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]


Another example

In [20]:
wine_csv_path = 'C:/Users/Sebas/Desktop/DataSets/DataSetsPython/wine.csv'
wine_dataset = pd.read_csv(wine_csv_path)  # watch out for delimiters (delimiter=';')
# Features are every column but the last; the last column is the target
wine_features = wine_dataset.iloc[:, :-1]
wine_target = wine_dataset.iloc[:, -1]
wx = wine_features.values
wy = wine_target.values

# 80-20 training-test split, seeded so that it is reproducible
wine_split = train_test_split(wx, wy, test_size=0.2, random_state=42)
wX_train, wX_test, wy_train, wy_test = wine_split

wine_dataset

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,3
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,3
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,3
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,3


In [21]:
# Standardize the wine features with a fresh StandardScaler
sc = StandardScaler()

# Learn the per-column mean and standard deviation from the training set only
sc.fit(wX_train)

# Apply that same scaling to both splits
wX_train = sc.transform(wX_train)
wX_test = sc.transform(wX_test)

# Print the scaled training set, then the scaled test set
for scaled_split in (wX_train, wX_test):
    print(scaled_split)

[[ 1.66529275 -0.60840587  1.21896194 ... -1.65632857 -0.87940904
  -0.24860607]
 [-0.54952506  2.7515415   1.00331502 ... -0.58463272 -1.25462095
  -0.72992237]
 [-0.74531007 -1.14354109 -0.93750727 ...  0.35845962  0.2462267
  -0.24860607]
 ...
 [ 1.714239   -0.44172441  0.06884503 ...  1.04434496  0.56585166
   2.69572196]
 [-0.35374006 -0.7399965  -0.36244882 ...  0.01551695 -0.74044166
  -0.79631083]
 [-0.78201975  0.06709269  0.35637426 ... -0.67036839  1.09392769
  -0.98551793]]
[[ 8.08733375e-01  6.37318741e-01  7.15785791e-01 -1.24128036e+00
   1.06556722e+00  6.46636689e-01  1.02724237e+00 -1.54932094e+00
   8.93605295e-02  1.82522280e-02  1.55169482e-02  1.06613421e+00
   3.65487151e-01]
 [ 1.50621744e+00  1.46195334e+00  2.84491948e-01 -1.66513218e-01
   7.23080697e-01  8.82684015e-01  6.47480801e-01 -5.32234717e-01
  -6.15594759e-01  7.85270273e-02 -3.70293555e-01  1.02444400e+00
   1.14555151e+00]
 [-6.00625639e-02  3.82910194e-01  1.21896194e+00  4.43489751e-01
  -3.0437