# Feature Encoding

In [1]:
import pandas as pd
import pyodbc 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation / future-behaviour warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Read the cleaned / feature-engineered training set into a DataFrame.
train_csv = 'D:/KAGGLE COMPETITIONS/Titanic/4. Analysis/train_after_cleaning_feature_engineering.csv'
df = pd.read_csv(train_csv)

In [3]:
 sns.set(rc={'figure.figsize':(11.7,8.27)})  # seaborn defaults, with the default figure size overridden to 11.7 x 8.27

* Let us check our data

In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 9 columns):
Survived    890 non-null int64
Pclass      890 non-null int64
Sex         890 non-null object
Age         890 non-null float64
SibSp       890 non-null int64
Parch       890 non-null int64
Fare        890 non-null float64
Embarked    890 non-null object
Deck        890 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.7+ KB


In [5]:
# Preview the first rows (up to 20) of the full frame, `Survived` column included.
df.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,0,3,male,22.0,1,0,7.25,S,U
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,U
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,U
5,0,3,male,29.628569,0,0,8.4583,Q,U
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,child,2.0,3,1,21.075,S,U
8,1,3,female,27.0,0,2,11.1333,S,U
9,1,2,child,14.0,1,0,30.0708,C,U


* We will split the dataset into features (`data`) and labels by extracting the `Survived` column

In [6]:
# Separate the target from the predictors; `df` itself is not modified.
labels = df.loc[:, 'Survived']
data = df.drop(columns=['Survived'])

* We will check the data and the labels

### Data

In [7]:
# Confirm that `Survived` is gone and only the predictor columns remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 8 columns):
Pclass      890 non-null int64
Sex         890 non-null object
Age         890 non-null float64
SibSp       890 non-null int64
Parch       890 non-null int64
Fare        890 non-null float64
Embarked    890 non-null object
Deck        890 non-null object
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [8]:
# Preview the first rows (up to 20) of the predictors.
data.head(20)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,3,male,22.0,1,0,7.25,S,U
1,1,female,38.0,1,0,71.2833,C,C
2,3,female,26.0,0,0,7.925,S,U
3,1,female,35.0,1,0,53.1,S,C
4,3,male,35.0,0,0,8.05,S,U
5,3,male,29.628569,0,0,8.4583,Q,U
6,1,male,54.0,0,0,51.8625,S,E
7,3,child,2.0,3,1,21.075,S,U
8,3,female,27.0,0,2,11.1333,S,U
9,2,child,14.0,1,0,30.0708,C,U


### Labels

In [9]:
# Print the target Series; pandas abbreviates the middle of a long Series in the printed output.
print(labels)

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    1
887    0
888    1
889    0
Name: Survived, Length: 890, dtype: int64


# Splitting into Numerical and Categorical Features

In [10]:
# Partition the predictors by how they are encoded further down:
# numeric columns get min-max scaled, string-valued columns get dummy-encoded.
numericals = data.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
categoricals = data.loc[:, ['Sex', 'Embarked', 'Deck']]

### Getting dummy encoding for categoricals

In [11]:
# One-hot encode every categorical column. drop_first=True removes the first
# level of each feature, leaving k-1 indicator columns for a feature with k levels.
categorical_dummies = pd.get_dummies(data=categoricals, drop_first=True)
print(categorical_dummies.head(n=20))

    Sex_female  Sex_male  Embarked_Q  Embarked_S  Deck_B  Deck_C  Deck_D  \
0            0         1           0           1       0       0       0   
1            1         0           0           0       0       1       0   
2            1         0           0           1       0       0       0   
3            1         0           0           1       0       1       0   
4            0         1           0           1       0       0       0   
5            0         1           1           0       0       0       0   
6            0         1           0           1       0       0       0   
7            0         0           0           1       0       0       0   
8            1         0           0           1       0       0       0   
9            0         0           0           0       0       0       0   
10           0         0           0           1       0       0       0   
11           1         0           0           1       0       1       0   
12          

### MinMaxScaling numerical features

In [12]:
# StandardScaler is not used in this cell; the import is kept unchanged in case
# a later cell relies on the name.
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Rescale every numeric column to the [0, 1] range, column by column.
# NOTE(review): the scaler is fitted on all rows of this file; if a validation
# split is carved out of it later, fit on the training part only.
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
scaled_numericals_array = minmax_scaler.fit_transform(numericals)

# fit_transform returns a bare ndarray, so restore the column names and the
# original row index. Carrying the index over keeps label-based alignment with
# `categorical_dummies` and `labels` correct even if the rows are ever
# re-indexed; for the current 0..n-1 index the result is unchanged.
scaled_numericals = pd.DataFrame(
    scaled_numericals_array,
    columns=list(numericals.columns),
    index=numericals.index,
)

* Let us now check

In [13]:
# Inspect the scaled frame; with feature_range=(0, 1) every column should lie in [0, 1].
scaled_numericals

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,1.0,0.293286,0.125,0.000000,0.014151
1,0.0,0.510737,0.125,0.000000,0.139136
2,1.0,0.347649,0.000,0.000000,0.015469
3,0.0,0.469965,0.125,0.000000,0.103644
4,1.0,0.469965,0.000,0.000000,0.015713
...,...,...,...,...,...
885,0.5,0.361239,0.000,0.000000,0.025374
886,0.0,0.252514,0.000,0.000000,0.058556
887,1.0,0.396963,0.125,0.333333,0.045771
888,0.0,0.347649,0.000,0.000000,0.058556


### Combining the two sets

In [14]:
# Place the scaled numeric block and the dummy block side by side. Rows are
# matched by position, so both blocks must have the same number of rows in the
# same order. Going through NumPy yields a single homogeneous array, and the
# resulting frame gets a fresh default index.
combined_columns = list(scaled_numericals.columns) + list(categorical_dummies.columns)
combined_values = np.concatenate(
    (scaled_numericals.values, categorical_dummies.values),
    axis=1,
)
scaled_data = pd.DataFrame(combined_values, columns=combined_columns)

* Let us check one more time

In [15]:
# Print the combined feature matrix for a final visual check.
print(scaled_data)

     Pclass       Age  SibSp     Parch      Fare  Sex_female  Sex_male  \
0       1.0  0.293286  0.125  0.000000  0.014151         0.0       1.0   
1       0.0  0.510737  0.125  0.000000  0.139136         1.0       0.0   
2       1.0  0.347649  0.000  0.000000  0.015469         1.0       0.0   
3       0.0  0.469965  0.125  0.000000  0.103644         1.0       0.0   
4       1.0  0.469965  0.000  0.000000  0.015713         0.0       1.0   
..      ...       ...    ...       ...       ...         ...       ...   
885     0.5  0.361239  0.000  0.000000  0.025374         0.0       1.0   
886     0.0  0.252514  0.000  0.000000  0.058556         1.0       0.0   
887     1.0  0.396963  0.125  0.333333  0.045771         1.0       0.0   
888     0.0  0.347649  0.000  0.000000  0.058556         0.0       1.0   
889     1.0  0.429193  0.000  0.000000  0.015127         0.0       1.0   

     Embarked_Q  Embarked_S  Deck_B  Deck_C  Deck_D  Deck_E  Deck_F  Deck_G  \
0           0.0         1.0     

### Saving the data and labels to the Analysis folder

In [16]:
# Persist the encoded features and the target.
# NOTE(review): the features are written without the index column, while the
# labels call relies on the to_csv defaults and therefore keeps it. Confirm
# that the downstream reader expects this difference.
features_csv = r'D:\KAGGLE COMPETITIONS\Titanic\4. Analysis\train_after_feature_encoding.csv'
labels_csv = r'D:\KAGGLE COMPETITIONS\Titanic\4. Analysis\labels.csv'
scaled_data.to_csv(features_csv, index=False)
labels.to_csv(labels_csv)