# Airline Dataset Preprocessing

## TODO 

* [ ] Feature descriptions
* [ ] Column types
* [ ] Missing Data
* [ ] Distribution
* [ ] Correlations

## Import libraries

In [62]:
import os
import logging
from datetime import datetime
_ABSOLUTE_DIR = os.getcwd()

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

## Load Data

In [63]:
# Read both splits of the airline dataset from the artifacts folder, which is
# located one level above the notebook's working directory.
dfAirline_train = pd.read_csv(
    filepath_or_buffer=f'{_ABSOLUTE_DIR}/../artifacts/data/airline/train.csv'
)
dfAirline_test = pd.read_csv(
    filepath_or_buffer=f'{_ABSOLUTE_DIR}/../artifacts/data/airline/test.csv'
)

In [64]:
# Preview the raw training split; pandas abbreviates the middle rows and columns with "...".
dfAirline_train

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,103899,94171,Female,disloyal Customer,23,Business travel,Eco,192,2,1,...,2,3,1,4,2,3,2,3,0.0,neutral or dissatisfied
103900,103900,73097,Male,Loyal Customer,49,Business travel,Business,2347,4,4,...,5,5,5,5,5,5,4,0,0.0,satisfied
103901,103901,68825,Male,disloyal Customer,30,Business travel,Business,1995,1,1,...,4,3,2,4,5,5,4,7,14.0,neutral or dissatisfied
103902,103902,54173,Female,disloyal Customer,22,Business travel,Eco,1000,1,1,...,1,4,5,1,5,4,1,0,0.0,neutral or dissatisfied


In [65]:
# Preview the raw test split; pandas abbreviates the middle rows and columns with "...".
dfAirline_test

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,...,4,3,2,4,4,5,4,0,0.0,neutral or dissatisfied
25972,25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
25973,25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,...,2,4,3,4,5,4,2,0,0.0,neutral or dissatisfied
25974,25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied


In [66]:
# Work on independent (deep) copies so the frames loaded from disk stay untouched
# and every preprocessing step can be compared against the raw data.
dfAirline_train_v1 = dfAirline_train.copy(deep=True)
dfAirline_test_v1 = dfAirline_test.copy(deep=True)

As we can see, the first column is unnecessary: it only repeats the row number, which the DataFrame index already provides. For that reason, the column named `Unnamed: 0` is dropped from both datasets.

In [67]:
# Remove the column that only repeats the row number.
# `drop(..., errors='ignore')` keeps this cell idempotent: running it again after
# the column is already gone is a no-op, whereas `del` would raise a KeyError.
dfAirline_train_v1 = dfAirline_train_v1.drop(columns=['Unnamed: 0'], errors='ignore')

In [68]:
# Remove the column that only repeats the row number.
# `drop(..., errors='ignore')` keeps this cell idempotent: running it again after
# the column is already gone is a no-op, whereas `del` would raise a KeyError.
dfAirline_test_v1 = dfAirline_test_v1.drop(columns=['Unnamed: 0'], errors='ignore')

## Missing data

In [69]:
# Number of missing values per column in the training split.
dfAirline_train_v1.isnull().sum()

id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction                           0
dtype: int64

In [70]:
# Number of missing values per column in the test split.
dfAirline_test_v1.isnull().sum()

id                                    0
Gender                                0
Customer Type                         0
Age                                   0
Type of Travel                        0
Class                                 0
Flight Distance                       0
Inflight wifi service                 0
Departure/Arrival time convenient     0
Ease of Online booking                0
Gate location                         0
Food and drink                        0
Online boarding                       0
Seat comfort                          0
Inflight entertainment                0
On-board service                      0
Leg room service                      0
Baggage handling                      0
Checkin service                       0
Inflight service                      0
Cleanliness                           0
Departure Delay in Minutes            0
Arrival Delay in Minutes             83
satisfaction                          0
dtype: int64

## Column types

In [38]:
# List the dtype of every column to spot the non-numeric (object) features.
dfAirline_train_v1.dtypes

id                                     int64
Gender                                object
Customer Type                         object
Age                                    int64
Type of Travel                        object
Class                                 object
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival De

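Most of the features are already numeric (`int64`). The `object` columns (`Gender`, `Customer Type`, `Type of Travel`, `Class` and `satisfaction`) are categorical and are encoded one by one in the subsections below.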

### Gender

In [39]:
# Distinct values present in the Gender column of the training split.
dfAirline_train_v1['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [40]:
# Encode Gender as a boolean flag (True = Female, False = Male).
# The values are mapped from the untouched raw frame rather than from the column
# being overwritten, so re-running this cell gives the same result; mapping the
# already-encoded column onto itself would turn every value into NaN.
dfAirline_train_v1['Gender'] = dfAirline_train['Gender'].map({'Female': True, 'Male': False})

In [41]:
# Encode Gender as a boolean flag (True = Female, False = Male).
# The values are mapped from the untouched raw frame rather than from the column
# being overwritten, so re-running this cell gives the same result; mapping the
# already-encoded column onto itself would turn every value into NaN.
dfAirline_test_v1['Gender'] = dfAirline_test['Gender'].map({'Female': True, 'Male': False})

### Customer Type

In [44]:
# Distinct values present in the Customer Type column of the training split.
dfAirline_train_v1.loc[:, 'Customer Type'].unique()

array(['Loyal Customer', 'disloyal Customer'], dtype=object)

In [45]:
# Encode Customer Type as a boolean flag (True = loyal, False = disloyal).
# Both columns are mapped from the untouched raw frames so the cell is idempotent;
# mapping an already-encoded column onto itself would turn every value into NaN.
dfAirline_train_v1['Customer Type'] = dfAirline_train['Customer Type'].map(
    {'Loyal Customer': True, 'disloyal Customer': False}
)
dfAirline_test_v1['Customer Type'] = dfAirline_test['Customer Type'].map(
    {'Loyal Customer': True, 'disloyal Customer': False}
)

### Type of Travel

In [46]:
# Distinct values present in the Type of Travel column of the training split.
dfAirline_train_v1.loc[:, 'Type of Travel'].unique()

array(['Personal Travel', 'Business travel'], dtype=object)

In [47]:
# Encode Type of Travel as a boolean flag (True = personal, False = business).
# Both columns are mapped from the untouched raw frames so the cell is idempotent;
# mapping an already-encoded column onto itself would turn every value into NaN.
dfAirline_train_v1['Type of Travel'] = dfAirline_train['Type of Travel'].map(
    {'Personal Travel': True, 'Business travel': False}
)
dfAirline_test_v1['Type of Travel'] = dfAirline_test['Type of Travel'].map(
    {'Personal Travel': True, 'Business travel': False}
)

### Class

In [48]:
# Distinct values present in the Class column of the training split.
dfAirline_train_v1.loc[:, 'Class'].unique()

array(['Eco Plus', 'Business', 'Eco'], dtype=object)

In [56]:
# One-hot encode the travel class into Class_Business / Class_Eco / Class_Eco Plus.
# * The categories are pinned explicitly so the train and test splits always end up
#   with the same dummy columns, even if one split happens to lack a class.
#   NOTE: a value outside this list would become NaN and get all-zero dummies.
# * The guard makes the cell safe to re-run: once 'Class' has been replaced by its
#   dummy columns, a second run is a no-op instead of raising a KeyError.
if 'Class' in dfAirline_train_v1.columns:
    dfAirline_train_v1 = pd.get_dummies(
        dfAirline_train_v1.assign(
            Class=pd.Categorical(
                dfAirline_train_v1['Class'],
                categories=['Business', 'Eco', 'Eco Plus'],
            )
        ),
        columns=['Class'],
    )

In [57]:
# One-hot encode the travel class into Class_Business / Class_Eco / Class_Eco Plus.
# * The categories are pinned explicitly so the train and test splits always end up
#   with the same dummy columns, even if one split happens to lack a class.
#   NOTE: a value outside this list would become NaN and get all-zero dummies.
# * The guard makes the cell safe to re-run: once 'Class' has been replaced by its
#   dummy columns, a second run is a no-op instead of raising a KeyError.
if 'Class' in dfAirline_test_v1.columns:
    dfAirline_test_v1 = pd.get_dummies(
        dfAirline_test_v1.assign(
            Class=pd.Categorical(
                dfAirline_test_v1['Class'],
                categories=['Business', 'Eco', 'Eco Plus'],
            )
        ),
        columns=['Class'],
    )

### Satisfaction

In [49]:
# Distinct values present in the satisfaction (target) column of the training split.
dfAirline_train_v1.loc[:, 'satisfaction'].unique()

array(['neutral or dissatisfied', 'satisfied'], dtype=object)

In [50]:
# Encode the target as a boolean flag (True = satisfied, False = neutral or dissatisfied).
# Both columns are mapped from the untouched raw frames so the cell is idempotent;
# mapping an already-encoded column onto itself would turn every value into NaN.
dfAirline_train_v1['satisfaction'] = dfAirline_train['satisfaction'].map(
    {'satisfied': True, 'neutral or dissatisfied': False}
)
dfAirline_test_v1['satisfaction'] = dfAirline_test['satisfaction'].map(
    {'satisfied': True, 'neutral or dissatisfied': False}
)

### Types and Dataset after the transformation

In [58]:
# Re-check the column dtypes of the training frame after the encoding steps.
dfAirline_train_v1.dtypes

id                                     int64
Gender                                  bool
Customer Type                           bool
Age                                    int64
Type of Travel                          bool
Flight Distance                        int64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes             int64
Arrival Delay in Minutes             float64
satisfacti

In [59]:
# Display the training frame in its current, transformed state.
dfAirline_train_v1

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,Class_Business,Class_Eco,Class_Eco Plus
0,70172,False,True,13,True,460,3,4,3,1,...,4,4,5,5,25,18.0,False,0,0,1
1,5047,False,False,25,False,235,3,2,3,3,...,3,1,4,1,1,6.0,False,1,0,0
2,110028,True,True,26,False,1142,2,2,2,2,...,4,4,4,5,0,0.0,True,1,0,0
3,24026,True,True,25,False,562,2,5,5,5,...,3,1,4,2,11,9.0,False,1,0,0
4,119299,False,True,61,False,214,3,3,3,3,...,4,3,3,3,0,0.0,True,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103899,94171,True,False,23,False,192,2,1,2,3,...,4,2,3,2,3,0.0,False,0,1,0
103900,73097,False,True,49,False,2347,4,4,4,4,...,5,5,5,4,0,0.0,True,1,0,0
103901,68825,False,False,30,False,1995,1,1,1,3,...,4,5,5,4,7,14.0,False,1,0,0
103902,54173,True,False,22,False,1000,1,1,1,5,...,1,5,4,1,0,0.0,False,0,1,0
