## Import

In [19]:
# Import of basic packages
import numpy as np
import pandas as pd
import operator
import warnings

# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Import of chart packages
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import altair as alt

# Import of machine learning metric packages
from sklearn.metrics import f1_score, classification_report, confusion_matrix, mean_squared_error, r2_score, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, fbeta_score
from sklearn import metrics

# Import of preprocessing packages
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, LabelBinarizer, PolynomialFeatures

# Import of machine learning packages
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingRegressor, StackingClassifier, AdaBoostClassifier
#from xgboost import XGBClassifier
from sklearn.svm import SVC

# Random seed constant (only defined here; nothing in this cell passes it to a random number generator)
RSEED = 0

## Load

In [20]:
# Read the training data from disk and preview its first three rows.
# NOTE(review): outputs further down show `test_id_*` rows and no `target`
# column — confirm which file was loaded when those cells were last run.
train_csv_path = 'data/train.csv'
df = pd.read_csv(train_csv_path)
df.head(n=3)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target
0,train_id_0,2016-01-03,TU 0712,CMN,TUN,2016-01-03 10:30:00,2016-01-03 12.55.00,ATA,TU 32AIMN,260.0
1,train_id_1,2016-01-13,TU 0757,MXP,TUN,2016-01-13 15:05:00,2016-01-13 16.55.00,ATA,TU 31BIMO,20.0
2,train_id_2,2016-01-16,TU 0214,TUN,IST,2016-01-16 04:10:00,2016-01-16 06.45.00,ATA,TU 32AIMN,0.0


## Exploratory data analysis

In [28]:
# EDA info & describe: one row with the column dtypes, followed by one row per summary statistic
stat_labels = ['dtype', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

# Statistics are computed on numeric columns only, in the same order as stat_labels[1:]
numeric_stats = [
    df.mean(numeric_only=True),
    df.std(numeric_only=True),
    df.min(numeric_only=True),
    *(df.quantile(q, numeric_only=True) for q in (0.25, 0.5, 0.75)),
    df.max(numeric_only=True),
]

# Turn every Series into a single-row frame so the rows can be stacked
stat_rows = [series.to_frame().T for series in [df.dtypes, *numeric_stats]]
info = pd.concat(stat_rows, ignore_index=True)

# Round plain numbers to one decimal; any other cell (e.g. the dtype row) is left as is
info = info.applymap(lambda cell: round(cell, 1) if isinstance(cell, (int, float)) else cell)

info.insert(0, 'statistic', stat_labels)
info

Unnamed: 0,statistic,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
0,dtype,object,object,object,object,object,object,object,object,object
1,mean,,,,,,,,,
2,std,,,,,,,,,
3,min,,,,,,,,,
4,25%,,,,,,,,,
5,50%,,,,,,,,,
6,75%,,,,,,,,,
7,max,,,,,,,,,


In [22]:
# EDA duplicates: count rows that repeat an earlier row and express that as a percentage of all rows
duplicates = df.duplicated().sum()
duplicate_percentage = round(duplicates / len(df) * 100, 1)

# Preview every member of a duplicate group (keep=False marks first occurrences too)
df.loc[df.duplicated(keep=False)].head(n=10)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC,target


In [25]:
# EDA NaNs: per-column count of missing values and its share of all rows in percent
missing = df.isnull().sum().to_frame(name='Amount')
missing['Percentage'] = (missing['Amount'] / len(df) * 100).round(1)

# Show only the columns that actually contain missing values
missing.loc[missing['Amount'] != 0]

Unnamed: 0,Amount,Percentage


In [29]:
# EDA shape: report the frame's dimensions, then show its first and last three rows in one table
print('Number of rows and columns: ', df.shape)
print('-' * 50)
pd.concat([df.head(n=3), df.tail(n=3)], ignore_index=True)

Number of rows and columns:  (9333, 9)
--------------------------------------------------


Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC
3,test_id_9330,2018-09-03,UG 1601,SJJ,TUN,2018-09-03 09:20:00,2018-09-03 11.10.00,SCH,TU CR9ISA
4,test_id_9331,2018-09-15,UG 0008,TUN,DJE,2018-09-15 14:30:00,2018-09-15 15.30.00,SCH,UG AT7LBD
5,test_id_9332,2018-09-19,UG 0009,DJE,TUN,2018-09-19 16:15:00,2018-09-19 17.15.00,SCH,UG AT7LBD


In [30]:
# EDA Uniques: number of distinct values per column, largest first, laid out as a single row
unique_counts = (
    df.nunique()
    .to_frame(name='Amount')
    .sort_values(by='Amount', ascending=False)
    .transpose()
)
unique_counts

Unnamed: 0,ID,STA,STD,FLTID,DATOP,ARRSTN,DEPSTN,AC,STATUS
Amount,9333,7290,7012,700,86,84,82,44,5


In [31]:
# Preview the first ten rows
df.head(n=10)

Unnamed: 0,ID,DATOP,FLTID,DEPSTN,ARRSTN,STD,STA,STATUS,AC
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC
3,test_id_3,2016-05-11,TU 0848,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13.10.00,ATA,TU 31BIMO
4,test_id_4,2016-05-11,TU 0635,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12.35.00,ATA,TU 736IOQ
5,test_id_5,2016-05-11,UG 1313,CDG,SFA,2016-05-11 11:10:00,2016-05-11 12.40.00,DEP,TU CR9ISA
6,test_id_6,2016-05-11,TU 0515,BCN,TUN,2016-05-11 10:40:00,2016-05-11 12.15.00,ATA,TU 32AIMF
7,test_id_7,2016-05-11,TU 0712,CMN,TUN,2016-05-11 10:35:00,2016-05-11 13.00.00,ATA,TU 32AIMN
8,test_id_8,2016-05-11,TU 0751,LYS,TUN,2016-05-11 10:40:00,2016-05-11 12.35.00,ATA,TU 320IMS
9,test_id_9,2016-05-11,TU 0745,FRA,TUN,2016-05-11 10:30:00,2016-05-11 12.55.00,ATA,TU 32AIMC


## Data cleaning

In [36]:
# Clean column names: spaces become underscores, then everything is lower-cased
df.columns = df.columns.str.replace(' ', '_').str.lower()

The 'ac' feature encodes the aircraft model: characters 3–5 hold the model code. Extract that code into a new 'model' column, then map it to the aircraft producer.

In [38]:
# Take characters 3-5 of 'ac' as the aircraft model code and preview the result
df['model'] = df['ac'].str.slice(start=3, stop=6)
df.head(n=5)

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,status,ac,model
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF,32A
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW,320
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC,32A
3,test_id_3,2016-05-11,TU 0848,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13.10.00,ATA,TU 31BIMO,31B
4,test_id_4,2016-05-11,TU 0635,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12.35.00,ATA,TU 736IOQ,736


In [39]:
# EDA Uniques: recount distinct values per column (largest first) as a single-row table
unique_counts = (
    df.nunique()
    .to_frame(name='Amount')
    .sort_values(by='Amount', ascending=False)
    .transpose()
)
unique_counts

Unnamed: 0,id,sta,std,fltid,datop,arrstn,depstn,ac,model,status
Amount,9333,7290,7012,700,86,84,82,44,13,5


In [42]:
# Distinct model codes present in the 'model' column
set(df['model'].unique())

{'31A',
 '31B',
 '320',
 '321',
 '32A',
 '332',
 '343',
 '345',
 '733',
 '734',
 '736',
 'AT7',
 'CR9'}

In [43]:
# Lookup table: aircraft model code -> name of its producer
replacement_dict = {
    '31A': 'Airbus',
    '31B': 'Airbus',
    '320': 'Airbus',
    '321': 'Airbus',
    '32A': 'Airbus',
    '332': 'Airbus',
    '343': 'Airbus',
    '345': 'Airbus',
    '733': 'Boeing',
    '734': 'Boeing',
    '736': 'Boeing',
    'AT7': 'ATR',
    'CR9': 'Bombardier'
}

# Derive the 'producer' column from 'model'. `replace` leaves any code that has
# no entry in the lookup table unchanged, so unknown models stay visible.
df['producer'] = df['model'].replace(replacement_dict)


In [44]:
# Preview the frame including the derived 'model' and 'producer' columns.
# 'model' is already computed from 'ac' in an earlier cell, so it is not recomputed here.
df.head()

Unnamed: 0,id,datop,fltid,depstn,arrstn,std,sta,status,ac,model,producer
0,test_id_0,2016-05-04,TU 0700,DJE,TUN,2016-05-04 06:40:00,2016-05-04 07.30.00,ATA,TU 32AIMF,32A,Airbus
1,test_id_1,2016-05-05,TU 0395,TUN,BKO,2016-05-05 15:20:00,2016-05-05 20.05.00,ATA,TU 320IMW,320,Airbus
2,test_id_2,2016-05-06,TU 0745,FRA,TUN,2016-05-06 10:00:00,2016-05-06 12.25.00,ATA,TU 32AIMC,32A,Airbus
3,test_id_3,2016-05-11,TU 0848,BEY,TUN,2016-05-11 09:40:00,2016-05-11 13.10.00,ATA,TU 31BIMO,31B,Airbus
4,test_id_4,2016-05-11,TU 0635,ORY,MIR,2016-05-11 09:50:00,2016-05-11 12.35.00,ATA,TU 736IOQ,736,Boing


## Feature engineering

## Splitting data for testing 

## Training the model

## Evaluation

## Validation

## Packaging