In [None]:
# Make the user's Google Drive reachable under /content/drive/ (Colab runtime only).
from google.colab import drive

drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


# Import Libraries

In [None]:
import numpy as np
import pandas as pd


from scipy import stats 
from scipy.interpolate import make_interp_spline
import seaborn as sns
import matplotlib.pyplot as plt


#Models
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB

#Evaluation Models
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix

import time

import warnings
warnings.filterwarnings("ignore")

In [None]:
pip install imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# **Load Data**

In [None]:
# Read the one-hot encoded classification dataset from the mounted Drive
# and preview its first five rows.
dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/one_hot_encoded_dataset(classification).csv'
)
dataset.head(n=5)

Unnamed: 0,num_code,dep_time,arr_time,type,TicketCategory,flight_month,num_of_hours_taken,num_of_minutes_taken,num_of_stops,distance_between_2_cities,...,week_day_of_flight_Sunday,week_day_of_flight_Tuesday,week_day_of_flight_Wednesday,departure_time_of_the_day_Afternoon,departure_time_of_the_day_Early morning,departure_time_of_the_day_Evening,departure_time_of_the_day_Morning,arrival_time_of_the_day_Afternoon,arrival_time_of_the_day_Early morning,arrival_time_of_the_day_Evening
0,764,4,6,0,1,2,2,6.0,0,0.653497,...,0,0,0,0,1,0,0,0,0,0
1,764,4,14,0,1,2,10,12.0,1,0.985014,...,0,0,0,0,1,0,0,1,0,0
2,548,4,16,0,1,2,11,18.0,1,0.985014,...,0,0,0,0,1,0,0,1,0,0
3,548,4,12,0,1,2,7,21.0,1,0.985014,...,0,0,0,0,1,0,0,1,0,0
4,548,4,19,0,1,2,14,12.0,1,0.985014,...,0,0,0,0,1,0,0,0,0,1


# **Splitting the Dataset**

In [None]:
# Positional train/test split: the first TRAIN_SIZE rows are used for training,
# every remaining row for testing. No shuffling is done, so the split depends on
# the row order of the loaded CSV.
TRAIN_SIZE = 192167
TARGET_COLUMN = 'TicketCategory'

# Derive features and target without mutating `dataset`, so this cell can be
# re-run on the same kernel (an in-place drop would raise KeyError the second time).
features = dataset.drop(columns=[TARGET_COLUMN])
target = dataset[TARGET_COLUMN]

x_train, x_test = features.iloc[:TRAIN_SIZE], features.iloc[TRAIN_SIZE:]
y_train, y_test = target.iloc[:TRAIN_SIZE], target.iloc[TRAIN_SIZE:]

# The two splits together must cover every row exactly once.
assert len(x_train) + len(x_test) == len(dataset)

**KNN Imputer**

In [None]:
# Fill missing feature values with KNNImputer (default settings).
# The imputer is fitted on the training split only and then applied to both
# splits, so the test split does not influence the imputation.
from sklearn.impute import KNNImputer

imputer = KNNImputer()

# fit_transform()/transform() return plain float arrays and do NOT modify their
# input, so the results must be assigned back. Wrapping them in DataFrames keeps
# the column names and row index for the cells that follow.
x_train = pd.DataFrame(imputer.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(imputer.transform(x_test), columns=x_test.columns, index=x_test.index)

# Sanity check: nothing may be missing after imputation.
assert not x_train.isna().any().any()
assert not x_test.isna().any().any()

array([[764.,   4.,   6., ...,   0.,   0.,   0.],
       [764.,   4.,  14., ...,   1.,   0.,   0.],
       [548.,   4.,  16., ...,   1.,   0.,   0.],
       ...,
       [853.,  16.,  17., ...,   1.,   0.,   0.],
       [133.,  16.,  22., ...,   0.,   0.,   1.],
       [687.,  16.,   8., ...,   0.,   0.,   0.]])

In [None]:
# Number of training samples per TicketCategory class (shows the class imbalance).
y_train.value_counts()

1    89872
0    46013
3    38311
2    17971
Name: TicketCategory, dtype: int64

**Imbalanced Learn**

In [None]:
# Balance the training split by oversampling with SMOTE (seeded for repeatability).
# Only the training data is resampled; the test split keeps its original distribution.
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=27)
x_train, y_train = sm.fit_resample(X=x_train, y=y_train)

In [None]:
# Number of training samples per TicketCategory class at this point in the notebook.
y_train.value_counts()

1    89872
2    89872
0    89872
3    89872
Name: TicketCategory, dtype: int64

In [None]:
# def compare_orgin_predict(y_test,y_pred,N=40):
#   x_ax = range(len(y_test[:N]))
#   plt.plot(x_ax, y_test[:N], label="original")
#   plt.plot(x_ax, y_pred[:N], label="predicted")
#   plt.title("Flight Price test and predicted data")
#   plt.xlabel('Observation Number')
#   plt.ylabel('TicketCategory')
#   plt.legend()
#   plt.show()

# **Modeling**

In [None]:
# Candidate classifiers to benchmark, keyed by the label printed in the report.
# Every stochastic estimator is seeded so that repeated runs give the same scores.
RANDOM_STATE = 27

models = {
    "GB-Classifier"      : GradientBoostingClassifier(learning_rate=0.01, random_state=RANDOM_STATE),
    "BaggingClassifier"  : BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=RANDOM_STATE),
    "RF-Classifier"      : RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    # TicketCategory has four classes, so a multi-class objective is required;
    # 'binary:logistic' only describes a two-class problem.
    "XGB-Classifier"     : XGBClassifier(objective='multi:softprob', learning_rate=0.01, n_estimators=100, random_state=RANDOM_STATE),
    #"KNN"                : KNeighborsClassifier(n_neighbors = 30, weights = 'distance'),
    "DT-Classifier"      : DecisionTreeClassifier(random_state=RANDOM_STATE),
    "AdaBoostClassifier" : AdaBoostClassifier(DecisionTreeClassifier(), algorithm="SAMME", n_estimators=100, random_state=RANDOM_STATE),
    "HGBoosting"         : HistGradientBoostingClassifier(random_state=RANDOM_STATE),
    "ETClassifier"       : ExtraTreesClassifier(n_estimators=100, random_state=RANDOM_STATE)
}


In [None]:
# Fit every candidate model on the training data and print its fit time,
# train/test accuracy, confusion matrix, weighted precision/recall/F1 and the
# per-class classification report on the test split.
for name, model in models.items():

    # process_time() measures CPU time of this process, not wall-clock time.
    start = time.process_time()
    print(f'Using model: {name}')
    model.fit(x_train, y_train)
    print(f'Time: {time.process_time() - start}')

    print(f'Training Score: {model.score(x_train, y_train)}')
    print(f'Test Score: {model.score(x_test, y_test)}')

    y_pred = model.predict(x_test)

    print(f'confusion_matrix:\n {confusion_matrix(y_test, y_pred)}')
    print(f'accuracy_score: {accuracy_score(y_test, y_pred)}')
    print(f'precision_score: {precision_score(y_test, y_pred,average="weighted")}')
    print(f'recall_score: {recall_score(y_test, y_pred,average="weighted")}')
    # Report the actual weighted F1 (this line previously re-printed the recall).
    print(f'f1_score: {f1_score(y_test, y_pred,average="weighted")}')

    print('-'*40)

    print(classification_report(y_test, y_pred))

    print('-'*80)
    # compare_orgin_predict(y_test,y_pred)
    print('-'*80)

Using model: GB-Classifier
Time: 279.78395061699985
Training Score: 0.7142658447569877
Test Score: 0.6652653927813164
confusion_matrix:
 [[18893   236     0     0]
 [11242  2488   734     4]
 [    0     0  2580  1933]
 [    0     0  1617  7373]]
accuracy_score: 0.6652653927813164
precision_score: 0.7364807238169289
recall_score: 0.6652653927813164
f1_score: 0.6652653927813164
----------------------------------------
              precision    recall  f1-score   support

           0       0.63      0.99      0.77     19129
           1       0.91      0.17      0.29     14468
           2       0.52      0.57      0.55      4513
           3       0.79      0.82      0.81      8990

    accuracy                           0.67     47100
   macro avg       0.71      0.64      0.60     47100
weighted avg       0.74      0.67      0.61     47100

--------------------------------------------------------------------------------
----------------------------------------------------------------

# Final Models

In [None]:
# Final set of classifiers that are trained and saved to disk. The dictionary
# keys double as the file-name stems used when the fitted models are dumped.
# Every estimator is seeded so the persisted models can be reproduced.
RANDOM_STATE = 27

models = {
    "Bagging-Classifier"  : BaggingClassifier(tree.DecisionTreeClassifier(random_state=1), random_state=RANDOM_STATE),
    "Random-Forest-Classifier"      : RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Decision-Tree-Classifier"      : DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Ada-Boost-Classifier" : AdaBoostClassifier(DecisionTreeClassifier(), algorithm="SAMME", n_estimators=100, random_state=RANDOM_STATE),
    "Extra-Trees-Classifier"       : ExtraTreesClassifier(n_estimators=100, random_state=RANDOM_STATE)
}

In [None]:
import joblib

# Fit each final model, print its fit time and test metrics, then persist the
# fitted estimator next to the notebook as '<name>.h5'.
# NOTE: despite the '.h5' suffix these files are joblib pickles, not HDF5;
# load them with joblib.load() and only from trusted sources.
for name, model in models.items():

    # process_time() measures CPU time of this process, not wall-clock time.
    start = time.process_time()
    print(f'Using model: {name}')
    model.fit(x_train, y_train)
    print(f'Time: {time.process_time() - start}')

    print(f'Training Score: {model.score(x_train, y_train)}')
    print(f'Test Score: {model.score(x_test, y_test)}')

    y_pred = model.predict(x_test)

    print(f'confusion_matrix:\n {confusion_matrix(y_test, y_pred)}')
    print(f'accuracy_score: {accuracy_score(y_test, y_pred)}')
    print(f'precision_score: {precision_score(y_test, y_pred,average="weighted")}')
    print(f'recall_score: {recall_score(y_test, y_pred,average="weighted")}')
    # Report the actual weighted F1 (this line previously re-printed the recall).
    print(f'f1_score: {f1_score(y_test, y_pred,average="weighted")}')
    joblib.dump(model, f'{name}.h5')

    print('-'*80)
    print('-'*80)

# Persist the fitted imputer as well so the same preprocessing can be reused at inference time.
joblib.dump(imputer, 'imputer.h5')

Using model: Bagging-Classifier
Time: 16.950640380000095
Training Score: 0.9635926651237315
Test Score: 0.9161571125265393
confusion_matrix:
 [[16623  2506     0     0]
 [ 1040 13353    63    12]
 [    0    35  4344   134]
 [    0     4   155  8831]]
accuracy_score: 0.9161571125265393
precision_score: 0.9192299474970436
recall_score: 0.9161571125265393
f1_score: 0.9161571125265393
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Using model: Random-Forest-Classifier
Time: 42.741678181
Training Score: 0.9643576419797044
Test Score: 0.9202335456475584
confusion_matrix:
 [[16773  2356     0     0]
 [  995 13394    66    13]
 [    0    14  4330   169]
 [    0     1   143  8846]]
accuracy_score: 0.9202335456475584
precision_score: 0.9227985572560078
recall_score: 0.9202335456475584
f1_score: 0.9202335456475584
----------------------------------------------------------------------

['imputer.h5']