In [1]:
#from google.colab import drive
#drive.mount('/content/drive/')

In [2]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns 
import matplotlib.pyplot as plt
import ast
import warnings
import joblib
warnings.filterwarnings("ignore")

In [3]:
# Read 'ClassificationTestSet.csv' from the notebook's working directory (relative path).
dataset = pd.read_csv('ClassificationTestSet.csv')
# Show the first rows as a quick check of what was parsed.
dataset.head()

Unnamed: 0,date,airline,ch_code,num_code,dep_time,time_taken,stop,arr_time,type,route,TicketCategory
0,8/3/2022,Air India,AI,430,9:55,11h 20m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,21:15,economy,"{'source': 'Chennai', 'destination': 'Mumbai'}",cheap
1,7/3/2022,Indigo,6E,926,18:30,04h 55m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,23:25,economy,"{'source': 'Delhi', 'destination': 'Mumbai'}",cheap
2,24-03-2022,Indigo,6E,6491,13:05,07h 40m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,20:45,economy,"{'source': 'Bangalore', 'destination': 'Mumbai'}",cheap
3,27-03-2022,Air India,AI,473,18:40,22h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,17:05,economy,"{'source': 'Delhi', 'destination': 'Bangalore'}",cheap
4,7/3/2022,Indigo,6E,684,21:55,01h 15m,non-stop,23:10,economy,"{'source': 'Bangalore', 'destination': 'Hydera...",cheap


In [4]:
# Pre-create the engineered columns, filled with 0, in this exact order;
# the cells further down overwrite them with real values.
for placeholder_column in (
    'full_information_timestamp',
    'flight_day',
    'flight_month',
    'week_day_of_flight',
    'departure_time_of_the_day',
    'arrival_time_of_the_day',
    'num_of_hours_taken',
    'num_of_minutes_taken',
    'num_of_stops',
    'source',
    'destination',
    'distance_between_2_cities',
):
    dataset[placeholder_column] = 0

In [5]:
def format_date(date):
    """Return `date` with every '/' separator replaced by '-'."""
    return '-'.join(date.split('/'))

In [6]:
# Store the '-'-separated form of every raw date string.
dataset['full_information_timestamp'] = dataset['date'].apply(format_date)

In [7]:
def extract_day_month(date):
    """Split a 'day-month-year' string into numeric parts and a readable date.

    Parameters
    ----------
    date : str
        Date written as 'D-M-YYYY' or 'DD-MM-YYYY' (day first, '-' separated).

    Returns
    -------
    tuple
        (day as int, month as int, 'MonthName D, YYYY' string). The day and
        year are copied into the string exactly as they appear in `date`.

    Raises
    ------
    ValueError
        If the day or month is not an integer, or the month is outside 1..12.
    IndexError
        If `date` has fewer than three '-'-separated parts.
    """
    # English names are spelled out on purpose: a locale-dependent lookup
    # could produce names the later date parsing does not understand.
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )
    pieces = date.split('-')
    day = int(pieces[0])
    month = int(pieces[1])
    if not 1 <= month <= 12:
        raise ValueError('month out of range in date ' + repr(date))
    # Use the real month name instead of treating every non-February month as March.
    new_date_format = month_names[month - 1] + ' ' + pieces[0] + ', ' + pieces[2]
    return day, month, new_date_format

In [8]:
# Expand each '-'-separated date into day, month and a "Month D, YYYY" string;
# the string replaces the previous content of full_information_timestamp.
dataset[['flight_day', 'flight_month', 'full_information_timestamp']] = pd.DataFrame(
    dataset['full_information_timestamp'].apply(extract_day_month).tolist(),
    index=dataset.index,
)

In [9]:
# Parse the "Month D, YYYY" strings into pandas datetimes.
dataset['full_information_timestamp'] = pd.to_datetime(dataset.full_information_timestamp)

In [10]:
def extract_weekday(day):
    """Return whatever `day.day_name()` yields for the given date object."""
    weekday_name = day.day_name()
    return weekday_name

In [11]:
# Derive the weekday name from each parsed flight date.
dataset['week_day_of_flight'] = dataset['full_information_timestamp'].apply(extract_weekday)

In [12]:
# Airlines that keep their own name; every other airline is merged into 'Other_airline'.
high_frequency_airlines = ['Vistara', 'Air India']


def remove_low_frequency_airlines(frame=None, keep=None):
    """Replace every airline not in the keep-list with 'Other_airline', in place.

    Parameters
    ----------
    frame : mapping with an 'airline' entry, optional
        Object whose 'airline' entry is rewritten. Defaults to the notebook's
        global `dataset`, which is what the original zero-argument call did.
    keep : container of str, optional
        Airline names left unchanged. Defaults to `high_frequency_airlines`.

    Returns
    -------
    None
        The 'airline' entry of `frame` is overwritten.
    """
    # Resolve the defaults at call time so a bare call still targets the globals.
    target = dataset if frame is None else frame
    kept_names = high_frequency_airlines if keep is None else keep
    target['airline'] = [
        name if name in kept_names else 'Other_airline'
        for name in target['airline']
    ]

In [13]:
# Rewrite dataset['airline'] so that names outside `high_frequency_airlines` become 'Other_airline'.
remove_low_frequency_airlines()

In [14]:
def calculate_time_taken(time_taken):
    """Split a duration string such as '11h 20m' into (hours, minutes).

    The string is split on single spaces into an hour token and a minute
    token. When the hour token contains a '.', the digits after the dot are
    moved in front of the minute token and the hour token keeps only the
    integer part.

    The minute digits are then scaled as `digits / 100 * 60`; an empty
    minute token yields 0.

    NOTE(review): this scaling treats the minute digits as hundredths of an
    hour, so '11h 20m' gives 12.0 minutes rather than 20. Presumably the saved
    models were fitted on the same scaling -- confirm before changing it.

    Returns
    -------
    tuple
        (hours as int, minutes as float, or int 0 when no digits are present).
    """
    tokens = time_taken.split(' ')
    hour_pieces = tokens[0].split('.')
    if len(hour_pieces) > 1:
        # Decimal hours: keep the integer part, push the fraction digits
        # (minus the trailing unit letter) onto the minute token.
        tokens[0] = hour_pieces[0] + 'h'
        tokens[1] = hour_pieces[1][:-1] + tokens[1]
    hours = int(tokens[0][:-1])
    minute_digits = tokens[1][:-1]
    minutes = (int(minute_digits) / 100) * 60 if minute_digits != '' else 0
    return hours, minutes

In [15]:
# Split every duration string into its hour and (scaled) minute components.
dataset[['num_of_hours_taken', 'num_of_minutes_taken']] = pd.DataFrame(
    dataset['time_taken'].apply(calculate_time_taken).tolist(),
    index=dataset.index,
)

In [16]:
def extract_hour(time):
    """Return the integer found before the first ':' of an 'HH:MM' string."""
    hour_text = time.partition(':')[0]
    return int(hour_text)

In [17]:
# Keep only the hour of the departure time.
dataset['dep_time'] = dataset['dep_time'].apply(extract_hour)

In [18]:
# Keep only the hour of the arrival time.
dataset['arr_time'] = dataset['arr_time'].apply(extract_hour)

In [19]:
def categorize_time(hour):
    """Map an hour to 'Early morning', 'Morning', 'Afternoon' or 'Evening'.

    Hours below 6 are 'Early morning', below 12 'Morning', below 18
    'Afternoon'; everything else is 'Evening'.
    """
    # Upper bounds are checked in ascending order, so the first match wins.
    for upper_bound, label in ((6, 'Early morning'), (12, 'Morning'), (18, 'Afternoon')):
        if hour < upper_bound:
            return label
    return 'Evening'

In [20]:
# Bucket the departure hour into a part of the day.
dataset['departure_time_of_the_day'] = dataset['dep_time'].apply(categorize_time)

In [21]:
# Bucket the arrival hour into a part of the day.
dataset['arrival_time_of_the_day'] = dataset['arr_time'].apply(categorize_time)

In [22]:
def split_num_of_stops(stop):
    """Return 0 for text starting with 'non-stop', 1 for '1-stop', otherwise 2."""
    if stop[:8] == 'non-stop':
        return 0
    # Only the prefix is compared, so trailing text after the label is ignored.
    return 1 if stop[:6] == '1-stop' else 2

In [23]:
# Turn the free-text stop description into a stop count.
dataset['num_of_stops'] = dataset['stop'].apply(split_num_of_stops)

In [24]:
# Binary flag: 1 for 'business', 0 for every other value.
dataset['type'] = [int(trip_class == 'business') for trip_class in dataset['type']]

In [25]:
def split_route(route):
    """Parse a dict-literal string and return its ('source', 'destination') values."""
    # literal_eval only evaluates Python literals, so the text is parsed, not executed.
    endpoints = ast.literal_eval(route)
    return endpoints['source'], endpoints['destination']

In [26]:
# Unpack the route dict-string into separate source and destination columns.
dataset[['source', 'destination']] = pd.DataFrame(
    dataset['route'].apply(split_route).tolist(),
    index=dataset.index,
)

In [27]:
# Distance per city pair, keyed by a (city, city) tuple.
# Each pair is stored in one direction only.
distances = {
    ('Mumbai', 'Kolkata'): 2167,
    ('Mumbai', 'Hyderabad'): 721,
    ('Mumbai', 'Chennai'): 1344,
    ('Mumbai', 'Bangalore'): 995,
    ('Mumbai', 'Delhi'): 1439,
    ('Delhi', 'Bangalore'): 2169,
    ('Delhi', 'Kolkata'): 1554,
    ('Delhi', 'Hyderabad'): 1579,
    ('Delhi', 'Chennai'): 2202,
    ('Bangalore', 'Kolkata'): 1560,
    ('Bangalore', 'Hyderabad'): 569,
    ('Bangalore', 'Chennai'): 348,
    ('Kolkata', 'Hyderabad'): 1489,
    ('Kolkata', 'Chennai'): 1663,
    ('Hyderabad', 'Chennai'): 628,
}

In [28]:
def calculate_distance(source, destination):
    """Look up the distance between two cities in the `distances` table.

    The table stores each pair in one direction only, so the reversed key is
    tried when (source, destination) is not present.

    Raises
    ------
    KeyError
        If neither (source, destination) nor (destination, source) is a key
        of `distances`.
    """
    try:
        return distances[source, destination]
    except KeyError:
        # Only a missing key triggers the reversed lookup; any other error
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        return distances[destination, source]

In [29]:
# Look up the route distance row by row ...
dataset['distance_between_2_cities'] = dataset.apply(
    lambda row: calculate_distance(row['source'], row['destination']),
    axis=1,
)
# ... then scale by 2202, the largest entry of the `distances` table.
dataset['distance_between_2_cities'] = dataset['distance_between_2_cities'].div(2202)

In [30]:
def encode_ticket_category(category):
    """Encode a ticket category as an integer.

    'cheap' -> 0, 'moderate' -> 1, 'expensive' -> 2, any other value -> 3.
    """
    known_codes = {'cheap': 0, 'moderate': 1, 'expensive': 2}
    return known_codes.get(category, 3)

In [31]:
# Replace the category names with their integer codes.
dataset['TicketCategory'] = dataset['TicketCategory'].apply(encode_ticket_category)

In [32]:
def one_hot_encode_airline(airline):
    """Return 0/1 flags in the order (Vistara, Air India, Other_airline).

    An unrecognised value produces all zeros.
    """
    return tuple(
        int(airline == known)
        for known in ('Vistara', 'Air India', 'Other_airline')
    )

In [33]:
# One indicator column per airline; names follow the encoder's return order.
dataset[['airline_Vistara', 'airline_Air India', 'airline_Other_airline']] = pd.DataFrame(
    dataset['airline'].apply(one_hot_encode_airline).tolist(),
    index=dataset.index,
)

In [34]:
def one_hot_encode_source(source):
    """Return 0/1 flags in the order (Bangalore, Delhi, Hyderabad, Kolkata).

    Any other source city produces all zeros.
    """
    return tuple(
        int(source == city)
        for city in ('Bangalore', 'Delhi', 'Hyderabad', 'Kolkata')
    )

In [35]:
# One indicator column per encoded source city; names follow the encoder's return order.
dataset[['source_Bangalore', 'source_Delhi', 'source_Hyderabad', 'source_Kolkata']] = pd.DataFrame(
    dataset['source'].apply(one_hot_encode_source).tolist(),
    index=dataset.index,
)

In [36]:
def one_hot_encode_destination(destination):
    """Return 0/1 flags in the order (Bangalore, Delhi, Mumbai, Kolkata, Chennai).

    Any other destination produces all zeros.
    """
    return tuple(
        int(destination == city)
        for city in ('Bangalore', 'Delhi', 'Mumbai', 'Kolkata', 'Chennai')
    )

In [37]:
# One indicator column per encoded destination; names follow the encoder's return order.
dataset[[
    'destination_Bangalore',
    'destination_Delhi',
    'destination_Mumbai',
    'destination_Kolkata',
    'destination_Chennai',
]] = pd.DataFrame(
    dataset['destination'].apply(one_hot_encode_destination).tolist(),
    index=dataset.index,
)

In [38]:
def one_hot_encode_day(day):
    """Return 0/1 flags in the order (Tuesday, Wednesday, Saturday, Sunday).

    Any other weekday name produces all zeros.
    """
    return tuple(
        int(day == weekday)
        for weekday in ('Tuesday', 'Wednesday', 'Saturday', 'Sunday')
    )

In [39]:
# One indicator column per encoded weekday; names follow the encoder's return order.
dataset[[
    'week_day_of_flight_Tuesday',
    'week_day_of_flight_Wednesday',
    'week_day_of_flight_Saturday',
    'week_day_of_flight_Sunday',
]] = pd.DataFrame(
    dataset['week_day_of_flight'].apply(one_hot_encode_day).tolist(),
    index=dataset.index,
)

In [40]:
def one_hot_encode_dep_time(time):
    """Return 0/1 flags in the order (Early morning, Morning, Afternoon, Evening).

    Any other label produces all zeros.
    """
    return tuple(
        int(time == part_of_day)
        for part_of_day in ('Early morning', 'Morning', 'Afternoon', 'Evening')
    )

In [41]:
# One indicator column per departure part-of-day; names follow the encoder's return order.
dataset[[
    'departure_time_of_the_day_Early morning',
    'departure_time_of_the_day_Morning',
    'departure_time_of_the_day_Afternoon',
    'departure_time_of_the_day_Evening',
]] = pd.DataFrame(
    dataset['departure_time_of_the_day'].apply(one_hot_encode_dep_time).tolist(),
    index=dataset.index,
)

In [42]:
def one_hot_encode_arr_time(time):
    """Return 0/1 flags in the order (Early morning, Afternoon, Evening).

    'Morning' has no flag of its own here, so it (like any other label)
    produces all zeros.
    """
    return tuple(
        int(time == part_of_day)
        for part_of_day in ('Early morning', 'Afternoon', 'Evening')
    )

In [43]:
# One indicator column per encoded arrival part-of-day; names follow the encoder's return order.
dataset[[
    'arrival_time_of_the_day_Early morning',
    'arrival_time_of_the_day_Afternoon',
    'arrival_time_of_the_day_Evening',
]] = pd.DataFrame(
    dataset['arrival_time_of_the_day'].apply(one_hot_encode_arr_time).tolist(),
    index=dataset.index,
)

In [44]:
# Names of the columns to keep, grouped by kind; concatenation order is the list order.
features = (
    # numeric / already-encoded columns
    ['num_code', 'dep_time', 'arr_time', 'type', 'TicketCategory',
     'flight_month', 'num_of_hours_taken', 'num_of_minutes_taken',
     'num_of_stops', 'distance_between_2_cities']
    # airline indicators
    + ['airline_Air India', 'airline_Other_airline', 'airline_Vistara']
    # source-city indicators
    + ['source_Bangalore', 'source_Delhi', 'source_Hyderabad', 'source_Kolkata']
    # destination-city indicators
    + ['destination_Bangalore', 'destination_Chennai', 'destination_Delhi',
       'destination_Kolkata', 'destination_Mumbai']
    # weekday indicators
    + ['week_day_of_flight_Saturday', 'week_day_of_flight_Sunday',
       'week_day_of_flight_Tuesday', 'week_day_of_flight_Wednesday']
    # departure part-of-day indicators
    + ['departure_time_of_the_day_Afternoon',
       'departure_time_of_the_day_Early morning',
       'departure_time_of_the_day_Evening',
       'departure_time_of_the_day_Morning']
    # arrival part-of-day indicators
    + ['arrival_time_of_the_day_Afternoon',
       'arrival_time_of_the_day_Early morning',
       'arrival_time_of_the_day_Evening']
)

In [45]:
# Remove, in place and in one call, every column that is not listed in `features`.
dataset.drop(
    columns=[column for column in dataset.columns if column not in features],
    inplace=True,
)

In [46]:
# Re-select the columns explicitly so that they appear in exactly this order.
dataset = dataset.loc[:, [
    'num_code',
    'dep_time',
    'arr_time',
    'type',
    'TicketCategory',
    'flight_month',
    'num_of_hours_taken',
    'num_of_minutes_taken',
    'num_of_stops',
    'distance_between_2_cities',
    'airline_Air India',
    'airline_Other_airline',
    'airline_Vistara',
    'source_Bangalore',
    'source_Delhi',
    'source_Hyderabad',
    'source_Kolkata',
    'destination_Bangalore',
    'destination_Chennai',
    'destination_Delhi',
    'destination_Kolkata',
    'destination_Mumbai',
    'week_day_of_flight_Saturday',
    'week_day_of_flight_Sunday',
    'week_day_of_flight_Tuesday',
    'week_day_of_flight_Wednesday',
    'departure_time_of_the_day_Afternoon',
    'departure_time_of_the_day_Early morning',
    'departure_time_of_the_day_Evening',
    'departure_time_of_the_day_Morning',
    'arrival_time_of_the_day_Afternoon',
    'arrival_time_of_the_day_Early morning',
    'arrival_time_of_the_day_Evening',
]]

In [47]:
# Display the preprocessed frame as the cell output.
dataset

Unnamed: 0,num_code,dep_time,arr_time,type,TicketCategory,flight_month,num_of_hours_taken,num_of_minutes_taken,num_of_stops,distance_between_2_cities,...,week_day_of_flight_Sunday,week_day_of_flight_Tuesday,week_day_of_flight_Wednesday,departure_time_of_the_day_Afternoon,departure_time_of_the_day_Early morning,departure_time_of_the_day_Evening,departure_time_of_the_day_Morning,arrival_time_of_the_day_Afternoon,arrival_time_of_the_day_Early morning,arrival_time_of_the_day_Evening
0,430,9,21,0,0,3,11,12.0,1,0.610354,...,0,1,0,0,0,0,1,0,0,1
1,926,18,23,0,0,3,4,33.0,1,0.653497,...,0,0,0,0,0,1,0,0,0,1
2,6491,13,20,0,0,3,7,24.0,1,0.451862,...,0,0,0,1,0,0,0,0,0,1
3,473,18,17,0,0,3,22,15.0,1,0.985014,...,1,0,0,0,0,1,0,1,0,0
4,684,21,23,0,0,3,1,9.0,0,0.258401,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,9894,8,10,0,1,2,25,24.0,1,0.610354,...,0,1,0,0,0,0,1,0,0,0
19996,402,10,8,0,1,3,22,18.0,1,0.755223,...,0,0,0,0,0,0,1,0,0,0
19997,930,7,22,0,1,2,15,6.0,1,0.451862,...,0,0,0,0,0,0,1,0,0,1
19998,537,16,9,1,2,3,17,12.0,1,0.705722,...,0,0,0,1,0,0,0,0,0,0


In [48]:
# Take the target column out of `dataset` (pop removes it in place and returns it).
label = dataset.pop('TicketCategory')
# NOTE(review): from here on `features` is a DataFrame copy of the remaining
# columns and no longer the list of column names defined earlier.
features = dataset.copy()

In [49]:
# Load the saved model from the working directory and score it on the prepared features/label.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code -- only load trusted files.
# NOTE(review): despite the '.h5' extension the file is read with joblib, not an HDF5 reader.
loaded_model = joblib.load('Ada-Boost-Classifier - Copy.h5')
loaded_model.score(features,label)

0.91015

In [50]:
# Load the saved model from the working directory and score it on the prepared features/label.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code -- only load trusted files.
# NOTE(review): despite the '.h5' extension the file is read with joblib, not an HDF5 reader.
loaded_model = joblib.load('Random-Forest-Classifier.h5')
loaded_model.score(features,label)

0.9123

In [51]:
# Load the saved model from the working directory and score it on the prepared features/label.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code -- only load trusted files.
# NOTE(review): despite the '.h5' extension the file is read with joblib, not an HDF5 reader.
loaded_model = joblib.load('Decision-Tree-Classifier - Copy.h5')
loaded_model.score(features,label)

0.9045

In [52]:
# Load the saved model from the working directory and score it on the prepared features/label.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code -- only load trusted files.
# NOTE(review): despite the '.h5' extension the file is read with joblib, not an HDF5 reader.
loaded_model = joblib.load('Extra-Trees-Classifier - Copy.h5')
loaded_model.score(features,label)

0.90535