In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
import time, datetime
import re

In [2]:
# Load the cleaned flight records; every later cell in this notebook works on `data`.
data = pd.read_csv("./New_Clean.csv")

In [3]:
def cyclical_encoding(hour, period=24):
    """Project a periodic value onto the unit circle as a (sine, cosine) pair.

    Encoding a clock value this way keeps the two ends of the cycle adjacent:
    with the default period, hour 23 and hour 0 land next to each other instead
    of 23 units apart.

    Args:
        hour: Position within the cycle. Only arithmetic and numpy ufuncs are
            applied to it, so a scalar, a numpy array or a pandas Series all work.
        period: Length of one full cycle, in the same units as ``hour``.
            Defaults to 24 (hours in a day); pass e.g. 7 for day-of-week or
            12 for month-of-year.

    Returns:
        A ``(sin, cos)`` tuple with the same shape as ``hour``.
    """
    # Same evaluation order as the original expression so existing results
    # for the default period are unchanged.
    angle = 2 * np.pi * hour / period
    return np.sin(angle), np.cos(angle)

# Add sine/cosine encodings of 'departure_hour' and 'arrival_hour'.
# cyclical_encoding is built from numpy ufuncs, so it is called once on the whole
# column instead of once per row through .apply() + zip(); each call returns the
# (sin, cos) pair as two Series aligned with `data`.
data['departure_hour_sin'], data['departure_hour_cos'] = cyclical_encoding(data['departure_hour'])
data['arrival_hour_sin'], data['arrival_hour_cos'] = cyclical_encoding(data['arrival_hour'])


In [4]:
# Show the first row to check that the four sine/cosine columns were added.
data.head(1)

Unnamed: 0,day_of_week,airline,flight,source_city,departure_hour,departure_category,stops,arrival_hour,arrival_category,destination_city,class,duration,days_left,price,departure_hour_sin,departure_hour_cos,arrival_hour_sin,arrival_hour_cos
0,Friday,SpiceJet,SG-8709,Delhi,18,Evening,0,21,Night,Mumbai,economy,130,1,5953,-1.0,-1.83697e-16,-0.707107,0.707107


In [5]:
# List the distinct airline labels; each becomes one indicator column when encoded.
data['airline'].unique()

array(['SpiceJet', 'AirAsia', 'Vistara', 'GO FIRST', 'Indigo',
       'Air India', 'Trujet', 'StarAir'], dtype=object)

In [6]:
# sparse_output=False requests a dense array from the encoder, which is what the
# later pd.DataFrame(...) wrapping expects.
# NOTE(review): the `sparse_output` keyword presumably needs a recent scikit-learn
# release (older ones used `sparse`) — confirm against the pinned version.
encoder = OneHotEncoder(sparse_output=False)

In [7]:
# One-hot encode 'airline': fit the encoder on the column and cast the result
# to integer 0/1 indicators.
airline_encoded = encoder.fit_transform(data[['airline']]).astype(int)
# Column labels generated by the fitted encoder (prefix 'airline_' + category).
airline_names = encoder.get_feature_names_out(['airline'])
# Build the indicator frame on data's own index. pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index here would silently misalign rows
# (and inject NaNs) whenever `data` does not carry a default RangeIndex.
airline_encoded_df = pd.DataFrame(airline_encoded, columns=airline_names, index=data.index)

# Append the indicator columns next to the existing ones.
data = pd.concat([data, airline_encoded_df], axis=1)

In [8]:
# Remove the raw 'flight' and 'airline' columns from `data` in place.
data.drop(columns=['flight', 'airline'], inplace=True)

In [9]:
# Inspect the column labels after the airline indicators were added and the raw columns dropped.
data.columns

Index(['day_of_week', 'source_city', 'departure_hour', 'departure_category',
       'stops', 'arrival_hour', 'arrival_category', 'destination_city',
       'class', 'duration', 'days_left', 'price', 'departure_hour_sin',
       'departure_hour_cos', 'arrival_hour_sin', 'arrival_hour_cos',
       'airline_Air India', 'airline_AirAsia', 'airline_GO FIRST',
       'airline_Indigo', 'airline_SpiceJet', 'airline_StarAir',
       'airline_Trujet', 'airline_Vistara'],
      dtype='object')

In [10]:
# Fresh encoder for the two city columns; it is refitted for each column below,
# so after this cell it holds the 'destination_city' categories only.
encoder = OneHotEncoder(sparse_output=False)

# One-hot encode 'source_city' as integer 0/1 indicators.
source_encoded = encoder.fit_transform(data[['source_city']]).astype(int)
# Column labels generated by the fitted encoder (prefix 'source_city_' + category).
source_city_names = encoder.get_feature_names_out(['source_city'])
# Keep data's index so the axis=1 concat below pairs each indicator row with
# its original row even when `data` has a non-default index.
source_encoded_df = pd.DataFrame(source_encoded, columns=source_city_names, index=data.index)

# One-hot encode 'destination_city' the same way (this refits the encoder).
destination_encoded = encoder.fit_transform(data[['destination_city']]).astype(int)
# Column labels generated by the fitted encoder (prefix 'destination_city_' + category).
destination_city_names = encoder.get_feature_names_out(['destination_city'])
# Same index alignment as for the source-city frame.
destination_encoded_df = pd.DataFrame(destination_encoded, columns=destination_city_names, index=data.index)

# Append both sets of indicator columns: source cities first, then destinations.
data = pd.concat([data, source_encoded_df, destination_encoded_df], axis=1)


In [11]:
# Remove the raw 'source_city' and 'destination_city' columns from `data` in place.
data.drop(columns=['source_city', 'destination_city'], inplace=True)

In [12]:
# Inspect the column labels after the city indicators were added and the raw city columns dropped.
data.columns

Index(['day_of_week', 'departure_hour', 'departure_category', 'stops',
       'arrival_hour', 'arrival_category', 'class', 'duration', 'days_left',
       'price', 'departure_hour_sin', 'departure_hour_cos', 'arrival_hour_sin',
       'arrival_hour_cos', 'airline_Air India', 'airline_AirAsia',
       'airline_GO FIRST', 'airline_Indigo', 'airline_SpiceJet',
       'airline_StarAir', 'airline_Trujet', 'airline_Vistara',
       'source_city_Bangalore', 'source_city_Chennai', 'source_city_Delhi',
       'source_city_Hyderabad', 'source_city_Kolkata', 'source_city_Mumbai',
       'destination_city_Bangalore', 'destination_city_Chennai',
       'destination_city_Delhi', 'destination_city_Hyderabad',
       'destination_city_Kolkata', 'destination_city_Mumbai'],
      dtype='object')

In [13]:
# One-hot encode 'day_of_week' as integer 0/1 indicators. `encoder` is the
# OneHotEncoder created in an earlier cell; fit_transform refits it on this column.
day_of_week_encoded = encoder.fit_transform(data[['day_of_week']]).astype(int)
# Column labels generated by the fitted encoder (prefix 'day_of_week_' + category).
day_of_week_names = encoder.get_feature_names_out(['day_of_week'])
# Build the indicator frame on data's own index. pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index here would misalign rows and inject
# NaNs whenever `data` does not carry a default RangeIndex.
day_of_week_encoded_df = pd.DataFrame(day_of_week_encoded, columns=day_of_week_names, index=data.index)

# Append the indicator columns next to the existing ones.
data = pd.concat([data, day_of_week_encoded_df], axis=1)


In [14]:
# Remove the raw 'day_of_week' column from `data` in place.
data.drop(columns=['day_of_week'], inplace=True)

In [15]:
# Inspect the column labels after the day-of-week indicators were added and the raw column dropped.
data.columns

Index(['departure_hour', 'departure_category', 'stops', 'arrival_hour',
       'arrival_category', 'class', 'duration', 'days_left', 'price',
       'departure_hour_sin', 'departure_hour_cos', 'arrival_hour_sin',
       'arrival_hour_cos', 'airline_Air India', 'airline_AirAsia',
       'airline_GO FIRST', 'airline_Indigo', 'airline_SpiceJet',
       'airline_StarAir', 'airline_Trujet', 'airline_Vistara',
       'source_city_Bangalore', 'source_city_Chennai', 'source_city_Delhi',
       'source_city_Hyderabad', 'source_city_Kolkata', 'source_city_Mumbai',
       'destination_city_Bangalore', 'destination_city_Chennai',
       'destination_city_Delhi', 'destination_city_Hyderabad',
       'destination_city_Kolkata', 'destination_city_Mumbai',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday'],
      dtype='object')

In [16]:
# One-hot encode 'class' as integer 0/1 indicators. `encoder` is the
# OneHotEncoder created in an earlier cell; fit_transform refits it on this column.
class_encoded = encoder.fit_transform(data[['class']]).astype(int)
# Column labels generated by the fitted encoder (prefix 'class_' + category).
class_names = encoder.get_feature_names_out(['class'])
# Build the indicator frame on data's own index. pd.concat(axis=1) aligns rows
# by index label, so a fresh 0..n-1 index here would misalign rows and inject
# NaNs whenever `data` does not carry a default RangeIndex.
class_encoded_df = pd.DataFrame(class_encoded, columns=class_names, index=data.index)

# Append the indicator columns next to the existing ones.
data = pd.concat([data, class_encoded_df], axis=1)

In [17]:
# Remove the raw 'class' column from `data` in place.
data.drop(columns=['class'], inplace=True)

In [18]:
# Inspect the column labels after the class indicators were added and the raw column dropped.
data.columns

Index(['departure_hour', 'departure_category', 'stops', 'arrival_hour',
       'arrival_category', 'duration', 'days_left', 'price',
       'departure_hour_sin', 'departure_hour_cos', 'arrival_hour_sin',
       'arrival_hour_cos', 'airline_Air India', 'airline_AirAsia',
       'airline_GO FIRST', 'airline_Indigo', 'airline_SpiceJet',
       'airline_StarAir', 'airline_Trujet', 'airline_Vistara',
       'source_city_Bangalore', 'source_city_Chennai', 'source_city_Delhi',
       'source_city_Hyderabad', 'source_city_Kolkata', 'source_city_Mumbai',
       'destination_city_Bangalore', 'destination_city_Chennai',
       'destination_city_Delhi', 'destination_city_Hyderabad',
       'destination_city_Kolkata', 'destination_city_Mumbai',
       'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday',
       'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday',
       'day_of_week_Wednesday', 'class_business', 'class_economy'],
      dtype='object')

In [19]:
# Report the (rows, columns) dimensions of `data` at this point in the notebook.
data.shape

(300261, 41)