In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
import seaborn as sns
import time, datetime
import re

In [2]:
data=pd.read_csv('./New_Clean.csv')

In [3]:
data.drop(['flight'], axis=1, inplace=True)

In [4]:
data.head

<bound method NDFrame.head of        day_of_week   airline source_city  departure_hour departure_category  \
0           Friday  SpiceJet       Delhi              18            Evening   
1           Friday  SpiceJet       Delhi               6      Early Morning   
2           Friday   AirAsia       Delhi               4      Early Morning   
3           Friday   Vistara       Delhi              10            Morning   
4           Friday   Vistara       Delhi               8            Morning   
...            ...       ...         ...             ...                ...   
300256    Thursday   Vistara     Chennai               9            Morning   
300257    Thursday   Vistara     Chennai              12          Afternoon   
300258    Thursday   Vistara     Chennai               7      Early Morning   
300259    Thursday   Vistara     Chennai               7      Early Morning   
300260    Thursday   Vistara     Chennai               9            Morning   

        stops  arriva

In [5]:
def cyclical_encoding(hour):
    # Encode hour using sine and cosine functions
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)
    return hour_sin, hour_cos

# Apply cyclical encoding to 'departure_hour' and 'arrival_hour' columns
data['departure_hour_sin'], data['departure_hour_cos'] = zip(*data['departure_hour'].apply(cyclical_encoding))
data['arrival_hour_sin'], data['arrival_hour_cos'] = zip(*data['arrival_hour'].apply(cyclical_encoding))
#code for drop duplicats rows(rows where all the features/columns are the same)
data = data.drop_duplicates()

In [6]:
data.columns

Index(['day_of_week', 'airline', 'source_city', 'departure_hour',
       'departure_category', 'stops', 'arrival_hour', 'arrival_category',
       'destination_city', 'class', 'duration', 'days_left', 'price',
       'departure_hour_sin', 'departure_hour_cos', 'arrival_hour_sin',
       'arrival_hour_cos'],
      dtype='object')

In [7]:
days_of_week_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Create an OrdinalEncoder object with specified categories
encoder = OrdinalEncoder(categories=[days_of_week_order])

# Encode the 'day_of_week' column
data['day_of_week'] = encoder.fit_transform(data[['day_of_week']]).astype(int)

# Create a dictionary to store the mapping of original days to encoded integers
days_mapping = dict(zip(days_of_week_order, range(len(days_of_week_order))))

# Display the mapping
print("Encoding Mapping:")
print(days_mapping)

Encoding Mapping:
{'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}


In [8]:
encoder = OrdinalEncoder()

# Fit and transform the 'airline' column
encoded_values = encoder.fit_transform(data[['airline']]).astype(int)

# Store the encoded values back into the 'airline' column in the DataFrame
data['airline'] = encoded_values

# Get the unique airlines in the original data
unique_airlines = encoder.categories_[0]

# Create a dictionary to store the mapping of original airlines to encoded integers
airline_mapping = dict(zip(unique_airlines, range(len(unique_airlines))))

# Display the mapping
print("Encoding Mapping for Airline:")
print(airline_mapping)

Encoding Mapping for Airline:
{'Air India': 0, 'AirAsia': 1, 'GO FIRST': 2, 'Indigo': 3, 'SpiceJet': 4, 'StarAir': 5, 'Trujet': 6, 'Vistara': 7}


In [9]:
print(data['source_city'].unique())

['Delhi' 'Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai']


In [10]:
print(data['destination_city'].unique())

['Mumbai' 'Bangalore' 'Kolkata' 'Hyderabad' 'Chennai' 'Delhi']


In [11]:
unique_cities = sorted(set(data['source_city'].unique()) | set(data['destination_city'].unique()))

# Create an OrdinalEncoder object
encoder = OrdinalEncoder(categories=[unique_cities, unique_cities])

# Fit and transform both 'source_city' and 'destination_city' columns together
encoded_values = encoder.fit_transform(data[['source_city', 'destination_city']]).astype(int)

# Replace the original 'source_city' and 'destination_city' columns with the encoded values
data['source_city'] = encoded_values[:, 0]
data['destination_city'] = encoded_values[:, 1]

# Get the unique cities in the original data
unique_cities = sorted(unique_cities)

# Create a dictionary to store the mapping of original cities to encoded integers
encoding_mapping_cities = dict(zip(unique_cities, range(len(unique_cities))))

# Display the mapping
print("\nEncoding Mapping for Cities:")
print(encoding_mapping_cities)


Encoding Mapping for Cities:
{'Bangalore': 0, 'Chennai': 1, 'Delhi': 2, 'Hyderabad': 3, 'Kolkata': 4, 'Mumbai': 5}


In [12]:
encoder = OrdinalEncoder()

# Fit and transform the 'class' column
encoded_values = encoder.fit_transform(data[['class']]).astype(int)

# Store the encoded values back into the 'class' column in the DataFrame
data['class'] = encoded_values

# Get the unique classes in the original data
unique_classes = encoder.categories_[0]

# Create a dictionary to store the mapping of original classes to encoded integers
class_mapping = dict(zip(unique_classes, range(len(unique_classes))))

# Display the mapping
print("Encoding Mapping for Class:")
print(class_mapping)

Encoding Mapping for Class:
{'business': 0, 'economy': 1}


In [13]:
# Define the order of categories
categories_order = [ 'Early Morning', 'Morning', 'Evening','Afternoon', 'Night', 'Late Night']

# Assuming 'data' is your DataFrame with the specified columns

# Combine the unique values of both 'departure_category' and 'arrival_category' to get all unique categories
unique_categories = sorted(set(data['departure_category'].unique()) | set(data['arrival_category'].unique()))

# Create an OrdinalEncoder object with the specified categories
encoder = OrdinalEncoder(categories=[categories_order, categories_order])

# Fit and transform both 'departure_category' and 'arrival_category' columns together
encoded_values = encoder.fit_transform(data[['departure_category', 'arrival_category']]).astype(int)

# Replace the original 'departure_category' and 'arrival_category' columns with the encoded values
data['departure_category'] = encoded_values[:, 0]
data['arrival_category'] = encoded_values[:, 1]

# Create a dictionary to store the mapping of original categories to encoded integers
mapping_categories = dict(zip(categories_order, range(len(categories_order))))

# Display the mapping
print("\nEncoding Mapping for Categories:")
print(mapping_categories)


Encoding Mapping for Categories:
{'Early Morning': 0, 'Morning': 1, 'Evening': 2, 'Afternoon': 3, 'Night': 4, 'Late Night': 5}


In [14]:
data.head(1)

Unnamed: 0,day_of_week,airline,source_city,departure_hour,departure_category,stops,arrival_hour,arrival_category,destination_city,class,duration,days_left,price,departure_hour_sin,departure_hour_cos,arrival_hour_sin,arrival_hour_cos
0,4,4,2,18,2,0,21,4,5,1,130,1,5953,-1.0,-1.83697e-16,-0.707107,0.707107


In [15]:
for column in data.columns:
    unique_values = data[column].unique()
    print(f"Unique values in column '{column}':")
    print(unique_values)
    print()

Unique values in column 'day_of_week':
[4 5 6 0 1 2 3]

Unique values in column 'airline':
[4 1 7 2 3 0 6 5]

Unique values in column 'source_city':
[2 5 0 4 3 1]

Unique values in column 'departure_hour':
[18  6  4 10  8 11  9 14 15  7 12 19  5 17 13 16 21 20 22 23  2  0  1  3]

Unique values in column 'departure_category':
[2 0 1 3 4 5]

Unique values in column 'stops':
[0 1 2]

Unique values in column 'arrival_hour':
[21  8  6 12 11 14 16 10 17  9  7 18 20 19 22 13 23 15  0  1  2  4  5  3]

Unique values in column 'arrival_category':
[4 1 0 3 2 5]

Unique values in column 'destination_city':
[5 0 4 3 1 2]

Unique values in column 'class':
[1 0]

Unique values in column 'duration':
[ 130  140  135  125  735  980  705  870  940  225  150  350  480  360
  880  970 1080 1390 1450  530  270  915  660 1145 1370 1585 1065 1175
 1600  910 1250  685 1335 1560 1305  230  265  460  500  625 1425 1170
  390  745 1265 1690 1695  555 1075  425  830  455  950 1465  250  255
  305 1760 1020 1630 14

In [16]:
print("Encoding Mapping:")
print(days_mapping)

Encoding Mapping:
{'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}


In [17]:
print("Encoding Mapping for Airline:")
print(airline_mapping)

Encoding Mapping for Airline:
{'Air India': 0, 'AirAsia': 1, 'GO FIRST': 2, 'Indigo': 3, 'SpiceJet': 4, 'StarAir': 5, 'Trujet': 6, 'Vistara': 7}


In [18]:
print("\nEncoding Mapping for Cities:")
print(encoding_mapping_cities)


Encoding Mapping for Cities:
{'Bangalore': 0, 'Chennai': 1, 'Delhi': 2, 'Hyderabad': 3, 'Kolkata': 4, 'Mumbai': 5}


In [19]:
print("Encoding Mapping for Class:")
print(class_mapping)

Encoding Mapping for Class:
{'business': 0, 'economy': 1}


In [21]:
print("\nEncoding Mapping for Categories:")
print(mapping_categories)


Encoding Mapping for Categories:
{'Early Morning': 0, 'Morning': 1, 'Evening': 2, 'Afternoon': 3, 'Night': 4, 'Late Night': 5}
