## Setup

**python -m venv ./venv**

Run that in your terminal to set up the virtual environment, then run the command below. If you don't want to use a virtual environment, just run the command below to install the required packages for your current version of Python.

In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn tensorflow

You can also run the following:

In [None]:
%pip install -r requirements.txt

## Process 1: Data Cleaning

In [None]:
#import data
import pandas as pd
import numpy as np

# Read the flight dataset from its CSV file into a DataFrame and preview the first rows
csv_path = './Raw Data/Clean_Dataset.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Remove the first column of the frame (selected by position), then preview the result
first_column = data.columns[0]
data = data.drop(columns=first_column)
data.head()

In [None]:
#CREATE NEW COLUMN TO CONVERT PRICES TO USD

#conversion rate from INR to USD (fixed, hard-coded rate)
conversion_rate = 0.012


#function to convert price from INR to USD
def convert_to_usd(price_inr):
    """Return ``price_inr`` multiplied by the module-level ``conversion_rate``.

    Only a multiplication is performed, so the argument may be a single
    number or a whole pandas Series / NumPy array (converted element-wise).
    """
    return price_inr * conversion_rate


#create new column for USD price: convert the whole column in one vectorized
#step (instead of two row-by-row ``apply`` passes) and round to the nearest cent
data['priceUSD'] = convert_to_usd(data['price']).round(2)

#rename current price column to indicate it is INR (Indian Rupee)
data = data.rename(columns={'price': 'priceINR'})

data.head()

In [None]:
#CONVERT STOPS VALUES FROM STRING TO NUMERICAL

# lookup table: textual stop count -> integer code
number_mapping = dict(zero=0, one=1, two_or_more=2)

# swap every textual stop count for its integer code
# (pandas `Series.map` yields NaN for any value that is not a key of the table)
stops_as_numbers = data['stops'].map(number_mapping)
data['stops'] = stops_as_numbers

data.head()

**Insights**

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the USD price distribution for each airline
airline_box_ax = sns.boxplot(x='airline', y='priceUSD', data=data)
airline_box_ax.set_title('Price by Airline')
plt.show()

# Bar chart of how many rows each airline contributes, to check whether the
# number of flights in the dataset is influencing the price picture above
airline_counts = data['airline'].value_counts()
airline_bar_ax = sns.barplot(x=airline_counts.index, y=airline_counts.values)
airline_bar_ax.set_title('Number of Flights by Airline')
plt.show()
# Box plots of the USD price grouped first by source city, then by destination city
for city_column, plot_title in (
    ('source_city', 'Price by Source City'),
    ('destination_city', 'Price by Destination City'),
):
    plt.title(plot_title)
    sns.boxplot(x=city_column, y='priceUSD', data=data)
    plt.show()

# Neither of these are very fruitful, so we will not use these columns in our report
# Exploring the relationship between stops and price with a box plot
plt.title('Price by Number of Stops')
# median price for each number of stops; the (sorted) group labels also define
# the left-to-right order of the boxes
medians = data.groupby('stops')['priceUSD'].median()
stops_order = list(medians.index)
sns.boxplot(x='stops', y='priceUSD', data=data, order=stops_order)
# add a label for the median price of each box: because the boxes are drawn in
# `stops_order`, the i-th median belongs to the box at x position i (this no
# longer relies on the stop codes happening to equal the tick positions)
for position, median in enumerate(medians):
    plt.text(position, median, f'{median:.2f}', horizontalalignment='center', color='w')
plt.show()

# This is interesting because in the US, more layovers usually implies a cheaper flight, but a nonstop flight is cheaper in India. This is likely due to the fact that the dataset is limited to domestic flights in India, and the number of stops relates to the distance traveled.
# Exploring the relationship between departure_time and price with a box plot
plt.title('Price by Departure Time')
# Use ONE explicit category order (order of first appearance in the data) for
# both the boxes and the median labels. Looking the medians up by tick position
# in the groupby result would follow its alphabetically sorted index instead,
# which need not match the order in which the boxes are drawn.
departure_order = list(data['departure_time'].dropna().unique())
medians = data.groupby('departure_time')['priceUSD'].median().reindex(departure_order)
sns.boxplot(x='departure_time', y='priceUSD', data=data, order=departure_order)
# the i-th entry of `medians` belongs to the box drawn at x position i
for position, median in enumerate(medians):
    plt.text(position, median, f'{median:.2f}', horizontalalignment='center', color='w')
plt.show()

# Bar chart of how many rows fall into each departure-time category
departure_time_counts = data['departure_time'].value_counts()
departure_bar_ax = sns.barplot(x=departure_time_counts.index, y=departure_time_counts.values)
departure_bar_ax.set_title('Number of Flights by Departure Time')
plt.show()

# This is interesting because the price of flights departing in the afternoon is the cheapest, which is unexpected
# Scatter plot (tiny markers) with a fitted regression line: duration vs. USD price
duration_ax = sns.regplot(x='duration', y='priceUSD', data=data, scatter_kws={'s': 1})
duration_ax.set_title('Price by Duration')
plt.show()
# This plot is not that conclusive and would be better suited to a regression model
# Scatter plot (tiny markers) with a fitted regression line: days_left vs. USD price
days_left_ax = sns.regplot(x='days_left', y='priceUSD', data=data, scatter_kws={'s': 1})
days_left_ax.set_title('Price by Days Left')
plt.show()

# The trend shows that the further from the departure date, the cheaper the flight. This is expected, as flights are usually cheaper when booked in advance, but it would be interesting to see if this trend holds for flights booked within a week of the departure date.
# However, this plot is not that conclusive and would be better suited to a regression model

## Process 2: Building the models
# necessary imports for the next steps
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# this part encodes categorical variables using OneHotEncoder
# (the keyword requesting dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so
# try the new spelling first and fall back for older installations)
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first', handle_unknown='ignore')
# column names exactly as they exist in `data` (snake_case, the same names the
# exploratory plots in this notebook use)
categoricalFeatures = ['airline', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class']
encodedFeatures = encoder.fit_transform(data[categoricalFeatures])

# this part concatenates encoded categorical features with numerical features
# (priceUSD is deliberately placed last so it can be sliced off as the target)
numericalFeatures = data[['duration', 'days_left', 'priceUSD']].to_numpy()
features = np.concatenate([encodedFeatures, numericalFeatures], axis=1)

# this part splits the data into features and target variable
X = features[:, :-1]  # all features except priceUSD
y = features[:, -1]  # target variable is the price in USD

# this part splits the dataset into training and testing set
# (20% held out for testing; fixed seed so the split is reproducible)
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2, random_state=42)

# fit a linear regression on the training split and predict the held-out split
linearReg = LinearRegression().fit(XTrain, yTrain)
yPredLr = linearReg.predict(XTest)

# report test-set error and goodness of fit for the linear model
print("Linear Regression Model Performance:")
lrMse = mean_squared_error(yTest, yPredLr)
print("mean squared error (MSE):", lrMse)
lrR2 = r2_score(yTest, yPredLr)
print("coefficient of determination (R^2):", lrR2)

# fit a 100-tree random forest (fixed seed) on the training split and predict
# the held-out split
randomForestReg = RandomForestRegressor(n_estimators=100, random_state=42).fit(XTrain, yTrain)
yPredRf = randomForestReg.predict(XTest)

# report test-set error and goodness of fit for the random forest
print("\nRandom Forest Regressor Model Performance:")
rfMse = mean_squared_error(yTest, yPredRf)
print("mean squared error (MSE):", rfMse)
rfR2 = r2_score(yTest, yPredRf)
print("coefficient of determination (R^2):", rfR2)

import numpy as np

# keys of the categorical inputs, in the order their values are handed to the
# encoder; 'duration' and 'daysLeft' are numerical and handled separately
categoricalFeatures = [
    'airline',
    'sourceCity',
    'departureTime',
    'stops',
    'arrivalTime',
    'destinationCity',
    'class',
]

# example flight to price: one value per categorical key (same order as the
# list above, with 'stops' given in its string form), followed by the two
# numerical inputs
newData = dict(
    zip(
        categoricalFeatures,
        ('Vistara', 'Delhi', 'Morning', 'zero', 'Afternoon', 'Mumbai', 'Economy'),
    ),
    duration=2.5,
    daysLeft=10,
)


def predictPrice(newData, model):
    """Predict the USD price of a single flight with an already-fitted model.

    Parameters
    ----------
    newData : dict
        One flight. Must hold a value for every key listed in the module-level
        ``categoricalFeatures`` plus ``'duration'`` and ``'daysLeft'``.
        ``'stops'`` may be a stop count (0, 1, 2) or its string spelling
        ('zero', 'one', 'two_or_more' / 'twoOrMore'). The dict is not modified.
    model : fitted regressor
        Object with a ``predict`` method, trained on the module-level
        ``encoder``'s output columns followed by duration and days left.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this flight.

    Raises
    ------
    KeyError
        If ``newData`` lacks one of the required keys.
    """
    # The 'stops' column is converted to integer codes before the encoder is
    # fitted, so string spellings are translated to those same codes here.
    # A string left as-is would be an unknown category, which the encoder
    # (handle_unknown='ignore') silently encodes as all zeros.
    stopsMapping = {'zero': 0, 'one': 1, 'two_or_more': 2, 'twoOrMore': 2}
    stops = newData['stops']
    stops = stopsMapping.get(stops, stops)

    # construct the input row in categoricalFeatures order, substituting the
    # normalised stops value instead of writing it back into the caller's dict
    newDataProcessed = [
        stops if feature == 'stops' else newData[feature]
        for feature in categoricalFeatures
    ]

    newDataEncoded = encoder.transform([newDataProcessed])

    # append the two numerical inputs in the same order used for training
    completeFeatures = np.hstack((newDataEncoded, [[newData['duration'], newData['daysLeft']]]))

    # predict using the provided model
    predictedPrice = model.predict(completeFeatures)
    return predictedPrice[0]


# price the example flight with the linear model and report it to the cent
linearRegPredictedPrice = predictPrice(model=linearReg, newData=newData)
print(f"Predicted Flight Price (LinReg): USD {linearRegPredictedPrice:.2f}")

# same flight, priced by the random forest
randomForestRegPredictedPrice = predictPrice(model=randomForestReg, newData=newData)
print(f"Predicted Flight Price (RFReg): USD {randomForestRegPredictedPrice:.2f}")
## Process 3: Testing and improving the models
%pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize an XGBoost regressor object
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror',
                           colsample_bytree=0.3,
                           learning_rate=0.1,
                           max_depth=5,
                           alpha=10,
                           n_estimators=100)

# Fit the regressor to the training set
# (the train/test split in this notebook is named XTrain/XTest/yTrain/yTest;
# the snake_case names X_train, y_train, ... are never defined)
xgb_reg.fit(XTrain, yTrain)

# Predict on the test set
y_pred_xgb = xgb_reg.predict(XTest)

# Evaluate the model
mse_xgb = mean_squared_error(yTest, y_pred_xgb)
r2_xgb = r2_score(yTest, y_pred_xgb)

print("XGBoost Regressor Model Performance:")
print("Mean Squared Error (MSE):", mse_xgb)
print("Coefficient of Determination (R^2):", r2_xgb)
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of both models on the full feature matrix.
# The scorer returns the MSE negated, so the sign is flipped before taking the
# square root to obtain one RMSE per fold.
cvOptions = dict(cv=5, scoring='neg_mean_squared_error')

linearScores = cross_val_score(linearReg, X, y, **cvOptions)
linearRmseScores = np.sqrt(np.negative(linearScores))

forestScores = cross_val_score(randomForestReg, X, y, **cvOptions)
forestRmseScores = np.sqrt(np.negative(forestScores))

print("Linear Regression RMSE scores:", linearRmseScores)
print("Random Forest Regressor RMSE scores:", forestRmseScores)
from sklearn.model_selection import cross_validate

# score every fold on two metrics at once: negated MSE and R^2
scoring = ['neg_mean_squared_error', 'r2']
validateOptions = dict(cv=5, scoring=scoring, return_train_score=True)

linearResults = cross_validate(linearReg, X, y, **validateOptions)
forestResults = cross_validate(randomForestReg, X, y, **validateOptions)

# average the per-fold (negated) MSE, flip the sign, then take the root
linearMeanMse = -linearResults['test_neg_mean_squared_error'].mean()
linearRmse = np.sqrt(linearMeanMse)
forestMeanMse = -forestResults['test_neg_mean_squared_error'].mean()
forestRmse = np.sqrt(forestMeanMse)

print("Linear Regression:")
print("Average RMSE:", linearRmse)
print("Average R^2:", linearResults['test_r2'].mean())

print("\nRandom Forest Regressor:")
print("Average RMSE:", forestRmse)
print("Average R^2:", forestResults['test_r2'].mean())
## Process 4 (extra): Using GridSearch

*This part took too long to run*
from sklearn.model_selection import GridSearchCV

# hyper-parameter grid: 3 * 2 * 4 * 3 * 3 = 216 combinations, each of which is
# fitted once per cross-validation fold
paramGrid = {
    'n_estimators': [100, 200, 300],
    # 1.0 = consider all features at every split. It replaces 'auto', which
    # meant the same thing for regressors but was removed in scikit-learn 1.3.
    'max_features': [1.0, 'sqrt'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# n_jobs=-1 spreads the candidate fits over all CPU cores; the search results
# are the same as a serial run, it just finishes sooner
gridSearch = GridSearchCV(
    randomForestReg,
    paramGrid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
gridSearch.fit(XTrain, yTrain)

print("Best parameters:", gridSearch.best_params_)
# best_estimator_ is the winning configuration refit on the whole training split
bestModel = gridSearch.best_estimator_
finalPredictions = bestModel.predict(XTest)
finalMse = mean_squared_error(yTest, finalPredictions)
finalRmse = np.sqrt(finalMse)

print("Final RMSE on Test Set:", finalRmse)