In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump


In [2]:
# Load the raw flights table from the CSV file, using a path relative to this notebook
flights = pd.read_csv("../Data/flights.csv")

In [3]:
# Preview the first rows to check the column names and value formats
flights.head()

Unnamed: 0,travelCode,userCode,from,to,flightType,price,time,distance,agency,date
0,0,0,Recife (PE),Florianopolis (SC),firstClass,1434.38,1.76,676.53,FlyingDrops,09/26/2019
1,0,0,Florianopolis (SC),Recife (PE),firstClass,1292.29,1.76,676.53,FlyingDrops,09/30/2019
2,1,0,Brasilia (DF),Florianopolis (SC),firstClass,1487.52,1.66,637.56,CloudFy,10/03/2019
3,1,0,Florianopolis (SC),Brasilia (DF),firstClass,1127.36,1.66,637.56,CloudFy,10/04/2019
4,2,0,Aracaju (SE),Salvador (BH),firstClass,1684.05,2.16,830.86,CloudFy,10/10/2019


In [None]:
# (rows, columns) of the loaded table
flights.shape

In [None]:
# Column dtypes and non-null counts
flights.info()

In [None]:
# Number of rows that exactly repeat an earlier row
flights.duplicated().sum()

In [None]:
# List the column labels
flights.columns

In [None]:
# Summary statistics for the numeric columns
flights.describe()

In [4]:
# Report how many distinct values each column holds
for column_name, distinct_count in flights.nunique().items():
    print(f'Number of unique values in {column_name} is {distinct_count}')

Number of unique values in travelCode is 135944
Number of unique values in userCode is 1335
Number of unique values in from is 9
Number of unique values in to is 9
Number of unique values in flightType is 3
Number of unique values in price is 490
Number of unique values in time is 33
Number of unique values in distance is 35
Number of unique values in agency is 3
Number of unique values in date is 999


In [5]:
# List the distinct values of every text (object-dtype) column
text_columns = flights.select_dtypes(include=['object']).columns
for column_name in text_columns:
    distinct_values = flights[column_name].unique()
    print(f'Unique values in {column_name} are {distinct_values}')

Unique values in from are ['Recife (PE)' 'Florianopolis (SC)' 'Brasilia (DF)' 'Aracaju (SE)'
 'Salvador (BH)' 'Campo Grande (MS)' 'Sao Paulo (SP)' 'Natal (RN)'
 'Rio de Janeiro (RJ)']
Unique values in to are ['Florianopolis (SC)' 'Recife (PE)' 'Brasilia (DF)' 'Salvador (BH)'
 'Aracaju (SE)' 'Campo Grande (MS)' 'Sao Paulo (SP)' 'Natal (RN)'
 'Rio de Janeiro (RJ)']
Unique values in flightType are ['firstClass' 'economic' 'premium']
Unique values in agency are ['FlyingDrops' 'CloudFy' 'Rainbow']
Unique values in date are ['09/26/2019' '09/30/2019' '10/03/2019' '10/04/2019' '10/10/2019'
 '10/12/2019' '10/17/2019' '10/20/2019' '10/24/2019' '10/26/2019'
 '10/31/2019' '11/01/2019' '11/07/2019' '11/10/2019' '11/14/2019'
 '11/17/2019' '11/21/2019' '11/24/2019' '11/28/2019' '11/30/2019'
 '12/05/2019' '12/06/2019' '12/12/2019' '12/16/2019' '12/19/2019'
 '12/20/2019' '12/26/2019' '12/27/2019' '01/02/2020' '01/04/2020'
 '01/09/2020' '01/11/2020' '01/16/2020' '01/18/2020' '01/23/2020'
 '01/24/2020' 

In [None]:
# Parse the month/day/year strings in 'date' into real datetime values
parsed_dates = pd.to_datetime(flights['date'], format='%m/%d/%Y')
flights['date'] = parsed_dates

In [None]:
# Import label encoder from sklearn
from sklearn.preprocessing import LabelEncoder

# Replace each categorical text column with integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# category -> code mapping of every column stays available afterwards (e.g. for
# `inverse_transform` or for encoding new inputs before prediction). Refitting a
# single encoder on each column in turn would keep only the last column's mapping.
categorical_columns = ['from', 'to', 'agency', 'flightType']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    flights[column] = encoder.fit_transform(flights[column])
    label_encoders[column] = encoder

# Kept for backward compatibility: `le` is the encoder fitted on 'flightType',
# which is the state the single shared encoder previously ended in.
le = label_encoders['flightType']

In [None]:
# Break the parsed 'date' column into separate day / month / year columns
date_accessor = flights['date'].dt
for part_name in ('day', 'month', 'year'):
    flights[part_name] = getattr(date_accessor, part_name)

In [None]:
# Remove the raw date (already split into parts) and the two identifier columns
columns_to_remove = ['date', 'travelCode', 'userCode']
flights.drop(columns=columns_to_remove, inplace=True)


In [None]:
# Preview the table after encoding, date splitting and column removal
flights.head()

In [None]:
# Check the dtype of each remaining column before modelling
flights.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# Target is the ticket price; every other column is a candidate feature
y = flights['price']
X = flights.drop(columns='price')

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit an XGBoost regressor on all candidate features to rank their importance
import xgboost as xgb

xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
importances = xgb_model.feature_importances_

# One row per feature, most important first
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Train the final model on the date parts (day, month, year), flight type,
# destination, origin and distance. The 'time' and 'agency' columns are not used.
X = flights[['day', 'month', 'year', 'flightType', 'to', 'from', 'distance']]
y = flights['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out 20%
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as price, easier to interpret than MSE
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

# For a regressor, `score` returns the coefficient of determination (R^2), not a
# classification accuracy. The `accuracy` name is kept only so later cells that
# read it keep working; the printed label now states what the number is.
accuracy = xgb_model.score(X_test, y_test)
print(f'Model score (R-squared): {accuracy}')

In [None]:
# Persist the trained regressor with joblib in the current working directory
model_path = 'flight_model.pkl'
dump(xgb_model, model_path)
print(f"Model saved as {model_path}")