In [1]:
import os
import zipfile
import pandas as pd
import joblib
from dateutil import parser
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 1. Data Preparation

In [2]:
# time categories
def get_time_of_day(dt):
    hour = dt.hour
    if hour >=5 and hour < 8:
        return 'Early Morning'
    elif hour >= 8 and hour < 11:
        return 'Morning'
    elif hour >= 11 and hour < 14:
        return 'Midday'
    elif hour >= 14 and hour < 17:
        return 'Afternoon'
    elif hour >= 17 and hour < 20:
        return 'Evening'
    elif hour >= 20 and hour < 23:
        return 'Night'
    else:
        return 'Late Night'
 
# Remove UTC offset from datetime strings
def remove_utc_offset(datetime_str):
    dt = parser.parse(datetime_str)
    return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')

# function to get features
def getfeatures(df):

    df['departuretime'] = df['segmentsDepartureTimeRaw'].apply(remove_utc_offset) 
    df['departuretime'] = pd.to_datetime(df['departuretime'], utc=False)
    
    # time category
    df['time_category'] = df['departuretime'].apply(get_time_of_day)
    
    # departure date
    df['date'] = (df['departuretime'] - pd.Timedelta(hours=2)).dt.date

    # no. of days from flight
    df['days_from_flight'] = (df['date'] - pd.to_datetime(df['searchDate']).dt.date)

    return df

def datefeatures(df):
    # Extract the year
    df['year'] = pd.to_datetime(df['date']).dt.year

    # Extract the month
    df['month'] = pd.to_datetime(df['date']).dt.month

    # Extract the day of the week (Monday=0, Sunday=6)
    df['day_of_week'] = pd.to_datetime(df['date']).dt.dayofweek

    # Extract the day of the month
    df['day_of_month'] = pd.to_datetime(df['date']).dt.day

    return df

def extract_days(duration_string):
    days = re.search(r'(\d+) days', duration_string)
    return int(days.group(1)) if days else None

In [3]:
# Root directory containing subfolders 
root_dir = '../raw/itineraries_csv'

dataframes = []

# Loop through each subfolder
for directory in os.listdir(root_dir):

  if os.path.isdir(os.path.join(root_dir, directory)) and not directory.startswith('.'):
    print(directory)
    folder_path = os.path.join(root_dir, directory)  

    # Loop through zip files
    for filename in os.listdir(folder_path):
      
      if filename.endswith('.zip'):
      
        # Full path of zip file
        zip_path = os.path.join(folder_path, filename)  
        
        # Extract zip contents
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
          zip_ref.extractall(folder_path)
              
          # Parse CSV file in zip 
          csv_path = os.path.join(folder_path, [x for x in zip_ref.namelist() if x.endswith('.csv')][0])      
          df = pd.read_csv(csv_path)
          
          # Apply processing 
          df = df[df['isNonStop'] == True]  
          df = getfeatures(df)
          df = datefeatures(df)

          # median fare
          df = df.groupby(['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'date', 'time_category', 'days_from_flight', 'year', 'month', 'day_of_week', 'day_of_month'])['totalFare'].mode().reset_index(name='modefare')
          
          # Append to list
          dataframes.append(df) 
        
# Combine dataframes
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.head()

FileNotFoundError: [WinError 3] The system cannot find the path specified: '../raw/itineraries_csv'

In [None]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1649244 entries, 0 to 1649243
Data columns (total 11 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   segmentsDepartureAirportCode  1649244 non-null  object        
 1   segmentsArrivalAirportCode    1649244 non-null  object        
 2   segmentsCabinCode             1649244 non-null  object        
 3   date                          1649244 non-null  datetime64[ns]
 4   time_category                 1649244 non-null  object        
 5   days_from_flight              1649244 non-null  int64         
 6   year                          1649244 non-null  int32         
 7   month                         1649244 non-null  int32         
 8   day_of_week                   1649244 non-null  int32         
 9   day_of_month                  1649244 non-null  int32         
 10  medianfare                    1649244 non-null  float64       
dty

In [None]:
combined_df['days_from_flight'] = combined_df['days_from_flight'].astype('str')
combined_df['days_from_flight'] = combined_df['days_from_flight'].apply(extract_days)

In [None]:
len(combined_df)

1649244

In [None]:
combined_df.to_csv('modefare.csv', index=False)

In [None]:
combined_df = pd.read_csv('modefare.csv')

In [None]:
print('Earliest date:', combined_df['date'].min())
print('Latest date:', combined_df['date'].max())

Earliest date: 2022-04-16 00:00:00
Latest date: 2022-07-18 00:00:00


In [None]:
# Define the split date
split_date = pd.Timestamp('2022-06-17')
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Create the train and test sets
train = combined_df.loc[combined_df.date < split_date]
test = combined_df.loc[combined_df.date >= split_date]

print(len(train))
print(len(test))

1321239
328005


In [None]:
train.head()

Unnamed: 0,segmentsDepartureAirportCode,segmentsArrivalAirportCode,segmentsCabinCode,date,time_category,days_from_flight,year,month,day_of_week,day_of_month,medianfare
0,ATL,BOS,coach,2022-04-17,Afternoon,1,2022,4,6,17,398.6
1,ATL,BOS,coach,2022-04-17,Early Morning,1,2022,4,6,17,248.6
2,ATL,BOS,coach,2022-04-17,Evening,1,2022,4,6,17,398.6
3,ATL,BOS,coach,2022-04-17,Midday,1,2022,4,6,17,248.6
4,ATL,BOS,coach,2022-04-17,Morning,1,2022,4,6,17,248.6


# Modelling

In [None]:
def fitmodel(model, model_name, train=train, test=test):

    X_train = train.drop(columns='modefare')
    y_train = train['modefare']

    X_test = test.drop(columns='modefare')
    y_test = test['modefare']

    # Define categorical and numeric columns for preprocessing
    categorical_cols = ['segmentsDepartureAirportCode', 'segmentsArrivalAirportCode', 'segmentsCabinCode', 'time_category']
    numeric_cols = ['year',  'month', 'day_of_week', 'day_of_month', 'days_from_flight']

    # Preprocessing and modeling pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols), 
            ('cat', OneHotEncoder(), categorical_cols)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Assuming you have a scikit-learn pipeline named 'pipeline'
    joblib.dump(pipeline, f'{model_name}.pkl')

    # Evaluate the model
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    return pipeline, model_name, rmse, r2, mae, mape

In [None]:
cat = fitmodel(CatBoostRegressor(), 'mode')

Learning rate set to 0.127437
0:	learn: 178.8832767	total: 83.7ms	remaining: 1m 23s
1:	learn: 170.1752199	total: 167ms	remaining: 1m 23s
2:	learn: 163.2071162	total: 252ms	remaining: 1m 23s
3:	learn: 157.5796433	total: 338ms	remaining: 1m 24s
4:	learn: 153.1336637	total: 421ms	remaining: 1m 23s
5:	learn: 149.5677644	total: 509ms	remaining: 1m 24s
6:	learn: 146.8039718	total: 588ms	remaining: 1m 23s
7:	learn: 144.7083838	total: 675ms	remaining: 1m 23s
8:	learn: 142.4865049	total: 758ms	remaining: 1m 23s
9:	learn: 140.9928122	total: 838ms	remaining: 1m 22s
10:	learn: 139.5119752	total: 923ms	remaining: 1m 22s
11:	learn: 138.3585977	total: 1.01s	remaining: 1m 23s
12:	learn: 136.9163539	total: 1.09s	remaining: 1m 23s
13:	learn: 136.0611264	total: 1.18s	remaining: 1m 22s
14:	learn: 134.8188465	total: 1.26s	remaining: 1m 22s
15:	learn: 134.0995754	total: 1.36s	remaining: 1m 23s
16:	learn: 132.9722879	total: 1.46s	remaining: 1m 24s
17:	learn: 132.1353302	total: 1.57s	remaining: 1m 25s
18:	lea

In [None]:
print(cat)

(Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['year', 'month',
                                                   'day_of_week',
                                                   'day_of_month',
                                                   'days_from_flight']),
                                                 ('cat', OneHotEncoder(),
                                                  ['segmentsDepartureAirportCode',
                                                   'segmentsArrivalAirportCode',
                                                   'segmentsCabinCode',
                                                   'time_category'])])),
                ('regressor',
                 <catboost.core.CatBoostRegressor object at 0x000001EE8DD07A90>)]), 'catboost', 89.71553864260115, 0.6883966201259524, 62.62714866452494, 0.2113440630923253)


In [None]:
xgboost = fitmodel(XGBRegressor(), 'mode2')

print(xgboost)

  if is_sparse(data):


(Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['year', 'month',
                                                   'day_of_week',
                                                   'day_of_month',
                                                   'days_from_flight']),
                                                 ('cat', OneHotEncoder(),
                                                  ['segmentsDepartureAirportCode',
                                                   'segmentsArrivalAirportCode',
                                                   'segmentsCabinCode',
                                                   'time_category'])])),
                ('regressor',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              cols...
                              feature_types=None, gamma=None, grow_policy=None,
            