# Base model

A simple base model is built and evaluated.
Categorical features are encoded (the 'flight' column is excluded) and numerical features are standardized.
No hyperparameter tuning is performed.

In [None]:
# importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve


# For Bayesian Optimization
import time
import optuna 
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score

# importing plotly and selecting the 'iframe' renderer as default,
# so that optuna visualisations can be shown in jupyter notebooks
import plotly.io as pio
pio.renderers.default = 'iframe'

In [None]:
# reading data from the CSV file (the path is relative to the current working directory)
df = pd.read_csv('data/Clean_Dataset.csv')
# previewing the first rows
df.head()

## Train-Test-Split

In [None]:
# Train-Test-Split: 30 % of the rows are held out, the rest is used for training
df_train, df_test = train_test_split(df, test_size = 0.3, random_state = 42)
print('df_train: ', df_train.shape)
print('df_test: ', df_test.shape)

# Second Train-Test-Split: 33 % of the held-out rows become the val/aim data,
# the remaining held-out rows stay the test data
df_test, df_val = train_test_split(df_test, random_state = 42, test_size=0.33)
print('df_test: ', df_test.shape)
print('df_val: ', df_val.shape)

# splitting each subset into features (everything except 'price') and target ('price')
features_train, target_train = df_train.drop(columns = 'price'), df_train['price']
features_test, target_test = df_test.drop(columns = 'price'), df_test['price']
features_val, target_val = df_val.drop(columns = 'price'), df_val['price']


In [None]:
# showing the data type of every column of the training data
df_train.dtypes

## Data cleaning

In [None]:
# clean data function
def clean_data(df):
    """
        Returns a cleaned copy of the data frame; the passed data frame is not modified.

        Steps:
            - drops the 'Unnamed: 0' column
            - drops the 'flight' column (flight numbers)
            - encodes 'class' as number: 'Business' -> 0, 'Economy' -> 1
              (any other value is kept unchanged)

        Args:
            df (DataFrame): data containing the columns 'Unnamed: 0', 'flight' and 'class'

        Returns:
            DataFrame: new data frame without the dropped columns and with encoded 'class'

        Raises:
            KeyError: if 'Unnamed: 0', 'flight' or 'class' is missing
    """

    # dropping 'Unnamed: 0' column and flight numbers (drop returns a new DataFrame)
    df = df.drop(columns = ['Unnamed: 0', 'flight'])

    # changing class into binary
    # The column is replaced as a whole instead of being written through .loc[:, 'class'];
    # writing through .loc keeps the existing object dtype, so the column would only hold
    # the numbers 0/1 as Python objects. Mapping value by value yields an integer column.
    class_codes = {'Business': 0, 'Economy': 1}
    df['class'] = df['class'].map(lambda label: class_codes.get(label, label))

    return df

In [None]:
# cleaning the train features and previewing the result
features_train_cleaned = clean_data(features_train)
display(features_train_cleaned.head())

# cleaning the test and val features with the same function (test first, then val)
features_test_cleaned, features_val_cleaned = map(clean_data, (features_test, features_val))


## Data preparation and training

In [None]:
# listing the column names of the cleaned train features (easy for copy-paste into the column lists) ;-)
features_train_cleaned.columns

In [None]:
# categorical columns
cat_cols = [
    'airline',
    'source_city',
    'departure_time',
    'stops',
    'arrival_time',
    'destination_city',
]

# numerical columns
num_cols = [
    'duration',
    'days_left',
]

In [None]:
# one small pipeline per column type
# numerical columns: standardization
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])

# categorical columns: dense one-hot encoding, categories unknown at fit time are ignored
categorical_transformer = Pipeline(steps = [
    ('ohe', OneHotEncoder(handle_unknown = 'ignore', sparse_output = False)),
])

# routing the columns to their pipeline; all remaining columns are passed through unchanged
preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder = 'passthrough',
)

# final pipeline: preprocessing followed by the random forest regressor
pipeline_rf = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state = 42)),
])

In [None]:
# fitting the complete pipeline (preprocessing + model) on the train data
pipeline_rf.fit(features_train_cleaned, target_train)

# predictions for the test data
target_test_pred = pipeline_rf.predict(features_test_cleaned)

# computing the metrics on the test data first, then printing them
test_r2 = r2_score(target_test, target_test_pred)
test_rmse = root_mean_squared_error(target_test, target_test_pred)
print('R2: ', test_r2)
print('RMSE: ', test_rmse)

In [None]:
# predictions for the val data
target_val_pred = pipeline_rf.predict(features_val_cleaned)

# computing the metrics on the val data first, then printing them
val_r2 = r2_score(target_val, target_val_pred)
val_rmse = root_mean_squared_error(target_val, target_val_pred)
print('R2: ', val_r2)
print('RMSE: ', val_rmse)

In [None]:
# 5-fold cross validation of the complete pipeline on the train data (R2 per fold, all cores)
cv_results = cross_val_score(pipeline_rf,
                             features_train_cleaned,
                             target_train,
                             scoring='r2',
                             cv=5,
                             n_jobs=-1)

# mean R2 over the folds
cv_results.mean()

## Model Interpretation

In [None]:
# recreating DataFrames back after preprocessing
# The preprocessor is the very same object that sits inside pipeline_rf and was already fitted
# together with it, so it is only applied here (transform) instead of being fitted a second time.
ohe_col_list = preprocessor.named_transformers_['cat'].named_steps['ohe'].get_feature_names_out(cat_cols)

# column order of the ColumnTransformer output: numerical, one-hot encoded, passed through ('class')
preprocessed_cols = num_cols + list(ohe_col_list) + ['class']


def to_preprocessed_frame(features_cleaned):
    """
        Returns the preprocessed features as DataFrame (row index of the input is kept)
        Args: cleaned feature DataFrame
    """
    return pd.DataFrame(preprocessor.transform(features_cleaned),
                        columns = preprocessed_cols,
                        index = features_cleaned.index)


features_train_preprocessed = to_preprocessed_frame(features_train_cleaned)
display(features_train_preprocessed.head())

features_test_preprocessed = to_preprocessed_frame(features_test_cleaned)
display(features_test_preprocessed.head())

features_val_preprocessed = to_preprocessed_frame(features_val_cleaned)
display(features_val_preprocessed.head())

In [None]:
# getting feature importances from the model step of the fitted pipeline
feature_importance = pd.Series(data=pipeline_rf.named_steps['model'].feature_importances_,
                               index=features_train_preprocessed.columns)

# keeping only features with an importance above 0, sorted ascending
# (barh draws the first entry at the bottom, so the most important feature ends up on top)
feature_importance = feature_importance.loc[feature_importance > 0].sort_values()

# plotting feature importances on the axes created here
fig, ax = plt.subplots(figsize=(10, 8))
feature_importance.plot(kind='barh', width=0.8, ax=ax)
ax.set_title('Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature');



In [None]:
# Computing learning curve (watch out, it will take some time; 16 to 23 minutes)
# The complete pipeline is evaluated on the cleaned features (same setup as the cross validation
# with cross_val_score), so scaler and encoder are fitted on the training part of each fold only
# and never see the validation part. learning_curve works on clones of the estimator, the
# already fitted pipeline_rf stays untouched.
train_sizes, train_scores, test_scores = learning_curve(estimator=pipeline_rf,
                                                        X=features_train_cleaned,
                                                        y=target_train,
                                                        cv=5,
                                                        scoring='r2')

# averaging the scores over the folds for every training set size
train_sizes_lc = train_sizes
train_mean_lc = train_scores.mean(axis=1)
test_mean_lc = test_scores.mean(axis=1)

In [None]:
# plotting mean train and validation R2 against the training set size
fig_lc, ax = plt.subplots(figsize=(6,4))
ax.plot(train_sizes_lc, train_mean_lc, color = 'red', label="train")
ax.plot(train_sizes_lc, test_mean_lc, color = 'blue', label="validation")

# title and axis labels in one call
ax.set(title="Learning Curve", xlabel="Training Set Size", ylabel="R2")
ax.legend(loc="best")
fig_lc;