## Model Training

#### 1.1 Import Data and Required Packages
##### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [6]:
!pip install tensorflow
!pip install dill
!pip install xgboost
!pip install lightgbm





In [99]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

#### Import the CSV Data as Pandas DataFrame

In [100]:
import pandas as pd

# CSV inputs to load; the list currently holds a single file.
file_names = ['flights_cleaned.csv']

# Parse every listed CSV into its own DataFrame, preserving list order.
dataframes = list(map(pd.read_csv, file_names))

# Stack the per-file frames row-wise under a fresh 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)

# Last expression of the cell: render the combined frame.
combined_df

Unnamed: 0,MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,CRS_DEP_TIME,DEP_DELAY_NEW
0,8,1,AA,LGA,1730,0.0
1,8,1,AA,JFK,845,0.0
2,8,1,AA,JFK,1555,0.0
3,8,1,AA,LGA,1130,0.0
4,8,1,AA,LGA,1430,8.0
...,...,...,...,...,...,...
14655,8,4,UA,LGA,1005,6.0
14656,8,4,UA,LGA,1824,8.0
14657,8,4,UA,LGA,1711,28.0
14658,8,4,UA,LGA,1405,0.0


#### Show Top 5 Records

#### Preparing X and Y variables

In [101]:
# Largest DEP_DELAY_NEW value kept; rows above it are dropped.
MAX_DEP_DELAY = 180
# ORIGIN codes retained for modelling.
ALLOWED_ORIGINS = ['JFK', 'EWR', 'LGA', 'HPN']

# Set negative values to zero with a vectorised clip instead of a per-row
# lambda. `.assign` returns a new frame, so re-running this cell after the
# filter below never writes into a slice of an earlier frame.
combined_df = combined_df.assign(
    DEP_DELAY_NEW=combined_df['DEP_DELAY_NEW'].clip(lower=0)
)

# Remove rows where the value is greater than MAX_DEP_DELAY
combined_df = combined_df[combined_df['DEP_DELAY_NEW'] <= MAX_DEP_DELAY]

# Keep only the rows whose ORIGIN is one of ALLOWED_ORIGINS
filtered_combined_df = combined_df[combined_df['ORIGIN'].isin(ALLOWED_ORIGINS)]

# Independent copy that later cells turn into the feature matrix.
X = filtered_combined_df.copy()

In [102]:
# Preview the first five rows of the filtered frame (target column still present).
X.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,CRS_DEP_TIME,DEP_DELAY_NEW
0,8,1,AA,LGA,1730,0.0
1,8,1,AA,JFK,845,0.0
2,8,1,AA,JFK,1555,0.0
3,8,1,AA,LGA,1130,0.0
4,8,1,AA,LGA,1430,8.0


In [103]:
# Columns whose distinct values are listed below.
categorical_columns = ["MONTH", "DAY_OF_WEEK",
                       "OP_UNIQUE_CARRIER", "ORIGIN"]

# Print the distinct values of each column in turn. The values are read from
# combined_df (the frame before the ORIGIN filter), and the output format is
# "<label>  <array>" on one line per column.
for column in categorical_columns:
    print(f"Categories in '{column}' variable: ", end=" ")
    print(combined_df[column].unique())

Categories in 'MONTH' variable:  [ 8  9 10 11 12  5  2  3  4  6  7]
Categories in 'DAY_OF_WEEK' variable:  [1 2 3 4 5 6 7]
Categories in 'OP_UNIQUE_CARRIER' variable:  ['AA' 'OO' 'B6' '9E' 'DL' 'YX' 'UA' 'MQ']
Categories in 'ORIGIN' variable:  ['LGA' 'JFK' 'HPN']


In [104]:
# Summarise column dtypes and non-null counts of the filtered frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14660 entries, 0 to 14659
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MONTH              14660 non-null  int64  
 1   DAY_OF_WEEK        14660 non-null  int64  
 2   OP_UNIQUE_CARRIER  14660 non-null  object 
 3   ORIGIN             14660 non-null  object 
 4   CRS_DEP_TIME       14660 non-null  int64  
 5   DEP_DELAY_NEW      14660 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 687.3+ KB


In [105]:
# Build the feature matrix from the filtered frame rather than from X itself,
# so re-running this cell cannot raise KeyError on an already-dropped column.
X = filtered_combined_df.drop(columns=['DEP_DELAY_NEW'])
X

Unnamed: 0,MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,CRS_DEP_TIME
0,8,1,AA,LGA,1730
1,8,1,AA,JFK,845
2,8,1,AA,JFK,1555
3,8,1,AA,LGA,1130
4,8,1,AA,LGA,1430
...,...,...,...,...,...
14655,8,4,UA,LGA,1005
14656,8,4,UA,LGA,1824
14657,8,4,UA,LGA,1711
14658,8,4,UA,LGA,1405


In [106]:
# Take the target from the same filtered frame that X is built from, so X and
# y always have matching rows and index even when the ORIGIN filter drops rows.
y = filtered_combined_df['DEP_DELAY_NEW']

In [107]:
# Summary statistics of the target column.
y.describe()

count    14660.000000
mean        11.442906
std         28.537503
min          0.000000
25%          0.000000
50%          0.000000
75%          4.000000
max        180.000000
Name: DEP_DELAY_NEW, dtype: float64

#### Create an evaluation function that returns the metrics (MAE and RMSE) after model training

In [108]:
def evaluate_model(y_true, y_pred):
    """Score predictions against the ground truth.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values, in the same order as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, rmse)``: the mean absolute error and the square root of the
        mean squared error.
    """
    mean_abs_err = mean_absolute_error(y_true, y_pred)
    mean_sq_err = mean_squared_error(y_true, y_pred)
    root_mean_sq_err = np.sqrt(mean_sq_err)
    return mean_abs_err, root_mean_sq_err

In [119]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# These three names are used below; importing them here keeps the cell
# runnable on a fresh kernel instead of relying on a later cell's imports.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR  # Import SVR

# Baseline comparison: preprocess the features, fit several regressors with
# default hyper-parameters, report train/test MAE and RMSE for each, and pickle
# the one with the lowest test MAE. Expects `X`, `y` and `evaluate_model` to be
# defined by earlier cells.

# Single seed shared by the split and the stochastic estimators so repeated
# runs of this cell print the same metrics.
RANDOM_STATE = 42

cat_features = ['OP_UNIQUE_CARRIER', 'ORIGIN']
num_features = ['MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME']

# handle_unknown="ignore": a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising at transform time.
oh_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the preprocessing on the training rows only, then apply it to both splits.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(),
    "LightGBM Regressor": LGBMRegressor(),
    "SVR": SVR(),  # Add the SVR model
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
}

model_list = []
mae_list = []

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_transformed, y_train)

    y_train_pred = model.predict(X_train_transformed)
    y_test_pred = model.predict(X_test_transformed)

    model_train_mae, model_train_rmse = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("- MAE: {:.4f}".format(model_train_mae))
    print("- RMSE: {:.4f}".format(model_train_rmse))

    print('Model performance for Test set')
    print("- MAE: {:.4f}".format(model_test_mae))
    print("- RMSE: {:.4f}".format(model_test_rmse))

    mae_list.append(model_test_mae)
    print('=' * 35)


# Determine the best model (lowest test MAE).
# NOTE(review): the winner is chosen on the test split, so its reported test
# MAE is optimistically biased; a separate validation split or cross-validation
# would give a cleaner estimate.
best_model_name = min(zip(model_list, mae_list), key=lambda x: x[1])[0]
print(f"Best Model: {best_model_name}")

# Save the best performing model
best_model = models[best_model_name]
with open('model1.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# Create and print the sorted DataFrame of model performance
mae_dataframe = pd.DataFrame(list(zip(model_list, mae_list)), columns=['Model Name', 'MAE']).sort_values(by="MAE", ascending=True)
print(mae_dataframe)


Linear Regression
Model performance for Training set
- MAE: 16.7199
- RMSE: 28.2200
Model performance for Test set
- MAE: 16.7610
- RMSE: 27.6818
Decision Tree
Model performance for Training set
- MAE: 10.8303
- RMSE: 21.8052
Model performance for Test set
- MAE: 17.9750
- RMSE: 33.7624
XGBRegressor
Model performance for Training set
- MAE: 14.0992
- RMSE: 24.5696
Model performance for Test set
- MAE: 16.8983
- RMSE: 28.9607
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 11728, number of used features: 14
[LightGBM] [Info] Start training from score 11.446965
LightGBM Regressor
Model performance for Training set
- MAE: 15.3896
- RMSE: 26.4430
Model performance for Test set
- MAE: 16.2521
- RMSE: 27.5328
Random Forest Regr

In [152]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Second comparison round: grid-search LightGBM hyper-parameters, then compare
# the tuned model with the other regressors on the same split. Expects `X`,
# `y` and `evaluate_model` to be defined by earlier cells.

# Single seed shared by the split and the stochastic estimators so repeated
# runs of this cell print the same metrics.
RANDOM_STATE = 42

cat_features = ['OP_UNIQUE_CARRIER', 'ORIGIN']
num_features = ['MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME']

# handle_unknown="ignore": a category that was not present when the encoder
# was fitted is encoded as all zeros instead of raising at transform time
# (this preprocessor is pickled and reused for inference further down).
oh_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Fit the preprocessing on the training rows only, then apply it to both splits.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Define LightGBM parameter grid (3 x 3 x 3 = 27 candidates)
lgbm_param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'n_estimators': [50, 100, 200],
    'num_leaves': [31, 50, 100],
}

# Create a LightGBM regressor
lgbm = LGBMRegressor()

# Apply grid search with 2-fold cross-validation, scored by (negated) MAE
lgbm_grid_search = GridSearchCV(
    lgbm,
    lgbm_param_grid,
    cv=2,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1,
)
lgbm_grid_search.fit(X_train_transformed, y_train)

# Get the best LightGBM model. GridSearchCV refits it on the whole training
# split by default, so the fit inside the loop below is redundant for this
# entry; it is kept so every model goes through the same code path.
best_lgbm = lgbm_grid_search.best_estimator_

# Models including the tuned LightGBM
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(),
    "LightGBM Regressor (Tuned)": best_lgbm,  # Include the tuned LightGBM
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
}

model_list = []
mae_list = []

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train_transformed, y_train)

    y_train_pred = model.predict(X_train_transformed)
    y_test_pred = model.predict(X_test_transformed)

    model_train_mae, model_train_rmse = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print('Model performance for Training set')
    print("- MAE: {:.4f}".format(model_train_mae))
    print("- RMSE: {:.4f}".format(model_train_rmse))

    print('Model performance for Test set')
    print("- MAE: {:.4f}".format(model_test_mae))
    print("- RMSE: {:.4f}".format(model_test_rmse))

    mae_list.append(model_test_mae)
    print('=' * 35)

# Determine the best model (lowest test MAE).
# NOTE(review): the winner is chosen on the test split, so its reported test
# MAE is optimistically biased; a separate validation split or cross-validation
# would give a cleaner estimate.
best_model_name = min(zip(model_list, mae_list), key=lambda x: x[1])[0]
print(f"Best Model: {best_model_name}")

# Save the best performing model
best_model = models[best_model_name]
with open('model1.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# Create and print the sorted DataFrame of model performance
mae_dataframe = pd.DataFrame(list(zip(model_list, mae_list)), columns=['Model Name', 'MAE']).sort_values(by="MAE", ascending=True)
print(mae_dataframe)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 11728, number of used features: 14
[LightGBM] [Info] Start training from score 11.446965
Linear Regression
Model performance for Training set
- MAE: 16.7199
- RMSE: 28.2200
Model performance for Test set
- MAE: 16.7610
- RMSE: 27.6818
Decision Tree
Model performance for Training set
- MAE: 10.8303
- RMSE: 21.8052
Model performance for Test set
- MAE: 17.9974
- RMSE: 33.7848
XGBRegressor
Model performance for Training set
- MAE: 14.0992
- RMSE: 24.5696
Model performance for Test set
- MAE: 16.8983
- RMSE: 28.9607
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000073 seconds.


In [153]:
# Assuming best_model is your chosen model
# Persist the selected estimator and the fitted preprocessor side by side, so
# inference can reload both and transform raw rows the same way as in training.
artifacts = (
    ('model1.pkl', best_model),
    ('preprocessor.pkl', preprocessor),
)
for pickle_path, artifact in artifacts:
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)


### Results

In [154]:
# Reload the two artifacts written by the previous cell.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open('model1.pkl', 'rb') as model_fh:
    loaded_model = pickle.load(model_fh)

with open('preprocessor.pkl', 'rb') as preprocessor_fh:
    loaded_preprocessor = pickle.load(preprocessor_fh)

# One hand-written flight record used to exercise the reloaded artifacts.
example_record = {
    'MONTH': 1,
    'DAY_OF_WEEK': 6,
    'OP_UNIQUE_CARRIER': 'AA',
    'ORIGIN': 'JFK',
    'CRS_DEP_TIME': 1920,
}
example_data = pd.DataFrame([example_record])

# Apply the fitted preprocessing, then predict with the reloaded model.
X_example = loaded_preprocessor.transform(example_data)
prediction = loaded_model.predict(X_example)

print("Predicted DEP_DELAY_NEW:", prediction[0])

Predicted DEP_DELAY_NEW: 9.250535543963919


## Linear Regression

## Plot y_pred and y_test

#### Difference between Actual and Predicted Values

In [155]:
# Assuming y_pred is the predicted values from your model
# Flatten the predicted values to 1D if they are in 2D format
# Score the held-out rows with the selected model.
y_pred = best_model.predict(X_test_transformed)

# Collapse to 1-D only when the estimator returned a multi-dimensional array.
y_pred_flat = y_pred.flatten() if y_pred.ndim > 1 else y_pred

# Per-row error: actual minus predicted.
residuals = y_test - y_pred_flat

# Side-by-side table of actual values, predictions and their difference.
pred_df = pd.DataFrame({
    'Actual Value': y_test,
    'Predicted Value': y_pred_flat,
    'Difference': residuals,
})

print(pred_df)


       Actual Value  Predicted Value  Difference
3207           28.0         3.771447   24.228553
4256            0.0         1.913814   -1.913814
10727          14.0        12.567128    1.432872
6849            3.0         4.409082   -1.409082
2495            0.0         8.143784   -8.143784
...             ...              ...         ...
7200            0.0         6.418989   -6.418989
13628           0.0         8.626230   -8.626230
4848            0.0         4.332069   -4.332069
9490            0.0        10.167105  -10.167105
3705            0.0         5.665502   -5.665502

[2932 rows x 3 columns]
