In [51]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    explained_variance_score,
    root_mean_squared_error,
)

In [None]:
# Read the pre-engineered training features from disk and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer="train_features.csv")
train_df.head(n=5)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,duration__Duration,doj__Date_of_Journey_month,doj__Date_of_Journey_day_of_week,doj__Date_of_Journey_day_of_month,dep_time__Dep_Time_hour,dep_time__Dep_Time_minute,arr_time__Arrival_Time_hour,arr_time__Arrival_Time_minute,addition_info__Additional_Info_1 Long layover,addition_info__Additional_Info_Change airports,addition_info__Additional_Info_In-flight meal not included,addition_info__Additional_Info_No check-in baggage included,addition_info__Additional_Info_No info,addition_info__Additional_Info_Red-eye flight
0,9.268687,9.183883,9.183883,1210,1,9.596623,0.60401,1.0,1.0,0.307692,0.956522,0.909091,0.826087,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,9.23698,9.183883,9.183883,375,1,9.687133,0.185464,1.0,0.833333,0.538462,0.565217,0.0,0.826087,0.272727,0.0,0.0,0.0,0.0,1.0,0.0
2,9.268687,8.769654,8.51367,180,0,8.421783,0.087719,0.333333,0.833333,0.192308,0.347826,0.363636,0.478261,0.363636,0.0,0.0,1.0,0.0,0.0,0.0
3,9.056543,8.29575,8.29575,75,0,8.03948,0.035088,0.666667,0.333333,0.538462,0.826087,0.545455,0.869565,0.818182,0.0,0.0,0.0,0.0,1.0,0.0
4,9.268687,9.183883,9.183883,1025,2,9.246576,0.511278,1.0,0.833333,0.538462,0.826087,0.545455,0.521739,0.636364,0.0,0.0,1.0,0.0,0.0,0.0


In [43]:
# Separate the regression target ("Price") from the predictor columns.
# NOTE(review): the preview of train_df shows an "Unnamed: 0" column, which looks
# like a saved row index; it is kept as a predictor here. Confirm whether it
# should be removed (it must be handled identically for the test split).
y_train = train_df["Price"]
X_train = train_df.drop(columns=["Price"])

In [None]:
# Read the pre-engineered held-out features from disk and preview the first rows.
test_df = pd.read_csv(filepath_or_buffer="test_features.csv")
test_df.head(n=5)

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,duration__Duration,doj__Date_of_Journey_month,doj__Date_of_Journey_day_of_week,doj__Date_of_Journey_day_of_month,dep_time__Dep_Time_hour,dep_time__Dep_Time_minute,arr_time__Arrival_Time_hour,arr_time__Arrival_Time_minute,addition_info__Additional_Info_1 Long layover,addition_info__Additional_Info_Change airports,addition_info__Additional_Info_In-flight meal not included,addition_info__Additional_Info_No check-in baggage included,addition_info__Additional_Info_No info,addition_info__Additional_Info_Red-eye flight
0,9.056543,9.183883,9.183883,1485,2,9.432604,0.741855,1.0,0.333333,0.423077,0.782609,0.545455,0.826087,0.272727,0.0,0.0,0.0,0.0,1.0,0.0
1,8.578815,9.183883,9.183883,195,0,8.837246,0.095238,0.0,0.833333,0.307692,0.217391,0.636364,0.347826,0.909091,0.0,0.0,0.0,0.0,1.0,0.0
2,8.897346,8.769654,8.51367,170,0,8.633019,0.082707,0.333333,0.5,0.653846,0.913043,0.0,1.0,0.909091,0.0,0.0,0.0,0.0,1.0,0.0
3,9.268687,8.29575,8.29575,85,0,8.306719,0.0401,0.666667,1.0,0.423077,0.086957,1.0,0.173913,0.363636,0.0,0.0,1.0,0.0,0.0,0.0
4,9.268687,9.183883,9.183883,1100,1,9.2363,0.548872,1.0,0.833333,0.0,0.782609,0.272727,0.521739,0.636364,0.0,0.0,1.0,0.0,0.0,0.0


In [45]:
# Separate the regression target ("Price") from the predictor columns.
# NOTE(review): the preview of test_df shows an "Unnamed: 0" column, which looks
# like a saved row index; it is kept as a predictor here. Confirm whether it
# should be removed (it must be handled identically for the training split).
y_test = test_df["Price"]
X_test = test_df.drop(columns=["Price"])

In [None]:
# Fixed seed so the estimators that use randomness produce the same scores on
# every Restart & Run All.
RANDOM_STATE = 42

# Candidate regressors, each paired with the label printed in the report.
# NOTE(review): SVR is sensitive to feature scale, and the previewed data mixes a
# raw "Duration" column (values such as 1210) with 0-1 features; consider
# scaling the inputs before fitting SVR.
models = [
    ("Linear Regression", LinearRegression()),
    ("Random Forest", RandomForestRegressor(random_state=RANDOM_STATE)),
    ("XGBoost", xgb.XGBRegressor(random_state=RANDOM_STATE)),
    ("SVM", SVR()),
    ("Decision Tree", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=RANDOM_STATE)),
]

In [47]:
def train_model(model, X_train, y_train):
    """Fit ``model`` on the given training data and hand the same object back.

    Args:
        model: Object exposing a ``fit(X, y)`` method.
        X_train: Training features, passed to ``fit`` as the first argument.
        y_train: Training targets, passed to ``fit`` as the second argument.

    Returns:
        The ``model`` object that was passed in (not the return value of ``fit``).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

In [None]:
def test_model(model, X_test, y_test):
    """Score a fitted model on held-out data, print each metric and return them.

    The metrics are computed on ``y_test`` exactly as given; no inverse
    transform is applied to the targets or the predictions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values aligned with ``X_test``.

    Returns:
        dict: Metric values keyed by ``"mse"``, ``"rmse"``, ``"r2"``, ``"mae"``
        and ``"evs"`` (explained variance score), so results can be compared
        programmatically instead of only read from the printed report.
    """
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error: {mse}")
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"root mean squared error: {rmse}")
    r2 = r2_score(y_test, y_pred)
    print(f"R2 Score: {r2}")
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Mean Absolute Error: {mae}")
    evs = explained_variance_score(y_test, y_pred)
    print(f"Explained Variance Score: {evs}")
    return {"mse": mse, "rmse": rmse, "r2": r2, "mae": mae, "evs": evs}

In [49]:
def train_and_test_models(models):
    """Fit and evaluate every model in turn, printing a report per model.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` variables being defined before it is called.

    Args:
        models: Iterable of ``(name, estimator)`` pairs.

    Returns:
        dict: Maps each model name to ``{"model": ..., "metrics": ...}``, where
        ``"model"`` is the fitted estimator and ``"metrics"`` is the value
        returned by ``test_model`` for it (``None`` when ``test_model`` only
        prints). If two entries share a name, the later one wins.
    """
    results = {}
    for name, model in models:
        train_model(model, X_train, y_train)
        print(name)
        metrics = test_model(model, X_test, y_test)
        print(40 * "-")
        # Keep the fitted estimator so it can be reused without retraining.
        results[name] = {"model": model, "metrics": metrics}
    return results

In [None]:
# Train and evaluate every candidate; the per-model report is printed below.
# `output` holds whatever train_and_test_models returns.
output = train_and_test_models(models=models)

Linear Regression
Mean Squared Error: 0.06984738834592127
root mean squared error: 0.2642865648229612
R2 Score: 0.7159409307056117
Mean Absolute Error: 0.20139647504702793
Explained Variance Score: 0.7166794947536466
----------------------------------------


Random Forest
Mean Squared Error: 0.014868920091448508
root mean squared error: 0.12193818143407137
R2 Score: 0.9395302859188989
Mean Absolute Error: 0.06964699973802349
Explained Variance Score: 0.9396387095413796
----------------------------------------
XGBoost
Mean Squared Error: 0.013874134299902505
root mean squared error: 0.11778851514431492
R2 Score: 0.9435759336200608
Mean Absolute Error: 0.07868720046411974
Explained Variance Score: 0.9435874234848716
----------------------------------------
SVM
Mean Squared Error: 0.10181654489012797
root mean squared error: 0.31908704907928803
R2 Score: 0.5859270666352845
Mean Absolute Error: 0.2552511927246934
Explained Variance Score: 0.5859341372968758
----------------------------------------
Decision Tree
Mean Squared Error: 0.02493093140449471
root mean squared error: 0.15789531786755018
R2 Score: 0.8986095638060239
Mean Absolute Error: 0.07704985774579205
Explained Variance Score: 0.8987019770317721
------------------------------------