In [2]:
import math

import pandas as pd
from Flight import Flight

In [11]:
from sklearn.model_selection import train_test_split


def split(flight_price_df):
    flight_price_df = pd.DataFrame(flight_price_df)
    x = flight_price_df.drop(columns=['price'])
    y = flight_price_df['price'].values.ravel()
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        shuffle=True,
        test_size=0.2
    )
    return x_train, x_test, y_train, y_test

In [25]:
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('../assets/files/Flight_Price_Dataset.csv')
scaler = StandardScaler()
df['days_left'] = scaler.fit_transform(df['days_left'].values.reshape(-1,1))
df['duration'] = scaler.fit_transform(df['duration'].values.reshape(-1,1))
df = pd.get_dummies(df, columns=['departure_time', 'arrival_time', 'stops', 'class'], drop_first=True)
X_train, X_test, y_train, y_test = split(df)
type(df['days_left'][0])

numpy.float64

In [16]:
X_train

Unnamed: 0,duration,days_left,departure_time_Early_Morning,departure_time_Evening,departure_time_Late_Night,departure_time_Morning,departure_time_Night,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,stops_two_or_more,stops_zero,class_Economy
193875,-0.343801,1.253820,0,0,0,1,0,0,1,0,0,0,0,0,0
183192,-1.038677,-0.958732,0,0,0,0,0,0,1,0,0,0,0,0,1
34379,-0.389663,-0.368718,0,0,0,0,1,1,0,0,0,0,0,0,1
32844,-0.424407,-1.106235,0,0,0,0,1,1,0,0,0,0,0,0,1
157130,0.513675,0.663806,0,1,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249525,-0.193708,-1.770001,0,0,0,1,0,0,0,0,0,1,0,0,0
25402,-0.899702,1.475075,0,0,0,1,0,0,1,0,0,0,0,0,1
224915,-0.100595,-0.737477,0,0,0,1,0,0,0,0,0,1,0,0,0
115277,-0.228452,-1.474994,0,1,0,0,0,1,0,0,0,0,0,0,1


In [17]:
y_train

array([49177,  4202,  4149, ..., 69244,  3987,  2410], dtype=int64)

In [18]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


def compute_cost(X, y, w, b):
        m = X.shape[0]
        err = (X @ w + b - y) ** 2
        cost = np.sum(err) / (2 * m)
        return cost


def compute_gradient(X, y, w, b):
    m = X.shape[0]
    err = X @ w + b - y
    dj_dw = (X.T @ err) / m
    dj_db = np.sum(err) / m

    return dj_db, dj_dw


def gradient_descent(X, y, w_in, b_in, alpha, num_iters):
    w_history = []
    J_history = []
    w = w_in.copy()
    b = b_in



    converged = False
    for i in tqdm(range(num_iters)):
        if converged:
            break

        dj_db, dj_dw = compute_gradient(X, y, w, b)

        w -= alpha * dj_dw
        b -= alpha * dj_db

        if i < 100000:
            J_history.append(compute_cost(X, y, w, b))

        w_history.append(w)

        if len(w_history) >= 200:
            for j in range(len(w_history)-1, len(w_history)-100, -1):
                if np.allclose(w_history[-1], w_history[j], atol=1e-2):
                    continue
                else:
                    break
            else:
                converged = True


    return w, b, J_history

def checking_error(y_true, y_pred):
    # MAE
    mae = mean_absolute_error(y_true, y_pred)
    # MSE
    mse = mean_squared_error(y_true, y_pred)
    # RMSE
    rsme = math.sqrt(mse)
    # R2
    r2 = r2_score(y_true, y_pred)

    print(f'MAE: {mae}, MSE: {mse}, RMSE: {rsme}, R2: {r2}')

In [19]:
import time

import matplotlib
# matplotlib.use('TkAgg')  # You can replace 'TkAgg' with the backend of your choice, e.g., 'Qt5Agg', 'Agg', etc.
import matplotlib.pyplot as plt
import numpy as np

from globals import _flight_price_dataset
import pandas as pd


m,n = X_train.shape
initial_w = np.zeros((n,))
initial_b = 0.
# some gradient descent settings
iterations = m
alpha = 0.0003
# run gradient descent
start = time.time()
w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, iterations)
end = time.time()
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")
m, _ = X_train.shape
pre = []
for i in range(m):
    print(f"prediction: {np.dot(X_train[i], w_final) + b_final:0.2f}, target value: {y_train[i]}")
    pre.append(np.dot(X_train[i], w_final) + b_final)

print(end - start)

checking_error(y_train, pre)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
fig.tight_layout()

ax1.plot(J_hist)
ax1.set_title("Cost vs. iteration")
ax1.set_ylabel('Cost')
ax1.set_xlabel('iteration step')

plt.show()



fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
fig.tight_layout()

ax1.plot(J_hist)
ax1.set_title("Cost vs. iteration")
ax1.set_ylabel('Cost')
ax1.set_xlabel('iteration step')

plt.show()


  0%|          | 200/216110 [00:07<2:07:02, 28.33it/s]


b,w found by gradient descent: 1197.61,duration                        270.211812
days_left                      -121.593566
departure_time_Early_Morning    261.614516
departure_time_Evening          263.163696
departure_time_Late_Night         2.329916
departure_time_Morning          292.279161
departure_time_Night            212.046953
arrival_time_Early_Morning       43.246855
arrival_time_Evening            345.709308
arrival_time_Late_Night          29.677872
arrival_time_Morning            265.672008
arrival_time_Night              378.213392
stops_two_or_more                34.485289
stops_zero                       62.257162
class_Economy                   232.039013
dtype: float64 


KeyError: 0