## 1. Importing Libraries

In [None]:
!pip install feature-engine

In [11]:
import numpy as np

import pandas as pd

import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve

from feature_engine.datetime import DatetimeFeatures

from xgboost import XGBRegressor

import joblib

import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'feature_engine'

## 2. Display Settings

In [None]:
# Render every column of a DataFrame instead of eliding the middle ones.
pd.options.display.max_columns = None

In [None]:
# Keep scikit-learn transformers on their default output container
# ("pandas" would make them return DataFrames instead).
sklearn.set_config(transform_output="default")

## 3. Getting the Data

In [None]:
# Read the three pre-made splits in one pass: train, validation, test.
train_df, val_df, test_df = map(
    pd.read_csv,
    ("data/train.csv", "data/val.csv", "data/test.csv"),
)

In [None]:
# Preview the raw training split.
train_df

### 3.1 Split the Data

In [None]:
def split_data(data, target="price"):
    """Split a DataFrame into a feature matrix and a target vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the feature columns together with the ``target`` column.
    target : str, default "price"
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``data`` without the target column and ``y``
        is an independent copy of the target column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    X = data.drop(columns=target)
    # Bracket lookup (not attribute access) works for any column name, including
    # names that collide with DataFrame attributes or are not valid identifiers.
    y = data[target].copy()
    return (X, y)

In [None]:
# Separate the training features from the price target.
X_train, y_train = split_data(train_df)

In [None]:
# Preview the training features.
X_train

In [None]:
# Preview the training target.
y_train

In [None]:
# Separate the validation features from the price target.
X_val, y_val = split_data(val_df)

# Row counts of features and target should match.
print(X_val.shape, y_val.shape)

In [None]:
# Separate the test features from the price target.
X_test, y_test = split_data(test_df)

# Row counts of features and target should match.
print(X_test.shape, y_test.shape)

### 3.2 Meta-info

In [None]:
# Summarise the training features (column dtypes and non-null counts).
X_train.info()

## 4. Data Preprocessing

In [None]:
# Columns holding dates / times of day.
dt_cols = ["date_of_journey", "dep_time", "arrival_time"]

# Columns that are already numeric.
num_cols = ["duration", "total_stops"]

# Every remaining feature is treated as categorical (original column order kept).
cat_cols = [
    name
    for name in X_train.columns
    if name not in (*dt_cols, *num_cols)
]

In [None]:
# Columns that fall into neither the datetime nor the numeric group.
cat_cols

In [None]:
def _datetime_pipeline(features):
    """Build an impute -> datetime feature extraction -> scaling pipeline."""
    return Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("extractor", DatetimeFeatures(features_to_extract=features, format="mixed")),
        ("scaler", StandardScaler()),
    ])


# Numeric columns: median imputation followed by standardisation.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: mode imputation, then dense one-hot encoding that
# ignores categories not seen during fitting.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Journey date: calendar parts.
doj_transformer = _datetime_pipeline(["month", "week", "day_of_week", "day_of_month"])

# Departure / arrival time: clock parts.
time_transformer = _datetime_pipeline(["hour", "minute"])

In [None]:
# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    # numeric features
    ("num", num_transformer, num_cols),
    # categorical features
    ("cat", cat_transformer, cat_cols),
    # journey date
    ("doj", doj_transformer, ["date_of_journey"]),
    # departure and arrival times
    ("time", time_transformer, ["dep_time", "arrival_time"]),
])

In [None]:
# Fit the preprocessor on the training features and inspect the transformed output.
preprocessor.fit_transform(X_train)

In [None]:
# Shape of the transformed training matrix (note: this refits the preprocessor).
preprocessor.fit_transform(X_train).shape

## 5. Model Selection

In [None]:
# Candidate regressors compared via learning curves.
# The random forest is seeded so repeated runs of the comparison give the
# same curves; the ensembles use only 10 estimators.
algorithms = {
    "Linear Regression": LinearRegression(),
    "Support Vector Machine": SVR(),
    "Random Forest": RandomForestRegressor(n_estimators=10, random_state=42),
    "XG Boost": XGBRegressor(n_estimators=10)
}

In [None]:
# Stack the training and validation rows into a single table
# (row-wise stacking is concat's default axis).
data = pd.concat(objs=[train_df, val_df])

# Features / target of the pooled data, with a quick shape check.
X_data, y_data = split_data(data)
print(X_data.shape, y_data.shape)

In [None]:
def plot_curves(sizes, mean_scores, std_scores, label, ax):
    """Draw a mean-score line with a +/- one ``std_scores`` band on ``ax``.

    Parameters
    ----------
    sizes : x values (training-set sizes).
    mean_scores : y values of the line.
    std_scores : half-width of the shaded band around the line.
    label : legend label of the line.
    ax : axes-like object providing ``plot`` and ``fill_between``.
    """
    # Line through the mean score at each size.
    ax.plot(sizes, mean_scores, marker="o", label=label)

    # Semi-transparent band from (mean - std) to (mean + std).
    band_low = mean_scores - std_scores
    band_high = mean_scores + std_scores
    ax.fill_between(x=sizes, y1=band_low, y2=band_high, alpha=0.5)

In [None]:
def plot_learning_curves(name, algorithm, figsize=(12, 4)):
    """Plot train and cross-validated R-square learning curves for one algorithm.

    The algorithm is appended to the notebook-level ``preprocessor`` and scored
    with 3-fold cross-validation on the notebook-level ``X_data`` / ``y_data``.
    The legend reports mean +/- std of the scores at the largest training size.

    Parameters
    ----------
    name : str
        Figure title.
    algorithm : estimator
        Regressor used as the final pipeline step.
    figsize : tuple, default (12, 4)
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    model = Pipeline(steps=[
        ("pre", preprocessor),
        ("alg", algorithm)
    ])

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X_data,
        y=y_data,
        cv=3,
        scoring="r2",
        n_jobs=-1,
        # random_state is only honoured when shuffle=True; without shuffling
        # the seed below had no effect.
        shuffle=True,
        random_state=42
    )

    # Scores come back as (n_sizes, n_folds); aggregate across the folds.
    mean_train_scores = np.mean(train_scores, axis=1)
    std_train_scores = np.std(train_scores, axis=1)
    train_score = f"{mean_train_scores[-1]:.2f} +/- {std_train_scores[-1]:.2f}"

    mean_test_scores = np.mean(test_scores, axis=1)
    std_test_scores = np.std(test_scores, axis=1)
    test_score = f"{mean_test_scores[-1]:.2f} +/- {std_test_scores[-1]:.2f}"

    fig, ax = plt.subplots(figsize=figsize)

    # training curve
    plot_curves(
        train_sizes,
        mean_train_scores,
        std_train_scores,
        f"Train ({train_score})",
        ax
    )

    # test curve
    plot_curves(
        train_sizes,
        mean_test_scores,
        std_test_scores,
        f"Test ({test_score})",
        ax
    )

    ax.set(xlabel="Training Set Size", ylabel="R-square", title=name)

    ax.legend(loc="lower right")

    plt.show()

In [None]:
# One learning-curve figure per candidate algorithm.
for name, alg in algorithms.items():
    plot_learning_curves(name, alg)

## 6. Model Training

In [None]:
# Final pipeline: the shared preprocessing followed by a random forest.
# The forest is seeded so the fitted (and later persisted) model is
# reproducible across runs.
model = Pipeline(steps=[
    ("pre", preprocessor),
    ("rf", RandomForestRegressor(n_estimators=10, random_state=42))
])

In [None]:
# Fit on the pooled train + validation data.
model.fit(X_data, y_data)

## 7. Model Evaluation

In [None]:
def evaluate_model(X, y, estimator=None):
    """Return the R2 score of a fitted estimator's predictions on ``X``.

    Parameters
    ----------
    X : features to predict on.
    y : true target values aligned with ``X``.
    estimator : fitted estimator, optional
        Object exposing ``predict``. Defaults to the notebook-level ``model``,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    The value returned by ``r2_score(y, predictions)``.
    """
    if estimator is None:
        # Fall back to the fitted pipeline defined at notebook level.
        estimator = model
    y_pred = estimator.predict(X)
    return r2_score(y, y_pred)

In [None]:
# In-sample score: X_data / y_data is the pooled train + validation data the model was fitted on.
print(f"R2 score on Training data is = {evaluate_model(X_data, y_data)}")

In [None]:
# Score on the test split, which was not part of the data passed to fit.
print(f"R2 score on Test data is = {evaluate_model(X_test, y_test)}")

## 8. Model Persistence

In [None]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(model, "model.joblib")

In [None]:
# Reload the persisted pipeline. joblib.load unpickles the file, so only
# load artifacts that come from a trusted source.
saved_model = joblib.load("model.joblib")
saved_model

In [None]:
# Round-trip check: score the reloaded pipeline on the test split.
y_pred = saved_model.predict(X_test)

r2_score(y_test, y_pred)

Collecting feature_engineCollecting feature_engine
  Using cached feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
Collecting pandas>=2.2.0
  Using cached pandas-2.2.2-cp39-cp39-win_amd64.whl (11.6 MB)
  Using cached feature_engine-1.8.0-py2.py3-none-any.whl (357 kB)
Collecting pandas>=2.2.0
  Using cached pandas-2.2.2-cp39-cp39-win_amd64.whl (11.6 MB)
Installing collected packages: pandas, feature-engine
  Attempting uninstall: pandas
    Found existing installation: pandas 1.1.5
    Uninstalling pandas-1.1.5:
      Successfully uninstalled pandas-1.1.5
  Rolling back uninstall of pandas
  Moving to c:\users\dell\anaconda3\lib\site-packages\pandas-1.1.5.dist-info\
   from C:\Users\Dell\anaconda3\Lib\site-packages\~-ndas-1.1.5.dist-info
  Moving to c:\users\dell\anaconda3\lib\site-packages\pandas\
   from C:\Users\Dell\anaconda3\Lib\site-packages\~-ndas

Installing collected packages: pandas, feature-engine
  Attempting uninstall: pandas
    Found existing installation: pandas 1.1.5


ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\Dell\\anaconda3\\Lib\\site-packages\\pandas\\tests\\arrays\\sparse\\test_accessor.py'
Consider using the `--user` option or check the permissions.

  ERROR: Failed to restore c:\users\dell\anaconda3\lib\site-packages\pandas-1.1.5.dist-info\
  ERROR: Failed to restore c:\users\dell\anaconda3\lib\site-packages\pandas\
ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\Dell\\anaconda3\\Lib\\site-packages\\pandas\\tests\\arrays\\timedeltas\\__init__.py'

