In [1]:
## All libraries used in feature engineering
import numpy as np
import pandas as pd
import sklearn
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder,
    StandardScaler
)
from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
import matplotlib.pyplot as plt

  from pandas.core import (


## Display Settings

In [3]:
# Render every column when displaying DataFrames (the option only needs to be
# set once) and silence library warnings for the rest of the notebook.
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")

## Read the Data

In [4]:
# Single place to repoint the notebook at a different copy of the data.
DATA_DIR = r"S:\Linear Regression Project\Flight Price Prediction\Data"

# Each split is stored as `<DATA_DIR>\<split>.csv`. The separator is written
# out explicitly so the resulting paths are exactly the ones used previously.
train, val, test = (
    pd.read_csv(DATA_DIR + "\\" + f"{split}.csv")
    for split in ("train", "val", "test")
)

## Preprocessing Operations

In [5]:
# airline: mode-impute, pool infrequent carriers under "Other", then one-hot encode
air_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "grouper",
        RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2),
    ),
    (
        "encoder",
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    ),
])

# doj: derive calendar features from the journey date and scale them with MinMaxScaler
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline([
    (
        "dt",
        DatetimeFeatures(
            features_to_extract=feature_to_extract,
            yearfirst=True,
            format="mixed",
        ),
    ),
    ("scaler", MinMaxScaler()),
])

# source & destination: pool rare cities, mean-encode them, then power-transform
location_pipe1 = Pipeline([
    (
        "grouper",
        RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2),
    ),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer()),
])

def is_north(X):
    """Flag whether the source / destination city is in the "north" city list.

    Parameters
    ----------
    X : array-like or DataFrame
        Two columns interpreted as ``source`` and ``destination``.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_is_north`` and ``destination_is_north`` holding 1 when
        the city is in the hard-coded list and 0 otherwise; the original
        columns are dropped.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]

    # Wrapping in a DataFrame lets the function accept plain arrays as well.
    frame = pd.DataFrame(X, columns=["source", "destination"])
    names = frame.columns.to_list()

    flags = {}
    for name in names:
        flags[f"{name}_is_north"] = frame[name].isin(north_cities).astype(int)

    return frame.assign(**flags).drop(columns=names)

# Combine the encoded locations with the is_north indicator columns
location_transformer = FeatureUnion([
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north, validate=False)),
])

# dep_time & arrival_time: extract hour and minute, then scale with MinMaxScaler
time_pipe1 = Pipeline([
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler()),
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Bucket departure / arrival times into a part-of-day label.

    Parameters
    ----------
    X : array-like or DataFrame
        Two columns interpreted as ``dep_time`` and ``arrival_time``; values
        must be parseable by ``pd.to_datetime``.
    morning, noon, eve, night : int
        Hour boundaries. Each bucket is left-inclusive:
        ``[morning, noon)`` -> "morning", ``[noon, eve)`` -> "afternoon",
        ``[eve, night)`` -> "evening"; everything else -> "night".

    Returns
    -------
    pandas.DataFrame
        Columns ``dep_time_part_of_day`` and ``arrival_time_part_of_day``.
    """
    frame = pd.DataFrame(X, columns=["dep_time", "arrival_time"])
    names = frame.columns.to_list()

    # Reduce each timestamp to its hour of day.
    hours = frame.assign(
        **{name: pd.to_datetime(frame[name]).dt.hour for name in names}
    )

    labels = ["morning", "afternoon", "evening"]
    bounds = [(morning, noon), (noon, eve), (eve, night)]

    derived = {}
    for name in names:
        hour = hours[name]
        conditions = [hour.between(lo, hi, inclusive="left") for lo, hi in bounds]
        derived[f"{name}_part_of_day"] = np.select(conditions, labels, default="night")

    return hours.assign(**derived).drop(columns=names)

# Part-of-day labels -> count/frequency encoding -> MinMaxScaler
time_pipe2 = Pipeline([
    ("part", FunctionTransformer(func=part_of_day, validate=False)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler()),
])

# Combine the hour/minute features with the encoded part-of-day features
time_transformer = FeatureUnion([
    ("part1", time_pipe1),
    ("part2", time_pipe2),
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """Encode columns by their RBF-kernel similarity to fitted percentiles.

    For every selected column, ``fit`` stores the column's values at the
    requested percentiles; ``transform`` returns one output column per
    (input column, percentile) pair holding ``rbf_kernel`` between each row's
    value and that reference value.

    Parameters
    ----------
    variables : list or None
        Columns to encode. When falsy, all numeric columns seen in ``fit``
        are used.
    percentiles : list of float
        Quantiles (0-1) used as reference points.
    gamma : float
        ``gamma`` passed to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to an ``(n_percentiles, 1)`` array of quantile values.
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        # Constructor arguments are stored untouched; anything derived from the
        # data is written to trailing-underscore attributes in fit().
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self, X, y=None):
        """Resolve the columns to encode and store their percentile values."""
        frame = pd.DataFrame(X)

        # Keep the resolved column list separate from the `variables`
        # hyperparameter so that refitting on different data re-detects the
        # columns instead of reusing a stale list.
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = frame.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                frame.loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self

    def transform(self, X):
        """Return a DataFrame of ``<col>_rbf_<percentile>`` similarity columns."""
        X = pd.DataFrame(X, columns=self.variables_)  # Ensure X is a DataFrame

        # round() before int(): 0.29 * 100 is 28.999... in floating point and
        # plain truncation would label that column "28".
        percent_labels = [int(round(percentile * 100)) for percentile in self.percentiles]

        objects = []
        for col in self.variables_:
            columns = [f"{col}_rbf_{label}" for label in percent_labels]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)

def duration_category(X, short=180, med=400):
    """Bucket flight duration into "short", "medium" or "long".

    Parameters
    ----------
    X : array-like or DataFrame
        Single column interpreted as ``duration``.
    short, med : numeric
        Thresholds: ``duration < short`` -> "short",
        ``short <= duration < med`` -> "medium", anything else -> "long".

    Returns
    -------
    pandas.DataFrame
        One column, ``duration_cat``.
    """
    frame = pd.DataFrame(X, columns=["duration"])
    duration = frame["duration"]

    conditions = [
        duration.lt(short),
        duration.between(short, med, inclusive="left"),
    ]
    category = np.select(conditions, ["short", "medium"], default="long")

    return frame.assign(duration_cat=category).drop(columns="duration")

def is_over(X, value=1000):
    """Flag rows whose duration is at least ``value``.

    Parameters
    ----------
    X : array-like or DataFrame
        Single column interpreted as ``duration``.
    value : numeric
        Inclusive threshold; it also appears in the output column name.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column named ``duration_over_<value>``.
    """
    frame = pd.DataFrame(X, columns=["duration"])
    at_or_above = frame["duration"].ge(value).astype(int)
    flagged = frame.assign(**{f"duration_over_{value}": at_or_above})
    return flagged.drop(columns="duration")

# RBF similarity to the duration percentiles, then power-transformed
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# short / medium / long bucket, encoded in that explicit order
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category, validate=False)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# All duration-derived features side by side, plus the standardised raw value
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over, validate=False)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    # Winsorizer raises on NaN by default, which would make the median imputer
    # that follows unreachable. missing_values="ignore" lets NaNs pass through
    # the capping step so the imputer can fill them.
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5, missing_values="ignore")),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    """Append a 0/1 ``is_direct_flight`` column (1 when ``total_stops`` equals 0).

    Parameters
    ----------
    X : array-like or DataFrame
        Single column interpreted as ``total_stops``.

    Returns
    -------
    pandas.DataFrame
        The ``total_stops`` column followed by ``is_direct_flight``.
    """
    frame = pd.DataFrame(X, columns=["total_stops"])
    no_stops = frame["total_stops"].eq(0)
    return frame.assign(is_direct_flight=no_stops.astype(int))

# Mode-impute the stop count, then append the direct-flight indicator
total_stops_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("direct", FunctionTransformer(func=is_direct, validate=False)),
])

# additional_info: pool infrequent values under "Other", then one-hot encode
info_pipe1 = Pipeline([
    (
        "group",
        RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other"),
    ),
    (
        "encoder",
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    ),
])

def have_info(X):
    """Replace ``additional_info`` with a 0/1 flag: 0 for "No Info", 1 otherwise.

    Parameters
    ----------
    X : array-like or DataFrame
        Single column interpreted as ``additional_info``.

    Returns
    -------
    pandas.DataFrame
        One column, ``additional_info``, holding the integer flag.
    """
    # NOTE(review): the comparison is an exact, case-sensitive match, so the
    # "unknown" placeholder produced by the imputer in info_transformer is
    # counted as "has info" -- confirm that this is intended.
    frame = pd.DataFrame(X, columns=["additional_info"])
    has_details = frame["additional_info"].ne("No Info")
    return frame.assign(additional_info=has_details.astype(int))

# One-hot encoded info alongside the has-info flag
info_union = FeatureUnion([
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info, validate=False)),
])

# Fill missing values with the constant "unknown" before building either feature set
info_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union),
])

# column transformer: route each raw column (or column pair) to its own
# transformer; columns not listed are passed through unchanged
column_transformer = ColumnTransformer(
    [
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", 'destination']),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# feature selector: a small random forest is the estimator handed to the
# selector, with "r2" scoring and a 0.1 threshold
estimator = RandomForestRegressor(
    n_estimators=10,
    max_depth=3,
    random_state=42,
)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1,
)

# Preprocessor: feature engineering followed by feature selection
preprocessor = Pipeline([
    ("ct", column_transformer),
    ("selector", selector),
])


In [10]:
# Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, TransformerMixin
import optuna
from optuna.integration import OptunaSearchCV
import logging

# Set up logging: INFO level with "timestamp - level - message" records
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Function to evaluate model
def evaluate_model(model, X, y):
    """Score a fitted model on ``(X, y)``.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X, y :
        Features and true target values.

    Returns
    -------
    tuple
        ``(mse, r2)`` computed with ``mean_squared_error`` and ``r2_score``.
    """
    predictions = model.predict(X)
    return mean_squared_error(y, predictions), r2_score(y, predictions)

# Hyperparameter search spaces
def _suggest_params(trial, model_name):
    """Draw one set of pipeline hyperparameters for ``model_name`` from ``trial``.

    Trial parameter names are the bare estimator argument names (``alpha``,
    ``max_depth``, ...) so that ``study.best_params`` can later be passed
    straight to the estimator's ``set_params``. The returned keys carry the
    ``model__`` prefix that addresses the ``model`` step of the pipeline.

    ``suggest_float`` replaces the deprecated ``suggest_uniform`` and
    ``suggest_loguniform``; ``log=True`` keeps the log-scale sampling.
    Any name other than the first three is treated as XGBoost.
    """
    if model_name == "Elastic Net":
        return {
            'model__alpha': trial.suggest_float('alpha', 1e-5, 1, log=True),
            'model__l1_ratio': trial.suggest_float('l1_ratio', 0, 1)
        }
    if model_name == "Random Forest":
        return {
            'model__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'model__max_depth': trial.suggest_int('max_depth', 3, 30),
            'model__min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'model__min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
        }
    if model_name == "Gradient Boosting":
        return {
            'model__n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'model__learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
            'model__max_depth': trial.suggest_int('max_depth', 3, 10),
            'model__min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'model__min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10)
        }
    # XGBoost
    return {
        'model__n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'model__learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
        'model__max_depth': trial.suggest_int('max_depth', 3, 10),
        'model__min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'model__subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'model__colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }


# Main function
def main(train, test, val, n_trials=100, cv=5):
    """Select, tune, fit and evaluate the best regressor for ``price``.

    Steps:
      1. Cross-validate four candidate models (each behind the module-level
         ``preprocessor``) on the training data and pick the lowest CV MSE.
      2. Tune the winner with an Optuna study that minimises CV MSE.
      3. Refit the winner with the best parameters on the full training data.
      4. Log MSE and R^2 on the test and validation frames.

    Parameters
    ----------
    train, test, val : pandas.DataFrame
        Each must contain a ``price`` column, which is used as the target.
    n_trials : int, default 100
        Number of Optuna trials.
    cv : int, default 5
        ``cv`` argument passed to every ``cross_val_score`` call.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted ``preprocessor`` + tuned model pipeline.
    """
    # Separate features and target
    X_train, y_train = train.drop(columns='price'), train['price']
    X_test, y_test = test.drop(columns='price'), test['price']
    X_val, y_val = val.drop(columns='price'), val['price']

    # Models to evaluate
    models = {
        "Elastic Net": ElasticNet(),
        "Random Forest": RandomForestRegressor(n_jobs=-1),
        "Gradient Boosting": GradientBoostingRegressor(),
        "XGBoost": XGBRegressor(n_jobs=-1)
    }

    # Custom scorer: negated MSE, because scikit-learn scorers are maximised
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    def cv_mse(pipeline):
        """Mean cross-validated MSE (sign flipped back to positive)."""
        scores = cross_val_score(pipeline, X_train, y_train, cv=cv,
                                 scoring=mse_scorer, n_jobs=-1)
        return -np.mean(scores)

    # Evaluate models
    results = {}
    for name, model in models.items():
        logging.info(f"Evaluating {name}")
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', model)])
        results[name] = cv_mse(pipeline)
        logging.info(f'{name}: CV MSE: {results[name]}')

    # Select the best model
    best_model_name = min(results, key=results.get)
    best_model = models[best_model_name]
    logging.info(f"Best model: {best_model_name}")

    # Hyperparameter tuning for the best model using Optuna
    def objective(trial):
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', best_model)])
        pipeline.set_params(**_suggest_params(trial, best_model_name))
        return cv_mse(pipeline)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    logging.info(f'Best parameters for {best_model_name}: {best_params}')

    # Train the best model with the best parameters.
    # best_params keys are un-prefixed, so they apply directly to the estimator.
    best_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', best_model.set_params(**best_params))])
    best_pipeline.fit(X_train, y_train)

    # Test the best model on the test data
    test_mse, test_r2 = evaluate_model(best_pipeline, X_test, y_test)
    logging.info(f'Test MSE for {best_model_name}: {test_mse}')
    logging.info(f'Test R^2 for {best_model_name}: {test_r2}')

    # Validate the best model on the validation data
    val_mse, val_r2 = evaluate_model(best_pipeline, X_val, y_val)
    logging.info(f'Validation MSE for {best_model_name}: {val_mse}')
    logging.info(f'Validation R^2 for {best_model_name}: {val_r2}')

    return best_pipeline

    
    

In [11]:
# Run model selection, tuning and evaluation; the result is the fitted
# preprocessing + model pipeline returned by main().
best_model = main(train=train, test=test, val=val)

2024-08-07 00:46:50,491 - INFO - Evaluating Elastic Net
2024-08-07 00:46:57,013 - INFO - Elastic Net: CV MSE: 10467482.854187468
2024-08-07 00:46:57,013 - INFO - Evaluating Random Forest
2024-08-07 00:47:02,485 - INFO - Random Forest: CV MSE: 7956938.689803014
2024-08-07 00:47:02,486 - INFO - Evaluating Gradient Boosting
2024-08-07 00:47:05,664 - INFO - Gradient Boosting: CV MSE: 6699992.968437803
2024-08-07 00:47:05,664 - INFO - Evaluating XGBoost
2024-08-07 00:47:09,318 - INFO - XGBoost: CV MSE: 9330128.694275334
2024-08-07 00:47:09,318 - INFO - Best model: Gradient Boosting
[I 2024-08-07 00:47:09,320] A new study created in memory with name: no-name-ce4aa6ed-0153-4dfb-9647-21c6545e22ab
[I 2024-08-07 00:47:13,599] Trial 0 finished with value: 9733473.430982728 and parameters: {'n_estimators': 278, 'learning_rate': 0.6138063577494571, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 9733473.430982728.
[I 2024-08-07 00:47:18,319] Trial 1 finis

In [12]:
import pickle
import os

# Resolve the output location first, then make sure its directory exists
model_path = os.path.join('models', 'best_model.pkl')
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the fitted pipeline returned by main() with pickle
with open(model_path, mode='wb') as f:
    pickle.dump(best_model, f)

print(f"Model saved to {model_path}")

Model saved to models\best_model.pkl
