In [19]:
# ml_prediction_pipeline.py

import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# =========================
# 1. Load Dataset
# =========================
df = pd.read_excel(r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\raw_docs\Flight_Price_Data\Data_Train.xlsx")
df.dropna(inplace=True)

# =========================
# 2. Feature Engineering
# =========================
def convert_duration(x):
    hrs = 0
    mins = 0
    if 'h' in x:
        hrs = int(x.split('h')[0].strip())
        x = x.split('h')[1]
    if 'm' in x:
        mins = int(x.split('m')[0].strip())
    return hrs * 60 + mins

df['Journey_day'] = pd.to_datetime(df['Date_of_Journey']).dt.day
df['Journey_month'] = pd.to_datetime(df['Date_of_Journey']).dt.month
df['Dep_hour'] = pd.to_datetime(df['Dep_Time']).dt.hour
df['Dep_min'] = pd.to_datetime(df['Dep_Time']).dt.minute
df['Arrival_hour'] = pd.to_datetime(df['Arrival_Time']).dt.hour
df['Arrival_min'] = pd.to_datetime(df['Arrival_Time']).dt.minute
df['Duration_mins'] = df['Duration'].apply(convert_duration)

df.drop(columns=['Route', 'Additional_Info', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Duration'], inplace=True)

# =========================
# 3. Label Encoding
# =========================
encoder_dir = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\models\encoders"
os.makedirs(encoder_dir, exist_ok=True)

categorical_cols = ['Airline', 'Source', 'Destination', 'Total_Stops']
encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    values = df[col].fillna("Unknown").unique().tolist()
    if "Unknown" not in values:
        values.append("Unknown")
    le.fit(values)

    df[col] = df[col].fillna("Unknown").apply(lambda x: x if x in values else "Unknown")
    df[col] = le.transform(df[col])

    joblib.dump(le, os.path.join(encoder_dir, f"{col}_encoder.pkl"))
    encoders[col] = le

# =========================
# 4. Split Data
# =========================
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =========================
# 5. Hyperparameter Tuning - XGBoost
# =========================
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_search = RandomizedSearchCV(
    xgb,
    param_distributions=xgb_param_grid,
    n_iter=15,
    cv=3,
    verbose=1,
    n_jobs=-1,
    scoring='r2',
    random_state=42
)
xgb_search.fit(X_train, y_train)
xgb_best = xgb_search.best_estimator_

# Evaluate XGBoost
xgb_pred = xgb_best.predict(X_test)
xgb_mae = mean_absolute_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)
print(f"XGBoost ➤ MAE: {xgb_mae:.2f}, R2: {xgb_r2:.2f}")

# =========================
# 6. Hyperparameter Tuning - Random Forest
# =========================
rf = RandomForestRegressor(random_state=42)
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_search = RandomizedSearchCV(
    rf,
    param_distributions=rf_param_grid,
    n_iter=15,
    cv=3,
    verbose=1,
    n_jobs=-1,
    scoring='r2',
    random_state=42
)
rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_

# Evaluate Random Forest
rf_pred = rf_best.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
print(f"Random Forest ➤ MAE: {rf_mae:.2f}, R2: {rf_r2:.2f}")

# =========================
# 7. Select and Save Best Model
# =========================
model_dir = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\models"
os.makedirs(model_dir, exist_ok=True)

if xgb_r2 > rf_r2:
    best_model = xgb_best
    model_name = "flight_price_model_xgb.joblib"
    print("✅ XGBoost selected as best model")
else:
    best_model = rf_best
    model_name = "flight_price_model_rf.joblib"
    print("✅ Random Forest selected as best model")

joblib.dump(best_model, os.path.join(model_dir, model_name))


  df['Journey_day'] = pd.to_datetime(df['Date_of_Journey']).dt.day
  df['Journey_month'] = pd.to_datetime(df['Date_of_Journey']).dt.month
  df['Dep_hour'] = pd.to_datetime(df['Dep_Time']).dt.hour
  df['Dep_min'] = pd.to_datetime(df['Dep_Time']).dt.minute
  df['Arrival_hour'] = pd.to_datetime(df['Arrival_Time']).dt.hour
  df['Arrival_min'] = pd.to_datetime(df['Arrival_Time']).dt.minute


Fitting 3 folds for each of 15 candidates, totalling 45 fits
XGBoost ➤ MAE: 1200.59, R2: 0.85
Fitting 3 folds for each of 15 candidates, totalling 45 fits


18 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\venv\lib\site-packages\sklearn\base.py", line 1356, in wrapper
    estimator._validate_params()
  File "c:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\venv\lib\site-packages\sklearn\base.py", line 469, in _validate_params
    val

Random Forest ➤ MAE: 1199.23, R2: 0.82
✅ XGBoost selected as best model


['C:\\Users\\naray\\OneDrive\\Pictures\\Desktop\\01. My Learning\\new\\smart-travel-advisor\\models\\flight_price_model_xgb.joblib']

In [15]:
import pandas as pd
import joblib
import os

# =========================
# 1. Load Test Set
# =========================
test_df = pd.read_excel(r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\raw_docs\Flight_Price_Data\Test_set.xlsx")

# =========================
# 2. Feature Engineering
# =========================
def convert_duration(x):
    hrs = 0
    mins = 0
    if 'h' in x:
        hrs = int(x.split('h')[0].strip())
        x = x.split('h')[1]
    if 'm' in x:
        mins = int(x.split('m')[0].strip())
    return hrs * 60 + mins

# Use dayfirst=True to parse Indian-style dates
test_df['Journey_day'] = pd.to_datetime(test_df['Date_of_Journey'], dayfirst=True).dt.day
test_df['Journey_month'] = pd.to_datetime(test_df['Date_of_Journey'], dayfirst=True).dt.month
test_df['Dep_hour'] = pd.to_datetime(test_df['Dep_Time']).dt.hour
test_df['Dep_min'] = pd.to_datetime(test_df['Dep_Time']).dt.minute
test_df['Arrival_hour'] = pd.to_datetime(test_df['Arrival_Time']).dt.hour
test_df['Arrival_min'] = pd.to_datetime(test_df['Arrival_Time']).dt.minute
test_df['Duration_mins'] = test_df['Duration'].apply(convert_duration)

# Drop unused columns
test_df.drop(columns=['Route', 'Additional_Info', 'Date_of_Journey', 'Dep_Time', 'Arrival_Time', 'Duration'], inplace=True)

# =========================
# 3. Load Encoders & Encode with Unknown Handling
# =========================
def safe_transform(encoder, val):
    return encoder.transform([val if val in encoder.classes_ else "Unknown"])[0]

categorical_cols = ['Airline', 'Source', 'Destination', 'Total_Stops']

for col in categorical_cols:
    encoder = joblib.load(f"encoders/{col}_encoder.pkl")
    test_df[col] = test_df[col].fillna("Unknown").apply(lambda x: safe_transform(encoder, x))

# =========================
# 4. Ensure Column Order Matches Training Data
# =========================
columns_used_for_training = [
    'Airline', 'Source', 'Destination', 'Total_Stops',
    'Journey_day', 'Journey_month', 'Dep_hour', 'Dep_min',
    'Arrival_hour', 'Arrival_min', 'Duration_mins'
]

test_df = test_df[columns_used_for_training]

# =========================
# 5. Load Trained Model & Predict
# =========================
model = joblib.load("models/flight_price_model_rf.joblib")
predictions = model.predict(test_df)

# =========================
# 6. Format Submission File
# =========================
submission = pd.read_excel(r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\notebooks\Final_submission.xlsx")
submission['new_Price'] = predictions

submission.to_excel("Final_submission.xlsx", index=False)
print("✅ Final_submission.xlsx has been saved successfully.")


  test_df['Dep_hour'] = pd.to_datetime(test_df['Dep_Time']).dt.hour
  test_df['Dep_min'] = pd.to_datetime(test_df['Dep_Time']).dt.minute
  test_df['Arrival_hour'] = pd.to_datetime(test_df['Arrival_Time']).dt.hour
  test_df['Arrival_min'] = pd.to_datetime(test_df['Arrival_Time']).dt.minute


✅ Final_submission.xlsx has been saved successfully.
