In [1]:
# Cell: imports & prepare data
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# load your dataframe (update filename if different)
df = pd.read_csv("Clean_Dataset.csv")  # or your filename
# If you already have df in notebook, skip reading; otherwise ensure df exists
print("Loaded df shape:", df.shape)
df.head()


Loaded df shape: (300153, 12)


Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [2]:
if "unnamed: 0" in (c.lower() for c in df.columns):
    df = df.drop([c for c in df.columns if c.lower()=="unnamed: 0"], axis=1)

# normalize column names
df.columns = [c.strip().lower() for c in df.columns]

# Ensure duration_mins exists (if your earlier processing created it, this will override safely)
import re
def duration_to_mins(x):
    if pd.isna(x): return np.nan
    s = str(x).lower().strip()
    if ":" in s:
        try:
            h,m = s.split(":")
            return int(h)*60 + int(m)
        except: pass
    h_match = re.search(r"(\d+)\s*h", s)
    m_match = re.search(r"(\d+)\s*m", s)
    if h_match or m_match:
        h = int(h_match.group(1)) if h_match else 0
        m = int(m_match.group(1)) if m_match else 0
        return h*60 + m
    digits = re.findall(r"\d+", s)
    if len(digits)==1:
        val = int(digits[0])
        return val if val>12 else val*60
    return np.nan

if "duration_mins" not in df.columns:
    if "duration" in df.columns:
        df["duration_mins"] = df["duration"].apply(duration_to_mins)
        df["duration_mins"].fillna(df["duration_mins"].median(), inplace=True)
    else:
        raise ValueError("No duration/duration_mins column found. Adjust code to match your dataset.")

# parse stops to numeric if needed
def parse_stops(x):
    if pd.isna(x): return 0
    s = str(x).lower()
    if "non" in s or "direct" in s or "non-stop" in s: return 0
    m = re.search(r"(\d+)", s)
    return int(m.group(1)) if m else 0

if "stops_num" not in df.columns:
    if "stops" in df.columns:
        df["stops_num"] = df["stops"].apply(parse_stops)
    else:
        raise ValueError("No stops/stops_num column found. Adjust code to match your dataset.")

# parse time columns if needed (create dep_hour, dep_min, arr_hour, arr_min)
def parse_time_col(col_name, hour_col, min_col):
    if hour_col in df.columns and min_col in df.columns:
        return
    dt = pd.to_datetime(df[col_name], errors="coerce")
    mask = dt.isna()
    if mask.any():
        def fix_time(val):
            if pd.isna(val): return pd.NaT
            s = str(val)
            digits = "".join(ch for ch in s if ch.isdigit())
            if len(digits)==4:
                try:
                    return pd.to_datetime(digits[:2]+":"+digits[2:], format="%H:%M", errors="coerce")
                except:
                    return pd.NaT
            return pd.NaT
        dt.loc[mask] = df.loc[mask, col_name].apply(fix_time)
    df[hour_col] = dt.dt.hour.fillna(df[hour_col].median() if hour_col in df.columns else 0).astype(int)
    df[min_col]  = dt.dt.minute.fillna(0).astype(int)

# run only if columns missing
if "dep_hour" not in df.columns or "dep_min" not in df.columns:
    parse_time_col("departure_time", "dep_hour", "dep_min")
if "arr_hour" not in df.columns or "arr_min" not in df.columns:
    parse_time_col("arrival_time", "arr_hour", "arr_min")

# normalize travel class
if "travel_class" not in df.columns:
    if "class" in df.columns:
        df["travel_class"] = df["class"].astype(str).str.strip().str.lower()
    else:
        df["travel_class"] = "economy"

# ensure days_left and price exist
if "days_left" not in df.columns or "price" not in df.columns:
    raise ValueError("Required columns missing: days_left and/or price")

# select features
features = [
    "airline", "source_city", "destination_city", "travel_class", "stops_num",
    "duration_mins", "dep_hour", "dep_min", "arr_hour", "arr_min", "days_left"
]
missing = [f for f in features if f not in df.columns]
if missing:
    raise ValueError("Missing feature columns: " + ", ".join(missing))

X = df[features].copy()
y = df["price"].copy()

print("Prepared X shape:", X.shape, "y length:", len(y))
X.head()


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["duration_mins"].fillna(df["duration_mins"].median(), inplace=True)
  dt = pd.to_datetime(df[col_name], errors="coerce")
  dt = pd.to_datetime(df[col_name], errors="coerce")


Prepared X shape: (300153, 11) y length: 300153


Unnamed: 0,airline,source_city,destination_city,travel_class,stops_num,duration_mins,dep_hour,dep_min,arr_hour,arr_min,days_left
0,SpiceJet,Delhi,Mumbai,economy,0,,0,0,0,0,1
1,SpiceJet,Delhi,Mumbai,economy,0,,0,0,0,0,1
2,AirAsia,Delhi,Mumbai,economy,0,,0,0,0,0,1
3,Vistara,Delhi,Mumbai,economy,0,,0,0,0,0,1
4,Vistara,Delhi,Mumbai,economy,0,,0,0,0,0,1


In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

preds = model_small.predict(X_test)
mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))   # <- compute RMSE this way
r2 = r2_score(y_test, preds)

print(f"Evaluation -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")


Evaluation -> MAE: 3414.72, RMSE: 6366.13, R2: 0.9214


In [7]:
import os
import joblib

out_name = "SkyFare-Predictor.pkl"

# Compress the model to reduce file size
joblib.dump(model_small, out_name, compress=3)

# Show size after saving
size_bytes = os.path.getsize(out_name)
print(f"Saved {out_name} -> {size_bytes/1024/1024:.2f} MB")

Saved SkyFare-Predictor.pkl -> 4.14 MB
