# **IITM DATA SCIENCE MLP Kaggle Assignment 1**

In this assignment my task is to predict the price of Flight tickets. 

# Importing Libraries and Datasets

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import VotingRegressor
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-1/train.csv")
test = pd.read_csv("/kaggle/input/mlp-term-2-2025-kaggle-assignment-1/test.csv")

# Data Type Identification

The dataset contains the following columns with their respective data types:

- `id`: *int64* (numerical, identifier)
- `airline`: *object* (categorical)
- `flight`: *object* (categorical, may have high cardinality)
- `source`: *object* (categorical)
- `departure`: *object* (categorical/time)
- `stops`: *object* (categorical, can be converted to ordinal or integer)
- `arrival`: *object* (categorical/time)
- `destination`: *object* (categorical)
- `class`: *object* (categorical)
- `duration`: *float64* (numerical, duration of flight)
- `days_left`: *float64* (numerical, derived feature)
- `price`: *int64* (numerical, target variable)

In [3]:
train.dtypes

id               int64
airline         object
flight          object
source          object
departure       object
stops           object
arrival         object
destination     object
class           object
duration       float64
days_left      float64
price            int64
dtype: object

# Descriptive Statistics

In [4]:
train.describe()

Unnamed: 0,id,duration,days_left,price
count,40000.0,36987.0,35562.0,40000.0
mean,19999.5,12.004088,26.197936,20801.49025
std,11547.14972,7.108063,13.469232,22729.14842
min,0.0,0.83,1.0,1105.0
25%,9999.75,6.67,15.0,4687.0
50%,19999.5,11.08,26.0,7353.0
75%,29999.25,15.92,38.0,42521.0
max,39999.0,47.08,49.0,114704.0


# Missing Values Handling

Instead of dropping these rows, I decided to **impute them** to retain as much data as possible:

- **Numerical columns** (like `duration`, `days_left`) were imputed using the **median**.
- **Categorical columns** (like `airline`, `departure`, `stops`) were imputed using the **mode**.


In [5]:
train.isnull().sum()

id                0
airline        4613
flight            0
source            0
departure      4792
stops          2319
arrival           0
destination       0
class             0
duration       3013
days_left      4438
price             0
dtype: int64

# Duplicate Records Handling

In [6]:
train.duplicated().sum()

0

There are **no duplicate rows** in the dataset, so no action was required.

# import numpy as np
import pandas as pd

def mad_based_outlier_removal(df, cols, bounds=None):
    if bounds is None:
        bounds = {}
        for col in cols:
            median = df[col].median()
            mad = np.median(np.abs(df[col] - median))
            lower = median - 3 * mad
            upper = median + 3 * mad
            bounds[col] = (lower, upper)
    for col in cols:
        lower, upper = bounds[col]
        df[col] = df[col].clip(lower=lower, upper=upper)
    return df, bounds

# Identify numeric columns
numeric_cols = [col for col in train.columns if train[col].dtype != 'object']

# Calculate bounds on train, apply to both train and test
train, bounds = mad_based_outlier_removal(train, numeric_cols)
test, _ = mad_based_outlier_removal(test, numeric_cols, bounds)
Outlier Detection 

I used boxplots to visually inspect outliers in the numerical columns of the dataset.  
While some outliers are present, I decided **not to remove them** because the models I trained are primarily **tree-based models** (like Random Forest, XGBoost, etc.), which are inherently robust to outliers.

So, retaining these values does not negatively affect model performance and helps preserve the natural distribution of the data.


In [7]:
for i in (train, test):
    for col in i.columns:
        if i[col].dtype != 'object':
            Q1 = i[col].quantile(0.25)
            Q3 = i[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5*IQR
            upper_bound = Q3 + 1.5*IQR

            i[col].clip(lower=lower_bound, upper=upper_bound, inplace=True)

## Remove id column

In [8]:
train = train.drop(columns = ["id"])

In [9]:
test = test.drop(columns = ["id"])

# Data Preprocessing

In [10]:
X_train = train.drop(columns=["price"]) 

## Feature Encoding and Imputation

In [11]:
median_columns = ["duration", "days_left"]
ohe_columns = ["airline", "source", "departure", "arrival", "destination"]
ordinal_columns = ["flight", "stops", "class"]

In [12]:
column_transformer = ColumnTransformer(transformers=[
    ('median_col',  SimpleImputer(strategy='median'), median_columns),

    ('ohe_col', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ]), ohe_columns),

    ('ordinal_col', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ]), ordinal_columns)
])

X_train_transformed = column_transformer.fit_transform(X_train)
test_transformed = column_transformer.transform(test)

# Feature Scaling
Since we are using tree-based models (like Random Forest, XGBoost), feature scaling is not necessary as they are not sensitive to feature magnitudes.

# Model Building

In [13]:
X = X_train_transformed
y = train["price"]

## Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1437)

## Linear Regression

My **Linear Regression model** achieved an **R² score of 0.90**, which **clearly crosses the required threshold of 0.80**.

In [15]:
regression_model = LinearRegression()

regression_model.fit(X_train, y_train)
y_pred = regression_model.predict(X_test)

regression_r2_score = r2_score(y_test, y_pred)
regression_r2_score

0.9077473123917048

## Ridge Regression

This model also achieved an **R² score of 0.90**, which **crosses the required threshold of 0.80**, indicating strong predictive performance.

In [16]:
ridge_model = Ridge()

ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

ridge_r2_score = r2_score(y_test, y_pred)
ridge_r2_score

0.9077462454303461

## Lasso Regression
Lasso Regression was applied, and it achieved R² score of **0.90**.

In [17]:
lasso_model = Lasso()

lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

lasso_r2_score = r2_score(y_test, y_pred)
lasso_r2_score

0.9077376284473211

## Polynomial Regression (Degree 2)
Polynomial Regression with degree 2 was applied to capture non-linear patterns in the data. The model performed well, achieving an R² score of **0.94**

In [18]:
polynomial_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
polynomial_model.fit(X_train, y_train)

y_pred = polynomial_model.predict(X_test)
poly_r2_score = r2_score(y_test, y_pred)
poly_r2_score

0.9446493130366147

## K-Nearest Neighbors Regression
K-Nearest Neighbors (KNN) Regression was also applied. However, it performed relatively poorly compared to Linear, Lasso, and Ridge regressions. It achieved an R² score of **0.29**

In [19]:
KNN_model = KNeighborsRegressor()
KNN_model.fit(X_train, y_train)

y_pred = KNN_model.predict(X_test)
knn_r2_score = r2_score(y_test, y_pred)
knn_r2_score

0.2906872653007082

## Decision Tree Regression
Decision Tree Regression was applied  and delivered impressive performance. It achieved an R² score of **0.96**, outperforming Linear, Lasso, Ridge, and KNN models by effectively capturing complex patterns in the data.

In [20]:
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

y_pred = dt_model.predict(X_test)
dt_r2_score = r2_score(y_test, y_pred)
dt_r2_score

0.9634061929078703

## Random Forest Regression
The Random Forest model leveraged the power of ensemble learning by combining multiple decision trees. It captured complex patterns in the data effectively and delivered a high R² score of **0.97**, showcasing its strong predictive capability compared to other models.

In [21]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
rf_r2_score = r2_score(y_test, y_pred)
rf_r2_score

0.9792990904872252

## Gradient Boosting Regression
Gradient Boosting Regression achieved an R² score of **0.95**, slightly lower than Random Forest but still showing strong predictive performance.

In [22]:
gboost_model = GradientBoostingRegressor()
gboost_model.fit(X, y)

y_pred = gboost_model.predict(X_test)
gb_r2_score = r2_score(y_test, y_pred)
gb_r2_score

0.9552786617419292

##  XGBoost Regression
XGBoost Regression delivered the  R² score of **0.97** . Its powerful gradient boosting mechanism nailed the patterns in the data with precision.

In [23]:
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
xgb_r2_score = r2_score(y_test, y_pred)
xgb_r2_score

0.976568126433574

## LightGBM Regression
Last but not the least, LightGBM Regression achieved an impressive R² score of **0.97**. Fast, efficient, and nearly on par with XGBoost — it proved to be a solid contender in the lineup.

In [24]:
lgb_model = LGBMRegressor(random_state=42)
lgb_model.fit(X_train, y_train)

y_pred = lgb_model.predict(X_test)
lgb_r2_score = r2_score(y_test, y_pred)
lgb_r2_score

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 32000, number of used features: 35
[LightGBM] [Info] Start training from score 20762.366187


0.9730827677362757

# Hyperparameter Tuning & Model Selection

For hyperparameter tuning, I selected the top 3 performing models — **Random Forest**, **XGBoost**, and **LightGBM** — based on their high R² scores and overall performance.

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import optuna


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1400, 1500),
        'max_depth': trial.suggest_int('max_depth', 9, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.015, 0.07),
        'subsample': trial.suggest_float('subsample', 0.92, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.85, 0.98),
        'reg_alpha': trial.suggest_float('reg_alpha', 4.0, 7.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.4, 2.2),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'gamma': trial.suggest_float('gamma', 3.0, 3.6),
        'tree_method': 'hist',     # Use hist + device=cuda for GPU (XGBoost >= 2.0)
        'device': 'cuda',
        'random_state': 42,
        'n_jobs': -1
    }
    model = XGBRegressor(**params)
    return cross_val_score(model, X, y, cv=5, scoring='r2').mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print("✅ Best R² Score:", study.best_value)
print("🎯 Best Hyperparameters:", study.best_params)

[I 2025-07-07 10:15:20,523] A new study created in memory with name: no-name-fbd7afc1-9faf-4405-9595-1895a6c84aa8
[I 2025-07-07 10:15:58,279] Trial 0 finished with value: 0.9820034944381495 and parameters: {'n_estimators': 1422, 'max_depth': 9, 'learning_rate': 0.03758834900871876, 'subsample': 0.941574158745524, 'colsample_bytree': 0.9363018574222175, 'reg_alpha': 5.826046320266316, 'reg_lambda': 1.8726629265983334, 'min_child_weight': 5, 'gamma': 3.355874619203299}. Best is trial 0 with value: 0.9820034944381495.
[I 2025-07-07 10:16:40,487] Trial 1 finished with value: 0.9819002889880437 and parameters: {'n_estimators': 1465, 'max_depth': 9, 'learning_rate': 0.04842001900344684, 'subsample': 0.9789106913258045, 'colsample_bytree': 0.9175965404271255, 'reg_alpha': 5.454157107710865, 'reg_lambda': 2.0037119393163785, 'min_child_weight': 1, 'gamma': 3.5446494016826082}. Best is trial 0 with value: 0.9820034944381495.
[I 2025-07-07 10:17:16,384] Trial 2 finished with value: 0.98194898163

## Model Selection 
After comparing the R² scores of all three tuned models — Random Forest (**0.9793**), LightGBM (**0.9789**), and XGBoost (**0.9812**) — **XGBoost** emerged as the top performer. Therefore, it was selected as the final model for training and prediction.


In [None]:
#RANDOM FOREST REGRESSION
rf_model = RandomForestRegressor(n_estimators=700,max_depth=24,min_samples_split=4,min_samples_leaf=1,max_features=None,random_state=42,n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_val = rf_model.predict(X_test)
rf_r2_score = r2_score(y_test, y_pred_val)

#XGBOOT REGRESSION
xgb_model = XGBRegressor(
    n_estimators=1435,
    max_depth=9,
    learning_rate=0.02696991017898118,
    subsample=0.9545380004264244,
    colsample_bytree=0.8954209312950507,
    reg_alpha=5.532947360465362,
    reg_lambda=1.906818391216358,
    min_child_weight=2,
    gamma=3.0671639571913594,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)


xgb_model.fit(X_train, y_train)
y_pred_val = xgb_model.predict(X_test)
xgb_r2_score = r2_score(y_test, y_pred_val)

#LIGHTGBM REGRESSION
lgb_model = LGBMRegressor(n_estimators=1000,max_depth=15,learning_rate=0.17236,subsample=0.7715,colsample_bytree=0.9248,reg_alpha=8.9567,reg_lambda=1.5674,min_child_weight=1,min_split_gain=0.4894,random_state=42,n_jobs=-1,verbose=-1)
lgb_model.fit(X_train, y_train)
y_pred_val = lgb_model.predict(X_test)
lgb_r2_score = r2_score(y_test, y_pred_val)

print(f"Tuned Random Forest R² Score: {rf_r2_score:.4f}")
print(f"Tuned XGBoost R² Score: {xgb_r2_score:.4f}")
print(f"Tuned LightGBM R² Score: {lgb_r2_score:.4f}")

# Final Model and Submission
Now, I am using 100% of the training data to fit the final model.

In [None]:
xgb_model = XGBRegressor(
    n_estimators=1431,
    max_depth=9,
    learning_rate=0.02682794029820691,
    subsample=0.9534150710821826,
    colsample_bytree=0.9042762617787571,
    reg_alpha=6.403811137420917,
    reg_lambda=2.0333285857846137,
    min_child_weight=2,
    gamma=3.2142404785425622,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X, y)


y_pred = xgb_model.predict(test_transformed)
submission_df = pd.DataFrame({
    'id': range(10000),
    'price': y_pred
})


submission_df.to_csv('submission.csv', index=False)