# **IITM DATA SCIENCE MLP Kaggle Assignment 1**

In this assignment, my task is to predict the price of flight tickets.

# **Importing Necessary Modules**

In [None]:
import warnings

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders import TargetEncoder
from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# **Configs**

In [None]:
# Show up to 200 columns when a DataFrame is rendered.
pd.options.display.max_columns = 200
# Silence every warning category for the remainder of the notebook.
warnings.simplefilter("ignore")

# **Reading data into dataframes**

In [None]:
# Competition input directory; both CSVs live side by side in it.
DATA_DIR = '/kaggle/input/mlp-term-2-2025-kaggle-assignment-1'

df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# **Schema of data**

Identify data types of different columns

In [None]:
# Print each training column's dtype and non-null count.
df.info()

# **Descriptive Statistics of numerical columns**

Present descriptive statistics of numerical columns

In [None]:
# Summary statistics of the numeric columns; `id` is excluded first.
df.drop(columns=['id']).describe()

# **Replacing Placeholder Values containing Scientific Notation**

In [None]:
# String cells that consist solely of a number in scientific notation
# (e.g. "1.23e+05") are treated as placeholders and turned into NaN.
SCI_NOTATION_PATTERN = r'^\d+\.\d+[eE][+-]?\d+$'

for frame in (df, test_df):
    frame.replace(
        to_replace=SCI_NOTATION_PATTERN,
        value=np.nan,
        regex=True,
        inplace=True,
    )

# **Null Values count of train dataset**

In [None]:
# Number of missing values per column in the training frame.
df.isna().sum()

# **Null values of test Dataset**

In [None]:
# Number of missing values per column in the test frame.
test_df.isna().sum()

# **Filling nulls**

In [None]:
# Imputation statistics are learned from the TRAINING frame only and then
# reused for the test frame, so both sets are filled with the same values.
# Computed before any filling happens; NaNs are skipped by `median`.
train_medians = df.median(numeric_only=True)

for frame in (df, test_df):
    for col in frame.columns:
        if frame[col].dtype == 'object':
            # categorical features with unknown
            frame[col] = frame[col].fillna("(unknown)")
        else:
            # numerical features with the training median; fall back to the
            # frame's own median when the column is not numeric in train
            if col in train_medians.index:
                fill_value = train_medians[col]
            else:
                fill_value = frame[col].median()
            # Assign the result back instead of `frame[col].fillna(inplace=True)`:
            # the chained in-place form is deprecated and does not update the
            # frame under pandas Copy-on-Write (warnings are suppressed above,
            # so that failure would be silent).
            frame[col] = frame[col].fillna(fill_value)

# **Check for nulls post filling**

In [None]:
# Re-count missing values in the training frame after imputation.
df.isna().sum()

In [None]:
# Re-count missing values in the test frame after imputation.
test_df.isna().sum()

# **Dropping duplicate tuples**

In [None]:
# Rows are duplicates when every column except the `id` identifier matches;
# the first occurrence is kept.
non_id_columns = df.columns.drop('id').tolist()
df = df.drop_duplicates(subset=non_id_columns)

# **Outlier Detection and Removal**

We will use the IQR method to detect outliers and cap them at the IQR outlier thresholds, which is a common practice in data cleanup tasks.

In [None]:
def _iqr_bounds(series, whisker=1.5):
    """Return the (lower, upper) Tukey fences of `series`: Q1/Q3 -/+ whisker * IQR."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - whisker * iqr, q3 + whisker * iqr


# Capping thresholds are learned from the TRAINING frame (before any capping)
# and reused for the test frame, so both sets are capped at the same values.
# `id` is an identifier, not a measurement, so it is never capped.
train_bounds = {
    col: _iqr_bounds(df[col])
    for col in df.columns
    if df[col].dtype != 'object' and col != 'id'
}

for frame in (df, test_df):
    for col in frame.columns:
        if frame[col].dtype == 'object' or col == 'id':
            continue

        # Fall back to the frame's own fences for a column that is numeric
        # here but was not numeric in the training frame.
        if col in train_bounds:
            lower_bound, upper_bound = train_bounds[col]
        else:
            lower_bound, upper_bound = _iqr_bounds(frame[col])

        # Assign back instead of `frame[col].clip(inplace=True)`: the chained
        # in-place form does not update the frame under pandas Copy-on-Write.
        frame[col] = frame[col].clip(lower=lower_bound, upper=upper_bound)

# **Graphical Plots for Visualization**

**Distribution of price**

In [None]:
fig, ax = plt.subplots(figsize=(12, 7))

# Histogram of the target with a KDE overlay.
sns.histplot(
    df['price'],
    bins=50,
    kde=True,
    color='skyblue',
    edgecolor='black',
    linewidth=0.5,
    ax=ax,
)

# Titles and labels
ax.set_title('Distribution of Flight Prices', fontsize=16, fontweight='bold')
ax.set_xlabel('Price', fontsize=13)
ax.set_ylabel('Count', fontsize=13)

# Grid and style
ax.grid(True, linestyle='--', alpha=0.4)
sns.despine(ax=ax)

ax.set_xlim(0, 100000)

# Mark the median price with a dashed vertical line and a label next to it.
price_median = df['price'].median()
ax.axvline(price_median, color='red', linestyle='--', linewidth=1)
label_height = ax.get_ylim()[1] * 0.9
ax.text(price_median + 1000, label_height, f'Median: {int(price_median)}', color='red')

fig.tight_layout()
plt.show()

**Days_left vs Price Plot**

In [None]:
sns.set(style="whitegrid")

# Scatter of price against days left, with a fitted regression line.
plot = sns.lmplot(
    data=df,
    x='days_left',
    y='price',
    scatter_kws={'alpha': 0.3, 's': 20},  # Lighter and smaller dots
    line_kws={'color': 'red', 'linewidth': 2},
    height=6,
    aspect=1.2
)

# lmplot returns a FacetGrid; with no faceting its single Axes is `plot.ax`.
ax = plot.ax

# Set plot labels and title
ax.set_title('Price vs Days Left Before Takeoff', fontsize=16, fontweight='bold')
ax.set_xlabel('Days Left', fontsize=12)
ax.set_ylabel('Price', fontsize=12)

# Limit x and y axis to reduce outlier impact (optional)
ax.set_xlim(0, 50)
ax.set_ylim(0, 100000)

# Thousands separators on the price axis. `ticker` (matplotlib.ticker) is
# imported in the notebook's import cell rather than in the middle of this cell.
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{int(x):,}'))

plt.tight_layout()
plt.show()


**Duration vs Price Scatterplot**

In [None]:
# Scatter of ticket price against flight duration.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='duration', y='price', alpha=0.5, ax=ax)
ax.set_title('Flight Duration vs. Price')
ax.set_xlabel('Duration (hours)')
ax.set_ylabel('Price')
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

**Class vs Price Boxplot**

In [None]:
# Box plot of ticket price for each travel class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='class', y='price', ax=ax)
ax.set_title('Price Distribution by Class')
ax.set_xlabel('Class')
ax.set_ylabel('Price')
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# **Categorizing Features for Encoding + Scaling**

In [None]:
# Numeric columns that will be standard-scaled.
num_cols = ['duration', 'days_left']

# Split the string-typed columns by cardinality: more than 15 distinct
# values -> target encoding, otherwise -> one-hot encoding.
object_cols = [name for name in df.columns if df[name].dtype == 'object']
high_cat_cols = [name for name in object_cols if df[name].nunique() > 15]  # for target-encoding
low_cat_cols = [name for name in object_cols if df[name].nunique() <= 15]  # for one-hot-encoding

**Training and Validation Split**

In [None]:
# Features are every column except the identifier and the target.
features = df.drop(columns=['id', 'price'])
target = df['price']

# Hold out 20% of the rows for validation, with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# removing id column from test_df
test_df = test_df.drop(columns=['id'])

# **Encoding + Scaling Features**

* Scaling Numerical Features
* One-Hot Encoding Categorical Features with few unique values
* Target-Encoding Categorical Features with a lot of unique values, to prevent dimension explosion.

**Fitting Target Encoder from Training Data + Defining Other 2 Transformers**

In [None]:
# Target encoder for the high-cardinality columns, fitted on the training split.
target_encoder = TargetEncoder()
target_encoder.fit(train_x[high_cat_cols], train_y)

# Scale the numeric columns, one-hot encode the low-cardinality columns and
# pass every remaining column through unchanged.
scaling_step = ('ss', StandardScaler(), num_cols)
one_hot_step = ('ohe', OneHotEncoder(handle_unknown='ignore'), low_cat_cols)
preprocessor = ColumnTransformer(
    transformers=[scaling_step, one_hot_step],
    remainder='passthrough',
)

# **Transforming features**

In [None]:
def _target_encode(frame):
    """Replace the high-cardinality columns of `frame` with their target-encoded values."""
    encoded_cols = target_encoder.transform(frame[high_cat_cols])
    return frame.drop(columns=high_cat_cols).join(encoded_cols)


def _to_dense(matrix):
    """Return `matrix` as a dense ndarray whether the transformer produced sparse or dense output."""
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


def _scale_and_one_hot(encoded_frame):
    """Apply the already-fitted preprocessor and return a DataFrame with named feature columns."""
    dense = _to_dense(preprocessor.transform(encoded_frame))
    return pd.DataFrame(dense, columns=feature_names)


# Fit the scaler / one-hot encoder on the TRAINING split only. Re-fitting on
# the validation and test sets would scale each set with different statistics
# and could yield different one-hot columns per set.
train_encoded = _target_encode(train_x)
preprocessor.fit(train_encoded)
feature_names = preprocessor.get_feature_names_out()

train_x = _scale_and_one_hot(train_encoded)
valid_x = _scale_and_one_hot(_target_encode(valid_x))
test_df = _scale_and_one_hot(_target_encode(test_df))

# All three sets must expose exactly the same features, in the same order.
assert list(train_x.columns) == list(valid_x.columns) == list(test_df.columns)

In [None]:
# Preview the first rows of the transformed training features.
train_x.head()

# **Defining Models, Initializing Fitted Models & Model Metric Dictionary**

In [None]:
# Shared seed for every estimator that accepts `random_state`.
SEED = 42

# name -> {"R2": ..., "RMSE": ...} and name -> fitted estimator, filled in later.
model_metric, fitted_model = {}, {}

# Candidate regressors, keyed by display name.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge(random_state=SEED)),
    ("Lasso Regression", Lasso(random_state=SEED)),
    ("Decision Tree Regressor", DecisionTreeRegressor(random_state=SEED)),
    ("Random Forest Regressor", RandomForestRegressor(random_state=SEED, n_jobs=-1)),
    ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=SEED)),
    ("XGBoost Regressor", XGBRegressor(random_state=SEED, n_jobs=-1)),
    ("LightGBM Regressor", LGBMRegressor(random_state=SEED, n_jobs=-1, verbosity=-1)),
    ("K-Neighbors Regressor", KNeighborsRegressor()),
])

# **Training Model and Storing the Fitted Model and its Performance Metric**

In [None]:
# Fit every candidate on the training split and score it on the validation split.
for name, estimator in models.items():
    print(f"---Training {name}---")
    fitted_model[name] = estimator.fit(train_x, train_y)

    predictions = estimator.predict(valid_x)
    scores = {
        "R2": r2_score(valid_y, predictions),
        "RMSE": np.sqrt(mean_squared_error(valid_y, predictions)),
    }
    model_metric[name] = scores

    print(f"RMSE: {scores['RMSE']:.3f}\nR2: {scores['R2']:.4f}\n")

# **Hyperparameter Tuning**

The following models are selected for tuning (baseline validation R2 scores shown):

* Random Forest Regressor: r2 score 0.9779
* XGBoost Regressor: r2 score 0.9746
* LightGBM Regressor: r2 score 0.9725

# **Defining Parameter Ranges**

In [None]:

# One search grid per model to tune; keys must match the names in `models`.
random_forest_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
)
xgboost_grid = dict(
    n_estimators=[50, 100],
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
)
lightgbm_grid = dict(
    n_estimators=[50],
    learning_rate=[0.05],
    num_leaves=[20],
)

params = {
    "Random Forest Regressor": random_forest_grid,
    "XGBoost Regressor": xgboost_grid,
    "LightGBM Regressor": lightgbm_grid,
}


# **Hypertuning**

In [None]:
# Grid-search each selected model with 3-fold CV on the training split, then
# score the best estimator on the validation split.
for model_name, param_grid in params.items():
    print(f"---Tuning {model_name}---")

    # Key shared by `fitted_model` and `model_metric`, so the name picked from
    # the performance table always resolves to the estimator that produced
    # that row. The untuned estimator stays available under `model_name`.
    tuned_name = f"{model_name} (tuned)"

    # GridSearchCV clones the estimator, so the already-fitted baseline in
    # `models` is not modified. The search optimises RMSE, the same metric
    # the models are ranked by afterwards.
    grid_search = GridSearchCV(
        models[model_name],
        param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=0,
    )
    grid_search.fit(train_x, train_y)

    tuned_model = grid_search.best_estimator_
    fitted_model[tuned_name] = tuned_model

    valid_pred = tuned_model.predict(valid_x)

    rmse = np.sqrt(mean_squared_error(valid_y, valid_pred))
    r2 = r2_score(valid_y, valid_pred)
    model_metric[tuned_name] = {"R2": r2, "RMSE": rmse}

    print(f"{grid_search.best_params_}\n")

# **Comparing Model Performance**

In [None]:
# One row per model, best (lowest RMSE) first.
performance_df = pd.DataFrame.from_dict(model_metric, orient='index').sort_values(by='RMSE')
performance_df

# **Selecting Final Model**

In [None]:
# The first row of the RMSE-sorted table names the winning model.
top_model_name = performance_df.index[0]
best_model = fitted_model[top_model_name]
print(best_model)

# **Final Model Training on Entire Dataset**

Combining train_x with valid_x and train_y with valid_y

In [None]:
# Stack the training and validation splits row-wise, features and target alike.
full_x = pd.concat((train_x, valid_x))
full_y = pd.concat((train_y, valid_y))

# **Training**

In [None]:
# Refit the selected model on the combined training + validation data.
best_model.fit(full_x, full_y)

# **Predicting Test Dataset**

In [None]:
# Predict test prices and floor them at zero (a price cannot be negative).
raw_test_pred = best_model.predict(test_df)
test_pred = np.clip(raw_test_pred, a_min=0, a_max=None)

# **Submitting Prediction**

In [None]:
# `test_df` no longer carries the `id` column (it was dropped before
# preprocessing), so the real ids are re-read from the test file instead of
# assuming they are 0..n-1. No test rows were removed or reordered, so the
# ids line up positionally with `test_pred`.
test_ids = pd.read_csv(
    '/kaggle/input/mlp-term-2-2025-kaggle-assignment-1/test.csv',
    usecols=['id'],
)['id']
assert len(test_ids) == len(test_pred), "id count does not match prediction count"

submission = pd.DataFrame({
    "id": test_ids.to_numpy(),
    "price": test_pred
})
submission.to_csv("submission.csv", index=False)
submission