In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSVs (same order as the path tuple).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/mlp-term-2-2025-kaggle-assignment-1/train.csv",
        "/kaggle/input/mlp-term-2-2025-kaggle-assignment-1/test.csv",
    )
)

In [None]:
# Show the (rows, columns) shape of the training frame.
train.shape

In [None]:
# Show the (rows, columns) shape of the test frame.
test.shape

## DATA INFORMATION

In [None]:
# Print column names, non-null counts and dtypes for the training frame.
train.info()

In [None]:
# List the names of the training columns stored with object dtype.
train.select_dtypes(include="object").columns.tolist()

In [None]:
# Print column names, non-null counts and dtypes for the test frame.
test.info()

In [None]:
# List the names of the test columns stored with object dtype.
test.select_dtypes(include="object").columns.tolist()

### In both the train and test datasets, 8 columns are of object type and the remaining columns are numeric


## DATA STATISTICS

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the training frame.
train.describe()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the test frame.
test.describe()

## HANDLE MISSING VALUES

In [None]:
# Number of missing values in each training column.
train.isnull().sum()


In [None]:
# Total number of missing cells across the whole training frame.
train.isnull().sum().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer

# Numeric columns whose missing values are filled by 5-nearest-neighbour imputation.
num_cols = ['duration', 'days_left']
knn_imputer = KNNImputer(n_neighbors=5)

# Fit on the training rows only, then reuse the fitted imputer for both frames.
knn_imputer.fit(train[num_cols])
train[num_cols] = knn_imputer.transform(train[num_cols])
test[num_cols] = knn_imputer.transform(test[num_cols])

# Remaining missing counts for the imputed columns (train first, then test).
for frame in (train, test):
    print(frame[num_cols].isna().sum())

In [None]:
# Categorical columns whose missing values are replaced by the most frequent training value.
cat_cols = ['airline', 'departure', 'stops']
simple_imputer = SimpleImputer(strategy='most_frequent')

# The modes are learned from train and then applied to both frames.
simple_imputer.fit(train[cat_cols])
train[cat_cols] = simple_imputer.transform(train[cat_cols])
test[cat_cols] = simple_imputer.transform(test[cat_cols])

# Per-column missing counts over every column (train first, then test).
for frame in (train, test):
    print(frame.isna().sum())

In [None]:
# Re-check per-column missing counts in train after the imputation cells.
train.isnull().sum()

In [None]:
# Total number of missing cells left in the test frame after imputation.
test.isnull().sum().sum()

In [None]:
# Show the (rows, columns) shape of the test frame after imputation.
test.shape

## Identify and handle duplicates

In [None]:
# Rows of train that exactly repeat an earlier row (first occurrences are not flagged).
dup_mask = train.duplicated()
duplicated = train.loc[dup_mask]
print(duplicated)

In [None]:
# Rows of test that exactly repeat an earlier row (first occurrences are not flagged).
dup_mask = test.duplicated()
duplicated = test.loc[dup_mask]
print(duplicated)

In [None]:
# Number of fully duplicated rows: train first, then test.
for frame in (train, test):
    print(frame.duplicated().sum())

## Identify and handle outliers

## Capping outliers with the IQR method

In [None]:
def cap_outliers_iqr(train, columns=None, factor=1.5):
    """Return a copy of ``train`` with columns clipped to their IQR fences.

    For each selected column the fences ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are computed from ``train`` itself, and values
    outside them are replaced by the nearest fence. No rows are removed and
    the input frame is not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to cap. Despite the parameter name, any frame may be passed.
    columns : iterable of str, optional
        Columns to cap. When omitted, every numeric column is capped (the
        original behaviour). Pass an explicit list to leave identifier or
        target columns untouched.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        Capped copy of ``train`` with the same rows and columns.
    """
    df_capped = train.copy()

    if columns is None:
        columns = train.select_dtypes(include='number').columns

    for col in columns:
        # Both quartiles come from the untouched input, not the partly capped copy.
        q1, q3 = train[col].quantile([0.25, 0.75])
        spread = q3 - q1
        df_capped[col] = train[col].clip(
            lower=q1 - factor * spread,
            upper=q3 + factor * spread,
        )

    return df_capped


In [None]:

# Cap every numeric column of train; capping replaces values and never drops rows.
train_capped = cap_outliers_iqr(train)

print("Original rows:", len(train))
print("Rows after capping:", len(train_capped))  # Should be the same


In [None]:

# Cap every numeric column of test.
# NOTE(review): the fences are computed from test's own quartiles, not from train's.
test_capped = cap_outliers_iqr(test)

print("Original rows:", len(test))
print("Rows after capping:", len(test_capped))  # Should be the same


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per capped numeric column: train columns first, then test.
# Columns missing from a frame are skipped silently.
for frame, label, candidates in (
    (train_capped, 'Train', ['duration', 'days_left', 'price']),
    (test_capped, 'Test', ['duration', 'days_left']),
):
    for col in candidates:
        if col not in frame.columns:
            continue
        sns.boxplot(x=frame[col])
        plt.title(f'Boxplot of {col} ({label})')
        plt.show()

In [None]:
# Show the (rows, columns) shape of the capped test frame.
test_capped.shape

## visualizations

In [None]:

import pandas as pd
import numpy as np


import seaborn as sns
import matplotlib.pyplot as plt


**1. Number of Flights by Airline**

In [None]:
# Horizontal count plot of rows per airline, ordered by descending frequency.
fig, ax = plt.subplots(figsize=(12, 5))
airline_order = train_capped['airline'].value_counts().index
sns.countplot(data=train_capped, y='airline', order=airline_order, ax=ax)
ax.set_title("‚úàÔ∏è Number of Flights by Airline")
ax.set_xlabel("Flight Count")
ax.set_ylabel("Airline")
fig.tight_layout()
plt.show()


## NOTE
 * Vistara, Air_India, and SpiceJet appear to be the most frequent airlines.

* Some airlines (like GoAir or Trujet) have significantly fewer flights

**2. Price Distribution by Class**

In [None]:
# Boxplot of the (capped) price for each travel class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_capped, x='class', y='price', ax=ax)
ax.set_title("üí∞ Price Distribution by Class")
ax.set_xlabel("Travel Class")
ax.set_ylabel("Price (INR)")
fig.tight_layout()
plt.show()


## NOTE
* Business class tickets have a significantly higher median price than economy, as expected.

* There's also more variability and more outliers in business class prices, which is natural due to premium pricing.

**3. Average Flight Price vs. Days Left Before Departure**


In [None]:


# Mean price for each distinct days_left value, ordered by days_left.
mean_price_by_day = train_capped.groupby('days_left')['price'].mean()
price_trend = mean_price_by_day.sort_index()

fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=price_trend.index, y=price_trend.values, ax=ax)
ax.set_title("üìà Average Flight Price vs. Days Left Before Departure")
ax.set_xlabel("Days Left Before Departure (booking earlier ‚Üí right)")
ax.set_ylabel("Average Price (INR)")
fig.tight_layout()
plt.show()


In [None]:
# Same relationship drawn directly from the rows; seaborn aggregates repeated x values.
sns.lineplot(data=train_capped, x='days_left', y='price')


## NOTE
1. Early Booking = Lower Prices (Usually)
At higher days_left (e.g., 30–60 days), you might see lower average prices.

Airlines often offer discounts to fill seats early.

2. Last-Minute Spike
As days_left approaches 0 (e.g., 0–3 days), prices often increase sharply.

This is due to demand urgency, fewer available seats, and business travelers.

3. Mid-Range Dip or Plateau
Sometimes, a small price dip appears in the middle range (e.g., 7–15 days).

Could indicate price optimization periods or deals.

In [None]:
# Print column names, non-null counts and dtypes for the capped training frame.
train_capped.info()

## split data

In [None]:
from sklearn.model_selection import train_test_split


# Features are every column except the row identifier and the target.
# NOTE(review): the target comes from train_capped, so price has been IQR-capped too.
X = train_capped.drop(["id", "price"], axis=1)
y = train_capped.loc[:, 'price']

# Hold out 20% of the rows for validation; the split is seeded for repeatability.
X_train_r, X_val_r, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Scale Numerical features and Encode Categorical features

In [None]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Numerical and categorical features
# Columns to be standardised.
num_cols = [
    'duration',
    'days_left',
]

# Columns to be one-hot encoded.
nom_cols = [
    'airline',
    'flight',
    'source',
    'departure',
    'stops',
    'arrival',
    'destination',
    'class',
]


In [None]:

# Standardise the numeric columns and one-hot encode the nominal ones.
# Categories first seen at transform time are encoded as all zeros
# (handle_unknown='ignore') instead of raising.
num_transformer = StandardScaler()
nom_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('nom', nom_transformer, nom_cols),
    ],
    remainder="passthrough",  # columns in neither list are passed through unchanged
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Scaling statistics and category vocabularies are learned from the training split only.
pipeline.fit(X_train_r)

# The test frame goes through the same fitted pipeline, minus its identifier column.
X_test_r = test_capped.drop(columns=['id'])
X_train, X_val, X_test_transformed = (
    pipeline.transform(raw) for raw in (X_train_r, X_val_r, X_test_r)
)

feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Labelled DataFrame views of the transformed matrices.
X_train_df, X_val_df, X_test_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train, X_val, X_test_transformed)
)



### Applies OneHotEncoder to all listed categorical features. Applies StandardScaler to duration and days_left, standardizing them (zero mean, unit variance).

## Model Building

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Baseline comparison: fit each candidate with default hyper-parameters on the
# training split and score it (R^2) on the validation split.
# Estimators that use randomness are seeded with the same random_state=42 used
# by the split and the tuning cells, so the ranking is reproducible across runs.
s = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42)
}

# Maps model name -> validation R^2 rounded to 4 decimals.
results = {}

for name, model in s.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = r2_score(y_val, y_pred)
    results[name] = round(score, 4)


# A separate loop variable keeps `model` bound to an estimator, not a name string.
for model_name, score in results.items():
    print(f"{model_name}: R¬≤ = {score}")

In [None]:
# Pick the baseline with the highest validation R^2 (first one wins on ties).
best_model_name, best_r2 = max(results.items(), key=lambda pair: pair[1])

In [None]:
# Name and score of the winning baseline, one per line.
print(best_model_name, best_r2, sep="\n")

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# Exhaustive search over a small grid of forest size, depth and split threshold.
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
)
model = RandomForestRegressor(random_state=42)

# Grid search with 3-fold CV, scored by R^2, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Score the refitted best estimator on the held-out validation split.
y_pred_r = grid_search.predict(X_val)
r2_rf = r2_score(y_val, y_pred_r)

print(f" Random Forest Best R¬≤: {r2_rf:.4f}")
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score

from scipy.stats import uniform, randint

# Randomized hyper-parameter search for XGBoost.
model = XGBRegressor(random_state=42)

# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(1, 3)
}

# 30 sampled configurations, each scored by 3-fold CV R^2.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=30,
    scoring='r2',
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
print("Best CV R¬≤ Score:", random_search.best_score_)


# This score is computed on the validation split, not on the competition test
# set (which has no labels), so it is labelled as a validation score.
y_pred = random_search.predict(X_val)
r2_xb = r2_score(y_val, y_pred)
print("Validation R¬≤ Score:", r2_xb)


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# Exhaustive search over depth, split/leaf sizes and split criterion.
dt_param_grid = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['squared_error', 'friedman_mse'],
)

# 3-fold CV scored by R^2, using all available cores.
dt_grid = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    dt_param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
dt_grid.fit(X_train, y_train)

# Score the refitted best tree on the held-out validation split.
y_pred_dt = dt_grid.predict(X_val)
r2_dt = r2_score(y_val, y_pred_dt)

print(f" Best Decision Tree R¬≤ on Validation Set: {r2_dt:.4f}")
print(" Best Hyperparameters:", dt_grid.best_params_)



## Comparison of the best models

In [None]:
# Validation R^2 of each tuned model, keyed by display name.
model_scores = {
    'Decision Tree': r2_dt,
    'Random Forest': r2_rf,
    'XGBoostR': r2_xb,
}

# Highest score wins; on a tie the earliest entry is kept.
best_model_name = max(model_scores.items(), key=lambda pair: pair[1])[0]
print(f" Best Model Based on Validation R¬≤: {best_model_name}")

In [None]:
# Map each display name to the best estimator found by its search, then pick the winner.
tuned_estimators = {
    'Decision Tree': dt_grid.best_estimator_,
    'Random Forest': grid_search.best_estimator_,
    'XGBoostR': random_search.best_estimator_,
}
best_model = tuned_estimators[best_model_name]

In [None]:

# Score the selected model on the validation split BEFORE refitting it.
# At this point best_model is a search's best_estimator_, which was fitted on
# X_train only, so the validation rows are still unseen. Scoring after the
# refit below would evaluate the model on rows it was trained on and report
# an inflated R^2.
# Run this cell once after model selection: re-running it on its own would
# score the already refitted model.
X_val_transformed = pipeline.transform(X_val_r)
y_val_pred_final = best_model.predict(X_val_transformed)
final_r2 = r2_score(y_val, y_val_pred_final)
print(f" Final R¬≤ on Validation (before refitting on train + validation): {final_r2:.4f}")

# Refit the chosen model on all labelled rows (train + validation) so the
# submission model uses every available example. The preprocessing pipeline
# stays as fitted on the training split.
X_full_r = pd.concat([X_train_r, X_val_r])
y_full = pd.concat([y_train, y_val])
X_full_transformed = pipeline.transform(X_full_r)
best_model.fit(X_full_transformed, y_full)

# Predict prices for the competition test set with the refitted model.
X_test_transformed = pipeline.transform(X_test_r)
y_test_pred = best_model.predict(X_test_transformed)

## submission

In [None]:
# Two-column submission: the test ids alongside the predicted prices (row order preserved).
submission = test_capped[['id']].assign(price=y_test_pred)

submission.to_csv('submission.csv', index=False)
print("submission.csv file created successfully.")