In [1]:
import pandas as pd
df=pd.read_csv(r"C:\Users\rujha\OneDrive\Desktop\skyhack\test_Ritvik Kumar_Rujhan N Sharma.csv")

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Defining Target (y) and Features (X)
target_column = 'departure_delay_minutes'

# These are all numerical features that should be known before departure
feature_columns = [
    'total_seats', 
    'scheduled_ground_time_minutes', 
    'minimum_turn_minutes',
    'TOTAL_BAGS', 
    'TRANSFER_BAGGAGE', 
    'ORIGIN_BAGGAGE',
    'TOTAL_PASSENGERS', 
    'TOTAL_LAP_CHILD', 
    'TOTAL_BASIC_ECONOMY',
    'TOTAL_STROLLER_USERS', 
    'AVG_ADVANCE_BOOKING', 
    'LAST_MOMENT_BOOKINGS',
    'PEOPLE_OPTED_FOR_AIRPORT_WHEELCHAIR',
    'PEOPLE_OPTED_FOR_UNACCOMPANIED_MINOR',
    'PEOPLE_OPTED_FOR_MANUAL_WHEELCHAIR',
    'PEOPLE_OPTED_FOR_ELECTRIC_WHEELCHAIR'
]

# Creating and Cleaning X and y
# Filling any missing values with 0. 
# A simple way to handle NaNs for tree models.
X = df[feature_columns].fillna(0)
y = df[target_column].fillna(0)

# Creating the Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  # 'random_state' is a seed for a random number generator
)

# Training Random Forest 
print("--- Random Forest ---")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("R2 score:", r2_score(y_test, y_pred_rf))
print("MSE:", mean_squared_error(y_test, y_pred_rf))


# Training XGBoost 
print("\n--- XGBoost ---")
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("R2 score:", r2_score(y_test, y_pred_xgb))
print("MSE:", mean_squared_error(y_test, y_pred_xgb))

--- Random Forest ---
R2 score: 0.3559230837287658
MSE: 2958.3524547200786

--- XGBoost ---
R2 score: 0.3069514036178589
MSE: 3183.28759765625


In [None]:
# Extracting new time-based features
df['scheduled_departure_datetime_local']=pd.to_datetime(df['scheduled_departure_datetime_local'], dayfirst=True)
df['departure_hour'] = df['scheduled_departure_datetime_local'].dt.hour
df['departure_day_of_week'] = df['scheduled_departure_datetime_local'].dt.dayofweek # 0=Monday, 6=Sunday
df['departure_month'] = df['scheduled_departure_datetime_local'].dt.month

# One-Hot Encoding Categorical Features
# This will create new columns 
categorical_cols = ['scheduled_departure_station_code']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dummy_na=True)

# Defining the NEW Feature List (X) ---
# Starting with the original numerical features
feature_columns = [
    'total_seats', 
    'scheduled_ground_time_minutes', 
    'minimum_turn_minutes',
    'TOTAL_BAGS', 
    'TRANSFER_BAGGAGE', 
    'ORIGIN_BAGGAGE',
    'TOTAL_PASSENGERS', 
    'TOTAL_LAP_CHILD', 
    'TOTAL_BASIC_ECONOMY',
    'TOTAL_STROLLER_USERS', 
    'AVG_ADVANCE_BOOKING', 
    'LAST_MOMENT_BOOKINGS',
    'PEOPLE_OPTED_FOR_AIRPORT_WHEELCHAIR',
    'PEOPLE_OPTED_FOR_UNACCOMPANIED_MINOR',
    'PEOPLE_OPTED_FOR_MANUAL_WHEELCHAIR',
    'PEOPLE_OPTED_FOR_ELECTRIC_WHEELCHAIR'
]

# Add our new engineered features
new_time_features = ['departure_hour', 'departure_day_of_week', 'departure_month']

# finding all the new columns created during one-hot encoding
encoded_station_columns = [col for col in df_encoded.columns if 'scheduled_departure_station_code_' in col]  # list comprehension

# Combine all features into one big list
all_features = feature_columns + new_time_features + encoded_station_columns

# Creating the final X matrix from the encoded DataFrame
# Selects all the columns from 'df_encoded' DataFrame whose names are in that 'all_features' list.
X = df_encoded[all_features].fillna(0)

# Create the Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train Random Forest ---
print("--- Random Forest (with Feature Engineering) ---")
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) # n_jobs=-1 speeds it up
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("R2 score:", r2_score(y_test, y_pred_rf))
print("MSE:", mean_squared_error(y_test, y_pred_rf))

# Train XGBoost  ---
print("\n--- XGBoost (with Feature Engineering) ---")
xgb_model = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("R2 score:", r2_score(y_test, y_pred_xgb))
print("MSE:", mean_squared_error(y_test, y_pred_xgb))

--- Random Forest (with Feature Engineering) ---
R2 score: 0.40188657325942057
MSE: 2747.2344987037036

--- XGBoost (with Feature Engineering) ---
R2 score: 0.3677566647529602
MSE: 2903.9990234375


In [4]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

print("\n--- KNN (with Feature Engineering) ---")

# Creating the model
knn_model=KNeighborsRegressor(n_neighbors=5, n_jobs=-1)

# Training the model
knn_model.fit(X_train, y_train)

# Making predictions
y_pred_knn= knn_model.predict(X_test)

# Evaluation
print("R2 score:", r2_score(y_test, y_pred_knn))
print("MSE:", mean_squared_error(y_test, y_pred_knn))



--- KNN (with Feature Engineering) ---
R2 score: 0.24655691182145045
MSE: 3460.6894814814814


In [5]:
from sklearn.linear_model import LinearRegression

# Training Linear Regression (with Feature Engineering) for comparison
# To prove that our data is complex and non-linear
print("\n--- Linear Regression (with Feature Engineering) ---")

# 1. Create the model
lr_model = LinearRegression()

# 2. Train the model
lr_model.fit(X_train, y_train)

# 3. Make predictions
y_pred_lr = lr_model.predict(X_test)

# 4. Evaluate the results
print("R2 score:", r2_score(y_test, y_pred_lr))
print("MSE:", mean_squared_error(y_test, y_pred_lr)) 


--- Linear Regression (with Feature Engineering) ---
R2 score: 0.05470331635292969
MSE: 4341.904971064648


In [None]:
# +++ Cross-Validation +++
from sklearn.model_selection import cross_val_score

rf_model_cv = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Running the 5-fold cross-validation
# cv=5 means 5 folds
# scoring=r2 specifies we want the R2 score
scores = cross_val_score(rf_model_cv, X, y, cv=5, scoring='r2')

# 3. Print the results
print("\n--- Random Forest Cross-Validation ---")
print("R2 Scores for each of the 5 folds:", scores)
print("Average R2 Score (Mean):", scores.mean())
print("Std Deviation of R2 Scores:", scores.std())


--- Random Forest Cross-Validation ---
R2 Scores for each of the 5 folds: [-0.14550944 -1.29976153  0.35224027  0.1984921  -0.12780573]
Average R2 Score (Mean): -0.20446886433350536
Std Deviation of R2 Scores: 0.5798831061754305


In [14]:
# Creating the X matrix (WITHOUT encoded columns)
all_features_stable = feature_columns + new_time_features
X_stable = df_encoded[all_features_stable].fillna(0)

# Running Cross-Validation on the model 
print("\n--- CV (No One-Hot Encoding) ---")
rf_model_cv_stable = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Run CV on the X_stable data
scores_stable = cross_val_score(rf_model_cv_stable, X_stable, y, cv=5, scoring='r2')

# 3. Print the results
print("R2 Scores (5 folds):", scores_stable)
print("Average R2 Score (Mean):", np.mean(scores_stable))
print("Std Deviation of R2 Scores:", np.std(scores_stable))


--- CV (No One-Hot Encoding) ---
R2 Scores (5 folds): [-0.15021178 -1.29466913  0.35445954  0.19454833 -0.12583343]
Average R2 Score (Mean): -0.20434129261256279
Std Deviation of R2 Scores: 0.577802660762618


In [None]:
from sklearn.ensemble import RandomForestRegressor

print("--- 1. Finding Feature Importances ---")

# Training a Model on ALL Stable Features ---
# We fit on the full X_stable and y to get the best importance ranking
rf_feature_finder = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_feature_finder.fit(X_stable, y)

# Displaying the Feature Importances 
print("\n--- Ranked Feature Importances ---")

# Creating a pandas Series to view the features and their scores
importances = pd.Series( # It combines the scores and the names into a single, labeled data structure
    rf_feature_finder.feature_importances_, 
    index=X_stable.columns # gets the names of the features
).sort_values(ascending=False)

# Printing the ranked list
print(importances)

# --- Creating the New 'X_top_features' DataFrame ---
# Selecting the top 10 features from the list
top_10_features = importances.head(10).index.tolist()
X_top_features = X_stable[top_10_features]

print(f"\n--- 2. Created 'X_top_features' with: {top_10_features} ---")

--- 1. Finding Feature Importances ---

--- Ranked Feature Importances ---
scheduled_ground_time_minutes           0.329414
TOTAL_PASSENGERS                        0.100748
departure_hour                          0.069387
TRANSFER_BAGGAGE                        0.062462
AVG_ADVANCE_BOOKING                     0.061881
departure_day_of_week                   0.057807
TOTAL_BAGS                              0.054386
LAST_MOMENT_BOOKINGS                    0.052823
ORIGIN_BAGGAGE                          0.047784
TOTAL_BASIC_ECONOMY                     0.037900
PEOPLE_OPTED_FOR_AIRPORT_WHEELCHAIR     0.027590
minimum_turn_minutes                    0.027197
TOTAL_STROLLER_USERS                    0.022405
total_seats                             0.019193
TOTAL_LAP_CHILD                         0.011662
PEOPLE_OPTED_FOR_MANUAL_WHEELCHAIR      0.008632
PEOPLE_OPTED_FOR_UNACCOMPANIED_MINOR    0.006766
PEOPLE_OPTED_FOR_ELECTRIC_WHEELCHAIR    0.001963
departure_month                         0.0

In [None]:
# ++++++++ Training the model with the new set of important features only +++++++++++

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


print(f"--- Training Random Forest on TOP 10 Features ---")
print(f"Features: {top_10_features}")
print("-" * 50)

# Creating a new Train/Test Split 
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_top_features, y, test_size=0.2, random_state=42
)

# Training Random Forest (Top 10 Features) 
rf_model_s = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model_s.fit(X_train_s, y_train_s)
y_pred_rf_s = rf_model_s.predict(X_test_s)

# Evaluating the Results
print(f"R2 score: {r2_score(y_test_s, y_pred_rf_s):.4f}")
print(f"MSE: {mean_squared_error(y_test_s, y_pred_rf_s):.4f}")

--- Training Random Forest on TOP 10 Features ---
Features: ['scheduled_ground_time_minutes', 'TOTAL_PASSENGERS', 'departure_hour', 'TRANSFER_BAGGAGE', 'AVG_ADVANCE_BOOKING', 'departure_day_of_week', 'TOTAL_BAGS', 'LAST_MOMENT_BOOKINGS', 'ORIGIN_BAGGAGE', 'TOTAL_BASIC_ECONOMY']
--------------------------------------------------
R2 score: 0.3901
MSE: 2801.2966


In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

print("--- (Checking for Stability) CV on Top 10 Features ---")

# Creating a fresh model instance
rf_model_cv_simple = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Run CV on the new 'X_top_features'
scores_simple = cross_val_score(rf_model_cv_simple, X_top_features, y, cv=5, scoring='r2')

# 3. Print the results
print("R2 Scores (5 folds):", scores_simple)
print("Average R2 Score (Mean):", np.mean(scores_simple))
print("Std Deviation of R2 Scores:", np.std(scores_simple))

--- (Checking for Stability) CV on Top 10 Features ---
R2 Scores (5 folds): [-0.15955042 -1.36531639  0.31027904  0.19155475 -0.16320183]
Average R2 Score (Mean): -0.23724697003577674
Std Deviation of R2 Scores: 0.5945949386356321
