In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
train=pd.read_csv("C://Users//pedda//Downloads//train_JDXlpm8//train//train.csv")
test=pd.read_csv("C://Users//pedda//Downloads//test_8gqdJqH.csv")
transactions=pd.read_csv("C://Users//pedda//Downloads//train_JDXlpm8//train//transactions.csv")

In [5]:
for df in [train, test, transactions]:
    df['doj'] = pd.to_datetime(df['doj'])
transactions['doi'] = pd.to_datetime(transactions['doi'])


In [7]:
trans_15 = transactions[transactions['dbd'] == 15]


In [9]:
merge_cols = ['doj', 'srcid', 'destid']
train = train.merge(trans_15, on=merge_cols, how='left')
test = test.merge(trans_15, on=merge_cols, how='left')


In [11]:
def engineer_features(df):
    df['doj_dayofweek'] = df['doj'].dt.dayofweek
    df['doj_month'] = df['doj'].dt.month
    df['doj_is_weekend'] = df['doj_dayofweek'].isin([5, 6]).astype(int)
    df['seat_to_search'] = df['cumsum_seatcount'] / (df['cumsum_searchcount'] + 1)
    df['log_seatcount'] = np.log1p(df['cumsum_seatcount'])
    df['log_searchcount'] = np.log1p(df['cumsum_searchcount'])
    df['tier_diff'] = abs(df['srcid_tier'].astype('category').cat.codes - df['destid_tier'].astype('category').cat.codes)
    df['same_region'] = (df['srcid_region'] == df['destid_region']).astype(int)
    return df

train = engineer_features(train)
test = engineer_features(test)


In [13]:
features = [
    'cumsum_seatcount', 'cumsum_searchcount', 'seat_to_search',
    'log_seatcount', 'log_searchcount',
    'doj_dayofweek', 'doj_month', 'doj_is_weekend',
    'tier_diff', 'same_region'
]

X = train[features]
y = train['final_seatcount']


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)


In [27]:
param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [30],
    'min_samples_split': [10],
    'min_samples_leaf': [4],
    'max_features': ['sqrt']
}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_
print("Best Params:", best_params)




Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best Params: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 30}


In [28]:
final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X, y)  # Train on full training data


In [29]:
y_val_pred = final_model.predict(X_val)
val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print(f"Validation RMSE: {val_rmse:.2f}")


Validation RMSE: 638.34




In [30]:
y_train_pred=final_model.predict(X_train)
train_rmse=mean_squared_error(y_train, y_train_pred, squared=False)
print(f'Validation train RMSE: {train_rmse}')

Validation train RMSE: 538.2215173683633




In [35]:
# Predict on test set
test['final_seatcount'] = final_model.predict(test[features])
test['final_seatcount'] = test['final_seatcount'].round()

# Save submission
submission = test[['route_key', 'final_seatcount']]
submission.to_csv("submission_f1.csv", index=False)

print("Submission file created: submission_file.csv")


Submission file created: submission_file.csv
