<a href="https://colab.research.google.com/github/Shrutakeerti/Forecasting-and-Scheduling-of-Railway-Racks/blob/main/Forecasting_and_Scheduling_of_railway_racks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd

# Path of the training CSV (a Colab-style /content location)
file_path = '/content/train_revised.csv'

# Read the whole CSV into the working DataFrame used by the later cells
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to check the columns and value formats
data.head(n=5)


Unnamed: 0,ride_id,seat_number,payment_method,payment_receipt,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,1442,15A,Mpesa,UZUEHCBUSO,17-10-17,7:15,Migori,Nairobi,Bus,49
1,5437,14A,Mpesa,TIHLBUSGTE,19-11-17,7:12,Migori,Nairobi,Bus,49
2,5710,8B,Mpesa,EQX8Q5G19O,26-11-17,7:05,Keroka,Nairobi,Bus,49
3,5777,19A,Mpesa,SGP18CL0ME,27-11-17,7:10,Homa Bay,Nairobi,Bus,49
4,5778,11A,Mpesa,BM97HFRGL9,27-11-17,7:12,Migori,Nairobi,Bus,49


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# Build the model-ready frame from a copy of the raw `data` frame.
# Mutating `data` in place made this cell non-re-runnable: a second run failed
# with KeyError because 'travel_date' / 'travel_time' had already been dropped,
# and the categorical columns would have been re-encoded.
data_prepared = data.copy()

# Parse travel_date (day-month-two-digit-year) once and derive calendar features
travel_date = pd.to_datetime(data_prepared['travel_date'], format='%d-%m-%y')
data_prepared['day'] = travel_date.dt.day
data_prepared['month'] = travel_date.dt.month
data_prepared['year'] = travel_date.dt.year
data_prepared['weekday'] = travel_date.dt.weekday  # pandas convention: Monday=0 ... Sunday=6

# Integer-encode the categorical columns: `travel_from`, `travel_to`,
# `car_type` and `payment_method`.
# NOTE(review): the encoders are fitted on the full dataset before the
# train/test split — confirm this is acceptable for the intended evaluation.
label_encoders = {}
for column in ['travel_from', 'travel_to', 'car_type', 'payment_method']:
    le = LabelEncoder()
    data_prepared[column] = le.fit_transform(data_prepared[column])
    label_encoders[column] = le  # Store encoder for potential inverse transforms

# Parse travel_time (HH:MM) once and reuse it for both the hour and the minute
travel_time = pd.to_datetime(data_prepared['travel_time'], format='%H:%M')
data_prepared['travel_hour'] = travel_time.dt.hour
data_prepared['travel_minute'] = travel_time.dt.minute

# Drop identifier columns and the raw date/time strings now replaced by the
# derived numeric features
data_prepared = data_prepared.drop(
    columns=['ride_id', 'seat_number', 'payment_receipt', 'travel_time', 'travel_date']
)

# Split into features (X) and target (y).
# NOTE(review): `max_capacity` appears to be a fixed attribute of the vehicle
# type rather than observed demand — in the sample output of this cell it is 49
# for one encoded `car_type` and 11 for the other. If that holds for the whole
# dataset, the target is fully determined by `car_type` and any model will
# score a trivial R^2 of 1.0. Confirm the intended demand target (presumably
# something like tickets sold per `ride_id`) before relying on the results.
X = data_prepared.drop(columns=['max_capacity'])
y = data_prepared['max_capacity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head(), y_train.head()


(       payment_method  travel_from  travel_to  car_type  day  month  year  \
 7023                1            7          0         1   27      2  2018   
 30732               1            7          0         0   17     12  2017   
 7374                1            7          0         1   26      2  2018   
 13062               1            9          0         0    4      1  2018   
 23732               1            7          0         1   12      1  2018   
 
        weekday  travel_hour  travel_minute  
 7023         1           10              0  
 30732        6            7              7  
 7374         0            8              0  
 13062        3            7             13  
 23732        4            8             40  ,
 7023     11
 30732    49
 7374     11
 13062    49
 23732    11
 Name: max_capacity, dtype: int64)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Reload the raw CSV so this cell does not depend on frames built by earlier cells
file_path = '/content/train_revised.csv'
data = pd.read_csv(file_path)

# Parse travel_date (day-month-two-digit-year) once, store the parsed values
# back and append the calendar features derived from them
parsed_date = pd.to_datetime(data['travel_date'], format='%d-%m-%y')
data = data.assign(
    travel_date=parsed_date,
    day=parsed_date.dt.day,
    month=parsed_date.dt.month,
    year=parsed_date.dt.year,
    weekday=parsed_date.dt.weekday,
)

# Integer-encode the four categorical columns, keeping one fitted encoder per
# column so the codes can be mapped back to the original labels later
categorical_columns = ['travel_from', 'travel_to', 'car_type', 'payment_method']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Parse travel_time (HH:MM) once and split it into hour and minute features
parsed_time = pd.to_datetime(data['travel_time'], format='%H:%M')
data = data.assign(
    travel_hour=parsed_time.dt.hour,
    travel_minute=parsed_time.dt.minute,
)

# Remove identifiers and the raw date/time columns replaced by the features above
unused_columns = ['ride_id', 'seat_number', 'payment_receipt', 'travel_time', 'travel_date']
data = data.drop(columns=unused_columns)

# Target is `max_capacity`; every remaining column is a feature.
# NOTE(review): `max_capacity` looks like it is determined by `car_type`
# (the notebook's sample rows show 49 vs 11 per encoded car type), which would
# make the near-zero error reported by this cell trivial — confirm the target.
y = data['max_capacity']
X = data.drop(columns=['max_capacity'])

# 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the XGBoost regressor on the training split
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score the held-out split: MSE, its square root (RMSE) and R^2
y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}, Root Mean Squared Error: {rmse}, R^2 Score: {r2}")


Mean Squared Error: 2.4374821937473997e-07, Root Mean Squared Error: 0.0004937086381406953, R^2 Score: 1.0


In [8]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Hyperparameters for the XGBoost regressor, collected in one place
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': 42,
}

# Build the regressor and fit it on the training split created by an earlier cell
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = xgb_model.predict(X_test)

# Error metrics on the test split: MSE, RMSE (square root of MSE) and R^2
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Display the three metrics as the cell's output
mse, rmse, r2


(2.4374821937473997e-07, 0.0004937086381406953, 1.0)

In [9]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Candidate hyperparameter values; the grid search tries every combination
param_grid = dict(
    n_estimators=[100, 200, 300],    # number of boosted trees
    learning_rate=[0.01, 0.1, 0.2],  # shrinkage applied to each boosting step
    max_depth=[3, 5, 7],             # maximum depth of each tree
    subsample=[0.8, 1],              # fraction of rows sampled per tree
    colsample_bytree=[0.8, 1],       # fraction of features sampled per tree
)

# Unfitted base estimator that the search clones for every candidate.
# Note that this rebinds `xgb_model`, replacing any previously fitted model.
xgb_model = XGBRegressor(random_state=42)

# Exhaustive search scored by R^2 with 3-fold cross-validation on the training
# split; n_jobs=-1 uses all available CPU cores
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score.
# NOTE(review): a score of exactly 1.0 here would suggest the target can be
# derived directly from one of the features — worth confirming.
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validated R^2 Score:", best_score)


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best Cross-Validated R^2 Score: 1.0


In [10]:
import joblib
# Persist the best estimator found by the grid search to the working directory.
# joblib.dump returns the list of written file names, which is shown as the
# cell's output.
# NOTE(review): joblib files are pickle-based; only load them from trusted sources.
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'xgboost_model.pkl')


['xgboost_model.pkl']