In [23]:
import joblib
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [24]:
# Read the maintenance records into a DataFrame, decoding the file as ISO-8859-1
raw_csv_path = 'maintenancedata.csv'
df = pd.read_csv(raw_csv_path, encoding='ISO-8859-1')

In [25]:
# Print column names to verify
# Shows the header names exactly as they were decoded on load, so the keys of
# the rename mapping applied afterwards can be checked against them.
print("Columns in DataFrame:", df.columns)

Columns in DataFrame: Index(['Station Id', 'Date', 'Rented Bike Count', 'Hour', 'Temperature(è)',
       'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)',
       'Dew point temperature(è)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)',
       'Snowfall (cm)', 'Seasons', 'Holiday', 'Functioning Day',
       'maintenance_needed'],
      dtype='object')


In [26]:
# Give the columns identifier-friendly names: drop unit suffixes, replace
# spaces with underscores, and remove the stray "è" from the two temperature
# headers. Columns not listed here (e.g. 'Rented Bike Count') keep their names.
column_renames = {
    "Station Id": "Station_Id",
    "Functioning Day": "Functioning_Day",
    "maintenance_needed": "Maintenance_Needed",
    "Temperature(è)": "Temperature",
    "Dew point temperature(è)": "Dew_point_temperature",
    "Humidity(%)": "Humidity",
    "Wind speed (m/s)": "Wind_speed",
    "Visibility (10m)": "Visibility",
    "Solar Radiation (MJ/m2)": "Solar_Radiation",
    "Rainfall(mm)": "Rainfall",
    "Snowfall (cm)": "Snowfall",
}
df.rename(columns=column_renames, inplace=True)

In [27]:
# Convert the 'Date' column from day/month/year text into pandas datetimes
date_format = '%d/%m/%Y'
df['Date'] = pd.to_datetime(df['Date'], format=date_format)

In [28]:
# Print column names to verify
# Sanity check that the rename took effect before the columns are referenced
# by their new names in the cells below.
print(df.columns)

Index(['Station_Id', 'Date', 'Rented Bike Count', 'Hour', 'Temperature',
       'Humidity', 'Wind_speed', 'Visibility', 'Dew_point_temperature',
       'Solar_Radiation', 'Rainfall', 'Snowfall', 'Seasons', 'Holiday',
       'Functioning_Day', 'Maintenance_Needed'],
      dtype='object')


In [29]:
# Derive calendar features from the parsed 'Date' column.
# Each pair is (new column name, attribute of the .dt accessor to read).
calendar_parts = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
)
for feature_name, dt_attribute in calendar_parts:
    df[feature_name] = getattr(df['Date'].dt, dt_attribute)

In [30]:
# Replace each categorical column (including 'Station_Id') with one indicator
# column per category value; the new columns are named '<column>_<value>'.
categorical_columns = ['Seasons', 'Holiday', 'Functioning_Day', 'Station_Id']
df = pd.get_dummies(df, columns=categorical_columns)

In [31]:
# Assemble the model inputs group by group, then pick the target column.
weather_features = [
    'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility',
    'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
]
calendar_features = ['Year', 'Month', 'Day', 'DayOfWeek']
indicator_features = [
    'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
    'Holiday_Holiday', 'Holiday_No Holiday',
    'Functioning_Day_No', 'Functioning_Day_Yes',
]
# The station indicator columns depend on the data, so find them by prefix
station_id_features = [name for name in df.columns if name.startswith('Station_Id')]

features = weather_features + calendar_features + indicator_features + station_id_features

X = df[features]
y = df['Maintenance_Needed']

In [32]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [33]:
# Search space for the randomized hyperparameter search.
# randint(low, high) draws integers from low up to, but not including, high;
# a plain list is sampled uniformly from its entries.
param_dist = dict(
    n_estimators=randint(100, 300),
    max_depth=randint(10, 30),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 4),
    max_features=['sqrt', 'log2', None],
)

In [34]:
# Initialize the Random Forest Classifier
# This is the base estimator passed to the hyperparameter search below;
# random_state=42 seeds the forest so repeated runs are reproducible.
rf = RandomForestClassifier(random_state=42)

In [35]:
# Configure the randomized search over param_dist: 100 sampled candidates,
# each scored with 3-fold cross-validation (300 fits in total), run on all
# available cores (n_jobs=-1) with per-fit progress output (verbose=2).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)


In [36]:
# Fit RandomizedSearchCV to the training data
# Only the training split is used here; the validation split stays untouched
# until the final evaluation.
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [37]:
# Get the best parameters from the random search
# best_params_ is the sampled parameter setting with the best
# cross-validated score; it is reported alongside the metrics later.
best_params = random_search.best_params_

In [38]:
# Take the final model straight from the search instead of training it again.
# RandomizedSearchCV uses refit=True by default, so after fit() it has already
# retrained a clone of `rf` (same random_state=42) with the best parameters on
# the whole of X_train / y_train. Building and fitting a second forest with
# best_params would repeat that work and produce the same model.
rf_best = random_search.best_estimator_
rf_best

In [39]:
# Predict on the validation set
# X_val was held out by train_test_split and was not used during the search,
# so these predictions give an out-of-sample estimate.
y_pred = rf_best.predict(X_val)

In [40]:
# Score the validation predictions with each metric, in a fixed order
accuracy, precision, recall = (
    metric(y_val, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

# Report the chosen hyperparameters followed by the three scores
report_lines = [
    ("Best Hyperparameters: ", best_params),
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
]
for label, value in report_lines:
    print(label, value)

Best Hyperparameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 187}
Accuracy:  0.5228310502283106
Precision:  0.5112570356472795
Recall:  0.6337209302325582


In [41]:
# Serialize the fitted classifier with joblib. The target path is relative to
# the notebook's working directory; joblib.dump returns the list of files
# written, which is shown as the cell output.
model_path = '../API/bike_maintenance_model.pkl'
joblib.dump(rf_best, model_path)

['../API/bike_maintenance_model.pkl']