## Importing Packages

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import xgboost as xgb

## Load the Data

In [3]:
merged = pd.read_csv('02_Master_Analysis_Data_With_Temp_10_31_12_07.csv')
merged.drop(columns='Unnamed: 0', inplace=True)

## Preprocessing

In [4]:
merged_df = merged
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select month_rnd to split train data and remove it befor modelling
req_cols= ['month_rnd','station_id','cu_class_status','docks_available','bikes_available','all_bikes_avl_flag','all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability','rain','snowfall','snow_depth','visibility','bike_wait_time','dock_wait_time']
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 15:
        return "Very Low"
    elif 15 <= time < 30:
        return "Low"
    elif 30 <= time < 60:
        return "High"
    else:
        return "Very High"

# Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Display the updated DataFrame
display(df)

#One hot encoding
columns_to_encode = ['station_id','cu_class_status','day_of_week_rnd']
df = pd.get_dummies(df,columns = columns_to_encode)

#split the dec data for testing
train_data= df[df['month_rnd'] < 12]
test_data= df[df['month_rnd'] == 12]
print(train_data.shape,test_data.shape)


Unnamed: 0,month_rnd,station_id,cu_class_status,docks_available,bikes_available,all_bikes_avl_flag,all_docks_avl_flag,day_of_week_rnd,hour_rnd,temperature_2m,precipitation_probability,rain,snowfall,snow_depth,visibility,bike_wait_time,dock_wait_time,wait_time
0,10,bcycle_boulder_1855,Regular,15,0,0,1,Tuesday,0,1.8935,0,0.0,0,0.03,48700,84.0,366.0,Very High
19,10,bcycle_boulder_2756,Regular,7,6,0,0,Tuesday,0,1.8935,0,0.0,0,0.03,48700,384.0,15.0,Very High
21,10,bcycle_boulder_2144,Regular,15,2,0,0,Tuesday,0,1.8935,0,0.0,0,0.03,48700,444.0,21.0,Very High
22,10,bcycle_boulder_2132,Regular,18,0,0,1,Tuesday,0,1.8935,0,0.0,0,0.03,48700,54.0,63.0,High
25,10,bcycle_boulder_2760,Regular,12,1,0,0,Tuesday,0,1.8935,0,0.0,0,0.03,48700,288.0,9.0,Very High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592078,12,bcycle_boulder_2767,Regular,12,2,0,0,Thursday,14,10.1935,0,0.0,0,0.00,84000,3.0,6.0,Very Low
592079,12,bcycle_boulder_2771,Regular,19,2,0,0,Thursday,14,10.1935,0,0.0,0,0.00,84000,6.0,3.0,Very Low
592081,12,bcycle_boulder_4657,Regular,18,1,0,0,Thursday,14,10.1935,0,0.0,0,0.00,84000,3.0,6.0,Very Low
592082,12,bcycle_boulder_3894,Regular,16,0,0,1,Thursday,14,10.1935,0,0.0,0,0.00,84000,3.0,9.0,Very Low


(94044, 34) (20681, 34)


## Test Train Split 80:20

In [5]:

X_bike= train_data.drop(columns= ['bike_wait_time','dock_wait_time','month_rnd','wait_time'])
Y_bike = train_data[['wait_time']]

# Assuming X_bike contains input features and Y_bike contains output variable
X_train, X_test, y_train, y_test = train_test_split(X_bike, Y_bike, test_size=0.2)

## Random Forest

In [491]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np


# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print('------------------ Confusion Matrix -----------------\n')
print(confusion_matrix(y_test, y_pred))

print('-------------------- Classification Report --------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))


  if sys.path[0] == '':


------------------ Confusion Matrix -----------------

[[1577  502  348  348]
 [ 631 1105   54 1591]
 [ 316   40 5746   83]
 [ 456 1194  107 4711]]
-------------------- Classification Report --------------------

Accuracy: 0.70

              precision    recall  f1-score   support

        High       0.53      0.57      0.55      2775
         Low       0.39      0.33      0.36      3381
   Very High       0.92      0.93      0.92      6185
    Very Low       0.70      0.73      0.71      6468

    accuracy                           0.70     18809
   macro avg       0.63      0.64      0.64     18809
weighted avg       0.69      0.70      0.69     18809



### RF - Hyper parameter

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10, 15],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10]   # Minimum number of samples required to split a node
}

# Initialize the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters found by GridSearchCV
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best estimator found by GridSearchCV
best_clf = grid_search.best_estimator_

# Make predictions on the test set using the best estimator
y_pred = best_clf.predict(X_test)

# Evaluate the model
print('------------------ Confusion Matrix -----------------\n')
print(confusion_matrix(y_test, y_pred))

print('-------------------- Classification Report --------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

print(classification_report(y_test, y_pred))


### RF- Dec Data

In [492]:
# Test for December data
print()

test_x= test_data.drop(columns= ['bike_wait_time','dock_wait_time','month_rnd','wait_time'])
test_y= test_data[['wait_time']]

# Make predictions on the test set
predictions = clf.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(test_y, predictions))

print('-----------------------------------filter ------')
# Filters
filter_regular_class= (test_x['day_of_week_rnd_Monday']==1) | (test_x['day_of_week_rnd_Tuesday']==1) | (test_x['day_of_week_rnd_Wednesday']==1) | (test_x['day_of_week_rnd_Thursday']==1) | (test_x['day_of_week_rnd_Friday']==1)
filter_mrng_to_eve= (test_x['hour_rnd'] >= 9) & (test_x['hour_rnd'] <= 20)



test_data['pred'] = predictions
# Filter based on indexes- change as required
index= test_data[filter_regular_class & filter_mrng_to_eve].index
actual= test_data['wait_time'].loc[index]
predicted= test_data['pred'].loc[index]


# Evaluate the model
accuracy = accuracy_score(actual, predicted)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(actual, predicted))



Accuracy: 0.53
              precision    recall  f1-score   support

        High       0.17      0.06      0.09      2820
         Low       0.26      0.07      0.12      4050
   Very High       0.58      0.76      0.66      5390
    Very Low       0.56      0.76      0.65      8421

    accuracy                           0.53     20681
   macro avg       0.39      0.41      0.38     20681
weighted avg       0.45      0.53      0.47     20681

-----------------------------------filter ------
Accuracy: 0.58


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

        High       0.10      0.04      0.05       808
         Low       0.25      0.08      0.13      1829
   Very High       0.24      0.07      0.11       307
    Very Low       0.64      0.90      0.75      4711

    accuracy                           0.58      7655
   macro avg       0.31      0.27      0.26      7655
weighted avg       0.47      0.58      0.50      7655



## SVM

In [13]:
from sklearn.svm import SVC

# Define the SVM model with the best parameters
svm_best = SVC(C=1, gamma=0.1, kernel='rbf', random_state=42)  # Replace with best_params_svm

# Train the SVM model on the entire dataset
svm_best.fit(X_train, y_train)

# Make predictions on the test set
predictions = svm_best.predict(X_test)

# Evaluate the SVM model
print('------------------ SVM Confusion Matrix -----------------\n')
print(confusion_matrix(y_test, predictions))

print('-------------------- SVM Classification Report --------------------')
print('\nAccuracy (SVM): {:.2f}\n'.format(accuracy_score(y_test, predictions)))

print(classification_report(y_test, predictions))


  y = column_or_1d(y, warn=True)


------------------ SVM Confusion Matrix -----------------

[[1210  330  377  718]
 [ 599  480  122 2291]
 [ 333   44 5705  126]
 [ 419  609  191 5255]]
-------------------- SVM Classification Report --------------------

Accuracy (SVM): 0.67

              precision    recall  f1-score   support

        High       0.47      0.46      0.47      2635
         Low       0.33      0.14      0.19      3492
   Very High       0.89      0.92      0.91      6208
    Very Low       0.63      0.81      0.71      6474

    accuracy                           0.67     18809
   macro avg       0.58      0.58      0.57     18809
weighted avg       0.64      0.67      0.64     18809



### SVM Dec -data

In [15]:
# Test for December data
print()

test_x= test_data.drop(columns= ['bike_wait_time','dock_wait_time','month_rnd','wait_time'])
test_y= test_data[['wait_time']]

# Make predictions on the test set
predictions = svm_best.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(test_y, predictions))

print('-----------------------------------filter ------')
# Filters
filter_regular_class= (test_x['day_of_week_rnd_Monday']==1) | (test_x['day_of_week_rnd_Tuesday']==1) | (test_x['day_of_week_rnd_Wednesday']==1) | (test_x['day_of_week_rnd_Thursday']==1) | (test_x['day_of_week_rnd_Friday']==1)
filter_mrng_to_eve= (test_x['hour_rnd'] >= 9) & (test_x['hour_rnd'] <= 20)



test_data['pred'] = predictions
# Filter based on indexes- change as required
index= test_data[filter_regular_class & filter_mrng_to_eve].index
actual= test_data['wait_time'].loc[index]
predicted= test_data['pred'].loc[index]


# Evaluate the model
accuracy = accuracy_score(actual, predicted)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(actual, predicted))





Feature names unseen at fit time:
- pred
Feature names must be in the same order as they were in fit.



ValueError: could not convert string to float: 'Very High'

## XGboost

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Define the Gradient Boosting Classifier model with the best parameters
gb_best = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)  # Replace with best_params_gb

# Train the Gradient Boosting Classifier model on the entire dataset
gb_best.fit(X_train, y_train)

# Make predictions on the test set
predictions = gb_best.predict(X_test)

# Evaluate the Gradient Boosting Classifier model
print('------------------ Gradient Boosting Confusion Matrix -----------------\n')
print(confusion_matrix(y_test, predictions))

print('-------------------- Gradient Boosting Classification Report --------------------')
print('\nAccuracy (Gradient Boosting): {:.2f}\n'.format(accuracy_score(y_test, predictions)))

print(classification_report(y_test, predictions))


  y = column_or_1d(y, warn=True)


------------------ Gradient Boosting Confusion Matrix -----------------

[[ 364  108  925 1238]
 [ 246  127  583 2536]
 [ 173   22 5661  352]
 [ 174  121  597 5582]]
-------------------- Gradient Boosting Classification Report --------------------

Accuracy (Gradient Boosting): 0.62

              precision    recall  f1-score   support

        High       0.38      0.14      0.20      2635
         Low       0.34      0.04      0.07      3492
   Very High       0.73      0.91      0.81      6208
    Very Low       0.57      0.86      0.69      6474

    accuracy                           0.62     18809
   macro avg       0.51      0.49      0.44     18809
weighted avg       0.55      0.62      0.55     18809



### XG dec data

In [11]:
# Test for December data
print()

test_x= test_data.drop(columns= ['bike_wait_time','dock_wait_time','month_rnd','wait_time'])
test_y= test_data[['wait_time']]

# Make predictions on the test set
predictions = gb_best.predict(test_x)

# Evaluate the model
accuracy = accuracy_score(test_y, predictions)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(test_y, predictions))

print('-----------------------------------filter ------')
# Filters
filter_regular_class= (test_x['day_of_week_rnd_Monday']==1) | (test_x['day_of_week_rnd_Tuesday']==1) | (test_x['day_of_week_rnd_Wednesday']==1) | (test_x['day_of_week_rnd_Thursday']==1) | (test_x['day_of_week_rnd_Friday']==1)
filter_mrng_to_eve= (test_x['hour_rnd'] >= 9) & (test_x['hour_rnd'] <= 20)



test_data['pred'] = predictions
# Filter based on indexes- change as required
index= test_data[filter_regular_class & filter_mrng_to_eve].index
actual= test_data['wait_time'].loc[index]
predicted= test_data['pred'].loc[index]


# Evaluate the model
accuracy = accuracy_score(actual, predicted)
print(f"Accuracy: {accuracy:.2f}")

# View a detailed classification report
print(classification_report(actual, predicted))


Accuracy: 0.56
              precision    recall  f1-score   support

        High       0.26      0.06      0.10      2820
         Low       0.26      0.04      0.07      4050
   Very High       0.56      0.87      0.68      5390
    Very Low       0.60      0.78      0.68      8421

    accuracy                           0.56     20681
   macro avg       0.42      0.44      0.38     20681
weighted avg       0.47      0.56      0.48     20681

-----------------------------------filter ------
Accuracy: 0.61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


              precision    recall  f1-score   support

        High       0.22      0.01      0.01       808
         Low       0.29      0.02      0.03      1829
   Very High       0.33      0.07      0.11       307
    Very Low       0.62      0.99      0.76      4711

    accuracy                           0.61      7655
   macro avg       0.37      0.27      0.23      7655
weighted avg       0.49      0.61      0.48      7655



Task for niranjan
* do hyperparamter tuning for all the models
* try to normalize some parameters - i removed this  
* try PCA 
* avlothan
* SVM performs better without anything but it is taking so much time to train