In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import warnings

warnings.filterwarnings("ignore")

# Prepare Test Data (December only)

In [7]:
merged = pd.read_csv('/content/drive/MyDrive/CSCI 5502 Data Mining/01 Project/Data/01_Preprocessed_Data/02_Master_Analysis_Data_With_Temp_10_31_12_07.csv')
merged.drop(columns='Unnamed: 0', inplace=True)
print("Does the DataFrame have nulls?", merged.isna().any().any())
merged.head()

# Station Name and ID mapping
all_id_names= merged[['station_id','station_name']].value_counts().reset_index()
id=all_id_names.station_id.values
name= all_id_names.station_name.values

stations_name_mapping=dict()
for i in range(len(id)):
  stations_name_mapping[id[i]]= name[i]
stations_name_mapping

# Data
merged_df = merged

#Selecting top 10 popular stations
selected_stations = ['bcycle_boulder_3894', 'bcycle_boulder_2132', 'bcycle_boulder_2771','bcycle_boulder_4657','bcycle_boulder_1855','bcycle_boulder_2760','bcycle_boulder_2767','bcycle_boulder_4091','bcycle_boulder_2144','bcycle_boulder_2756']
df= merged_df.copy()
df= df[df['station_id'].isin(selected_stations)]

# Calculatig all bikes/docks available flag
df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)

# Select only the required columns (include month_rnd to split train data and remove it befor modelling)
req_cols= ['month_rnd','station_id','cu_class_status','all_bikes_avl_flag',
           'all_docks_avl_flag','day_of_week_rnd','hour_rnd','temperature_2m','precipitation_probability',
           'visibility','bike_wait_time'] #'snow_depth', 'snowfall', 'rain', 'docks_available', 'bikes_available'
df= df[req_cols]

# Function to assign categories based on bike_wait_time
def categorize_wait_time(time):
    if time < 20:
        return "Very Low"
    elif 20 <= time < 40:
        return "Low"
    elif 40 <= time < 60:
        return "High"
    else:
        return "Very High"

  # Apply the function to create a new column 'wait_time_category'
df['wait_time'] = df['bike_wait_time'].apply(categorize_wait_time)

# Label Encoding and creating a dictionary for station_id enocder mapping
columns_to_encode = ['cu_class_status', 'day_of_week_rnd', 'station_id']
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col + '_encoded'] = label_encoder.fit_transform(df[col])

mapping= df[['station_id','station_id_encoded']].value_counts().reset_index()
id= mapping.station_id.values
encoded= mapping.station_id_encoded.values
station_dict=dict()
for i in range(len(id)):
  station_dict[encoded[i]]= id[i]

df.drop(columns=columns_to_encode, inplace=True)

# Normalization
cols_to_normalize= ['temperature_2m','precipitation_probability','visibility']
scaler = MinMaxScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])


#split the Dec data for testing and drop unwanted columns
test_data= df[df['month_rnd'] == 12].drop(columns= ['month_rnd','bike_wait_time'])
test_x= test_data.drop(columns= ['wait_time'])
test_y= test_data[['wait_time']]

test_data.head()

Does the DataFrame have nulls? False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_bikes_avl_flag'] = (df['station_capacity'] == df['bikes_available']).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['all_docks_avl_flag'] = (df['station_capacity'] == df['docks_available']).astype(int)


Unnamed: 0,all_bikes_avl_flag,all_docks_avl_flag,hour_rnd,temperature_2m,precipitation_probability,visibility,wait_time,cu_class_status_encoded,day_of_week_rnd_encoded,station_id_encoded
488518,0,0,11,0.170579,0.0,0.309922,Very Low,0,2,0
488520,0,0,11,0.170579,0.0,0.309922,Very Low,0,2,4
488523,0,0,11,0.170579,0.0,0.309922,Very Low,0,2,3
488527,0,0,11,0.170579,0.0,0.309922,Very Low,0,2,2
488528,0,0,11,0.170579,0.0,0.309922,Very Low,0,2,1


# Load Best Model

In [6]:
model= pickle.load(open('/content/drive/MyDrive/CSCI 5502 Data Mining/01 Project/Model Pickle Files/Best_RF_Model.pkl', 'rb'))


# Testing on the December data

In [14]:
pred_y = model.predict(test_x)

print('-------------------Testing on the entire December dataset-------------------')
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(test_y, pred_y)))
print('\nConfusion Matrix:')
print(confusion_matrix(test_y, pred_y))
print('\nClassification Report:')
print(classification_report(test_y, pred_y))

print("\nAccuracy Split By Stations:")
test_data['pred']= pred_y

id=[]
name=[]
accuracy=[]

for i in test_data['station_id_encoded'].unique():
  data= test_data[test_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(station_dict[i])
  name.append(stations_name_mapping[station_dict[i]])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

-------------------Testing on the entire December dataset-------------------

Accuracy: 0.66


Confusion Matrix:
[[   0    0  543  705]
 [   0    0  952 2587]
 [   0    0 4475  915]
 [   0    0 1330 9174]]

Classification Report:
              precision    recall  f1-score   support

        High       0.00      0.00      0.00      1248
         Low       0.00      0.00      0.00      3539
   Very High       0.61      0.83      0.71      5390
    Very Low       0.69      0.87      0.77     10504

    accuracy                           0.66     20681
   macro avg       0.32      0.43      0.37     20681
weighted avg       0.51      0.66      0.57     20681


Accuracy Split By Stations


Unnamed: 0,Station_ID,Station_Name,Accuracy
8,bcycle_boulder_3894,Center for Community @ Regent Drive,0.751022
4,bcycle_boulder_2132,CU Recreation Center,0.722651
5,bcycle_boulder_2771,Williams Village,0.711826
9,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.702677
0,bcycle_boulder_1855,Folsom & Colorado,0.664213
7,bcycle_boulder_2767,18th & Colorado,0.661335
1,bcycle_boulder_2760,13th & College,0.620862
6,bcycle_boulder_4657,Farrand Field,0.604506
3,bcycle_boulder_2144,Broadway & Euclid,0.592063
2,bcycle_boulder_2756,Broadway & Baseline,0.587114


In [16]:
print("-------------------Testing on December data's Popular Time-------------------")

## Filters the data for popular times
filter_regular_class= (test_data['cu_class_status_encoded']==1)
filter_mrng_to_eve= (test_data['hour_rnd'] >= 9) & (test_data['hour_rnd'] <= 20)

filted_data= test_data[filter_regular_class & filter_mrng_to_eve]
actual= filted_data['wait_time']
predicted= filted_data['pred']

print('\nAccuracy: {:.2f}\n'.format(accuracy_score(actual, predicted)))
print('\nConfusion Matrix:')
print(confusion_matrix(actual, predicted))
print('\nClassification Report:')
print(classification_report(actual, predicted))

# 3. Check the accuracy of the December's popular time split by station IDs
print("Accuracy Split By Stations")
id=[]
name=[]
accuracy=[]

for i in filted_data['station_id_encoded'].unique():
  data= filted_data[filted_data["station_id_encoded"]==i]

  actual= data['wait_time']
  predicted= data['pred']
  acc= accuracy_score(actual, predicted)

  id.append(station_dict[i])
  name.append(stations_name_mapping[station_dict[i]])
  accuracy.append(acc)

accuracy_by_stations= pd.DataFrame({'Station_ID':id, 'Station_Name':name, 'Accuracy': accuracy})
accuracy_by_stations.sort_values(by='Accuracy', inplace=True, ascending=False)
display(accuracy_by_stations)

-------------------Testing on December data's Popular Time-------------------

Accuracy: 0.75


Confusion Matrix:
[[   0    0    3  269]
 [   0    0    7 1348]
 [   0    0   19  288]
 [   0    0    9 5712]]

Classification Report:
              precision    recall  f1-score   support

        High       0.00      0.00      0.00       272
         Low       0.00      0.00      0.00      1355
   Very High       0.50      0.06      0.11       307
    Very Low       0.75      1.00      0.86      5721

    accuracy                           0.75      7655
   macro avg       0.31      0.27      0.24      7655
weighted avg       0.58      0.75      0.64      7655

Accuracy Split By Stations


Unnamed: 0,Station_ID,Station_Name,Accuracy
8,bcycle_boulder_3894,Center for Community @ Regent Drive,0.89715
5,bcycle_boulder_2771,Williams Village,0.846248
3,bcycle_boulder_2132,CU Recreation Center,0.822002
4,bcycle_boulder_1855,Folsom & Colorado,0.819231
9,bcycle_boulder_4091,Timber Ridge @ Adams Circle,0.768551
6,bcycle_boulder_2767,18th & Colorado,0.760705
0,bcycle_boulder_2760,13th & College,0.683107
1,bcycle_boulder_2756,Broadway & Baseline,0.680247
2,bcycle_boulder_2144,Broadway & Euclid,0.652005
7,bcycle_boulder_4657,Farrand Field,0.530347
