In [24]:
# Step 1. Importing libraries and reading the dataset

# Standard library
import os

# Third-party libraries
import numpy as np
import pandas as pd
# `plt` has to be the pyplot submodule: `import matplotlib as plt` binds the
# top-level package, which does not expose plot()/subplots()/show().
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Location of the training CSV. The default is the original author's local path;
# set the TRAINING_DATA_PATH environment variable to run the notebook elsewhere
# (e.g. TRAINING_DATA_PATH=/data/training_data_fall2024.csv) without editing code.
DEFAULT_DATA_PATH = "D:/Data Science and Data Engineering/Semester 1/Period 2/Statistical Machine Learning/SML_Project/training_data_fall2024.csv"
# `or` also falls back to the default when the variable is set but empty.
file_path = os.environ.get("TRAINING_DATA_PATH") or DEFAULT_DATA_PATH

# Load the full dataset; the bare name on the last line renders it as the cell output.
data = pd.read_csv(file_path)
data

Unnamed: 0,hour_of_day,day_of_week,month,holiday,weekday,summertime,temp,dew,humidity,precip,snow,snowdepth,windspeed,cloudcover,visibility,increase_stock
0,5,5,1,0,0,0,-7.2,-15.0,53.68,0.000,0,0.0,16.3,31.6,16.0,low_bike_demand
1,21,4,1,0,1,0,-1.3,-12.8,40.97,0.000,0,0.0,23.9,85.7,16.0,low_bike_demand
2,21,3,8,0,1,1,26.9,21.8,73.39,0.000,0,0.0,0.0,81.1,16.0,low_bike_demand
3,1,6,1,0,0,0,3.1,-4.0,59.74,0.000,0,0.0,19.2,0.0,16.0,low_bike_demand
4,17,0,3,0,1,0,11.7,-11.4,18.71,0.000,0,0.0,10.5,44.6,16.0,low_bike_demand
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,3,5,6,0,0,1,21.5,19.4,87.68,0.000,0,0.0,10.6,24.4,16.0,low_bike_demand
1596,14,0,6,0,1,1,23.2,20.1,82.43,2.217,0,0.0,9.8,92.1,10.4,low_bike_demand
1597,13,0,3,0,1,1,13.9,-2.2,32.93,0.000,0,2.0,18.2,79.3,16.0,low_bike_demand
1598,14,5,3,0,0,1,11.7,-9.3,22.09,0.000,0,0.0,5.8,24.4,16.0,high_bike_demand


In [13]:
# Step 2. Data pre-processing

# Count the missing (null) entries in every column of the dataset.
missing_values = data.isnull().sum()

# Print the per-column counts.
print(missing_values)

# Call out the columns that actually contain missing values. A bare expression in
# the middle of a cell is evaluated and discarded, so the filtered result has to
# be printed explicitly; nothing extra is shown when the data is complete.
columns_with_missing = missing_values[missing_values > 0]
if not columns_with_missing.empty:
    print("Columns with missing values:")
    print(columns_with_missing)


hour_of_day       0
day_of_week       0
month             0
holiday           0
weekday           0
summertime        0
temp              0
dew               0
humidity          0
precip            0
snow              0
snowdepth         0
windspeed         0
cloudcover        0
visibility        0
increase_stock    0
dtype: int64


In [14]:
# Step 3. Handling categorical data

from sklearn.preprocessing import LabelEncoder

# Turn the string target classes into integer codes.
# NOTE: the encoded values overwrite data['increase_stock'] in place, so running
# this cell a second time without reloading `data` would fit the encoder on the
# integer codes instead of the original class names.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(data['increase_stock'])
data['increase_stock'] = encoded_target

# Show which integer code was assigned to each class name.
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
label_mapping = {name: code for name, code in zip(class_names, class_codes)}
print(label_mapping)

{'high_bike_demand': 0, 'low_bike_demand': 1}


In [15]:
# Step 4. Dependent and independent variables

# The encoded 'increase_stock' column is the target; every other column is a feature.
target_column = 'increase_stock'
y = data[target_column]
X = data.drop(target_column, axis=1)

# Sanity check: X is (rows, features) and y is (rows,).
print(X.shape,y.shape)

(1600, 15) (1600,)


In [16]:
# Step 5. Splitting the dataset into training and testing sets

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Shapes in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in split_parts))


(1280, 15) (320, 15) (1280,) (320,)


In [17]:
# Step 6. Implementing a Random Forest classifier

# Baseline forest with default hyper-parameters; fit() returns the fitted estimator.
rf_classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Pair each feature name with the importance the fitted forest assigned to it.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'importance': rf_classifier.feature_importances_,
})

# Most important features first.
feature_importances = importance_table.sort_values('importance', ascending=False)
print(feature_importances)

        Feature  importance
0   hour_of_day    0.235230
6          temp    0.161585
8      humidity    0.157025
7           dew    0.101513
12    windspeed    0.090693
13   cloudcover    0.080448
2         month    0.049208
1   day_of_week    0.048530
5    summertime    0.023704
4       weekday    0.020406
14   visibility    0.016012
9        precip    0.010106
3       holiday    0.005083
11    snowdepth    0.000459
10         snow    0.000000


In [18]:
# Step 7. Predicting test cases with the Random Forest

# Predicted class codes for every row of the held-out test set.
y_pred = rf_classifier.predict(X_test)

# Preview the first 20 predictions as the cell output.
y_pred[0:20]

array([1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1])

In [19]:
# Step 8. Checking the accuracy score

# Fraction of test rows whose predicted class matches the true class.
accuracy = accuracy_score(y_test, y_pred)

# Per-class precision/recall/F1, labelled with the original class names
# recovered from the fitted label encoder.
class_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)

# Accuracy first, then the report.
print(accuracy, class_report)

0.86875                   precision    recall  f1-score   support

high_bike_demand       0.68      0.52      0.59        58
 low_bike_demand       0.90      0.95      0.92       262

        accuracy                           0.87       320
       macro avg       0.79      0.73      0.76       320
    weighted avg       0.86      0.87      0.86       320



In [20]:
# Step 9. Random search to tune the Random Forest

# Candidate values for each hyper-parameter; the search samples combinations from these lists.
pram_dist = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search: 20 sampled combinations, each scored by 3-fold
# cross-validated accuracy, using all available cores. Both the forest and the
# sampler are seeded so the search is repeatable.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_distributions=pram_dist,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=0
)

# Run the search on the training split only, so the test set stays untouched.
random_search.fit(X_train,y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:",random_search.best_params_)
print("Best Cross Validation Score:",random_search.best_score_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None}
Best Cross Vadidation Score: 0.8937596434710265


In [21]:
# Step 10. Adding an engineered feature and re-checking Random Forest performance

# New weather feature: the gap between temperature and dew point.
# This adds the column to X itself, so X has one more feature from here on.
X['temp_dew_diff'] = X['temp'].sub(X['dew'])

# Redo the split so the training and testing frames include the new column
# (same test_size and seed as the earlier split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Random Forest with a fixed, hand-picked set of hyper-parameters.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=0,
)
predefined_rf = RandomForestClassifier(**rf_params)

# 5-fold cross-validated accuracy on the training split.
cv_score = cross_val_score(predefined_rf, X_train, y_train, cv=5, scoring='accuracy')

# Mean and (population) standard deviation across the five folds.
cv_mean = cv_score.mean()
cv_std = cv_score.std()

print(cv_mean, cv_std)

0.89375 0.011428702217443588


In [22]:
# Step 11. Confusion matrix of true labels (y_test) against predicted labels (y_pred)

# NOTE: y_pred was produced in Step 7 by the baseline rf_classifier; y_test was
# re-created in Step 10 with the same test_size and random_state as the Step 5 split.
cm = confusion_matrix(y_test, y_pred)

# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:", cm, sep="\n")


Confusion Matrix:
[[ 30  28]
 [ 14 248]]
