In [None]:
# Step 1.Importing Libraries #
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import *
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [None]:
# Step 2.Reading Dataset #
# Load the training table from disk
file_path = 'training_data_fall2024.csv'
data = pd.read_csv(file_path)
# Replace the column at position 15 (presumably the 'increase_stock' target) with
# integer codes; fitting and transforming happen in a single call.
label_encoder = preprocessing.LabelEncoder()
data[data.columns[15]] = label_encoder.fit_transform(data[data.columns[15]])
# Keep a dedicated handle on the fitted encoder so the raw labels stay recoverable
# even if `label_encoder` is rebound later.
increase_stock_encoder = label_encoder
print(data)

In [None]:
# Step 3.Data Pre-Processing Step #
# Count the missing values in every column
missing_values = data.isnull().sum()
# Show only the columns that actually contain missing values. The filtered result
# must be printed explicitly: a bare expression that is not the last line of a cell
# is evaluated and then discarded, so nothing would be displayed.
print("Columns with missing values:")
print(missing_values[missing_values > 0])
# Full per-column missing-value counts
print(missing_values)

In [None]:
# Step 4. Handling Categorical Data #
# Step 2 already label-encodes column 15, which its comment identifies as
# 'increase_stock'. Fitting a fresh LabelEncoder on those integer codes would only
# learn an identity mapping (e.g. {0: 0, 1: 1}) and hide the original category names,
# so reuse the encoder that was fitted on the raw labels instead.
label_encoder = increase_stock_encoder
# Encode only when the column still holds raw (non-numeric) labels; this also keeps
# the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['increase_stock']):
    data['increase_stock'] = label_encoder.transform(data['increase_stock'])
# Check the mapping of the original categories to their integer codes
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(label_mapping)

In [None]:
# Step 5. Dependent and independent variables #
# Target (y): the 'increase_stock' column
y = data['increase_stock']
# Features (X): every remaining column
X = data.drop('increase_stock', axis=1)
# Report the dimensions of both pieces
print(X.shape, y.shape)

In [None]:
# Step 6. Splitting dataset into training and testing set #
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)
# Report the dimensions of the four resulting pieces
print(*(subset.shape for subset in (X_train, X_test, y_train, y_test)))

In [None]:
# Step 7. Implementing a Random Forest Classifier #
# Fit a random forest with default hyper-parameters and a fixed seed
# (fit() returns the estimator itself, so construction and training can be chained)
rf_classifier = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Pair each feature name with its learned importance for interpretability
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'importance': rf_classifier.feature_importances_}
)
# Most important features first
feature_importances = feature_importances.sort_values('importance', ascending=False)
print(feature_importances)

In [None]:
# Step 8. Predicting testcases using Random Forest #
# Predicted class for every row of the held-out test set
y_pred = rf_classifier.predict(X_test)
# Preview the first ten predictions (shown as the cell output)
y_pred[0:10]

In [None]:
# Step 9. Checking accuracy score #
# Fraction of test rows whose predicted class matches the true class
accuracy = accuracy_score(y_test, y_pred)
# Take the class names from the encoder that was fitted on the raw labels (Step 2),
# so every name lines up with its integer code. LabelEncoder assigns codes in sorted
# label order, so hard-coded names can silently end up attached to the wrong class.
target_names = [str(name) for name in increase_stock_encoder.classes_]
# Passing `labels` explicitly keeps names and codes aligned even if one class
# happens to be absent from this particular test split.
class_report = classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(target_names)),
    target_names=target_names,
)
# Print the report on its own lines so its column header stays aligned
print(f"Accuracy: {accuracy}")
print(class_report)

In [None]:
# Step 10.Random search to tune the Random Forest Method #
# Float copy of the whole table: every column but the last becomes X, the last one y
data_array = np.array(data, dtype=float)
X, y = data_array[:, :-1], data_array[:, -1]
# Candidate hyper-parameter values the search samples from
pram_dist = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Randomized search: 20 sampled settings, each scored by accuracy with 3-fold CV
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    pram_dist,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)
# The search is fitted on X_train / y_train as they exist at this point;
# the X and y built above are not used in this cell.
random_search.fit(X_train, y_train)
# Report the winning setting and its mean cross-validated score
print("Best Parameters:", random_search.best_params_)
print("Best Cross Vadidation Score:", random_search.best_score_)

In [None]:
# Step 11. Inserting or creating new Feature to test the Random Forests performance #
# Rebuild the feature table from data_array (created in Step 10) rather than from X.
# X is overwritten at the end of this section, so wrapping X itself would fail on a
# re-run: it would then hold one more column than data.columns[:-1] has names for.
X = pd.DataFrame(data_array[:, :-1], columns=data.columns[:-1])
y = data_array[:, -1]

# New feature: difference between the 'temp' and 'dew' columns
X['temp_dew_diff'] = X['temp'] - X['dew']

# Updating the training and testing data with the new feature
# (converted back to a NumPy array before splitting)
X = X.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize the Random Forest with predefined parameters
predefined_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=0,
)
# Perform 5-fold cross validation on the training portion only
cv_score = cross_val_score(predefined_rf, X_train, y_train, cv=5, scoring='accuracy')
# Mean and standard deviation of the cross validation scores
cv_mean = np.mean(cv_score)
cv_std = np.std(cv_score)
print(cv_mean, cv_std)

In [None]:
# Step 12. Assuming y_test contains the true labels and y_pred contains the predicted labels #
# NOTE(review): y_pred was produced in Step 8, while y_test was last assigned by the
# split in Step 11 -- confirm both still refer to the same rows.
cm = confusion_matrix(y_test, y_pred)
# Heading and matrix on separate lines
print("Confusion Matrix:", cm, sep="\n")

In [None]:
# Naive Classification Model
# Extract the target variable.
# data['increase_stock'] holds integer codes at this point (it was label-encoded
# earlier), while the baselines below are string labels. Decode the codes with the
# encoder fitted on the raw labels so truth and predictions are directly comparable.
y_true = pd.Series(
    increase_stock_encoder.inverse_transform(data['increase_stock']),
    index=data.index,
    name='increase_stock',
)

# Method 1: Always predict 'low_bike_demand'
y_pred_low = ['low_bike_demand'] * len(y_true)

# Method 2: Always predict 'high_bike_demand'
y_pred_high = ['high_bike_demand'] * len(y_true)

# Method 3: Random predictions
np.random.seed(0)  # For reproducibility
y_pred_random = np.random.choice(['low_bike_demand', 'high_bike_demand'], size=len(y_true))

# Compare predictions
print("Sample predictions:")
print(f"Always 'low_bike_demand': {y_pred_low[:3]}")
print(f"Always 'high_bike_demand': {y_pred_high[:3]}")
print(f"Random predictions: {y_pred_random[:3]}")

# Score every baseline against the true labels; without this step the baselines
# are only printed and never evaluated.
for baseline_name, baseline_pred in (
    ("Always 'low_bike_demand'", y_pred_low),
    ("Always 'high_bike_demand'", y_pred_high),
    ("Random predictions", y_pred_random),
):
    print(f"Accuracy ({baseline_name}): {accuracy_score(y_true, baseline_pred):.3f}")
