In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Read the bike-rental CSV into `df`. The file is decoded as ISO-8859-1
# (Latin-1); its headers contain the degree sign, e.g. 'Temperature(°C)'.
csv_path = 'bikedata.csv'
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

In [3]:
# Show the column labels exactly as they were read from the CSV.
raw_columns = df.columns
print("Columns before renaming:\n", raw_columns)

Columns before renaming:
 Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')


In [4]:
# Replace the raw headers (units, spaces, parentheses) with plain
# snake_case-style identifiers. Columns not listed here keep their names.
column_renames = {
    "Temperature(°C)": "Temperature",
    "Functioning Day": "Functioning_Day",
    "Humidity(%)": "Humidity",
    "Wind speed (m/s)": "Wind_speed",
    "Visibility (10m)": "Visibility",
    "Dew point temperature(°C)": "Dew_point_temperature",
    "Solar Radiation (MJ/m2)": "Solar_Radiation",
    "Snowfall (cm)": "Snowfall",
    "Rainfall(mm)": "Rainfall",
    "Rented Bike Count": "Rented_Bike_Count",
}
df = df.rename(columns=column_renames)

In [5]:
# Confirm the rename took effect by listing the current column labels.
renamed_columns = df.columns
print("Columns after renaming:\n", renamed_columns)

Columns after renaming:
 Index(['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity',
       'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation',
       'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day'],
      dtype='object')


In [6]:
# Convert the 'Date' column from day/month/year strings to datetimes so the
# `.dt` accessor can be used on it afterwards.
df = df.assign(Date=pd.to_datetime(df['Date'], format='%d/%m/%Y'))

In [7]:
# Derive calendar features from the parsed 'Date' column. The new columns are
# appended in the order Year, Month, Day, DayOfWeek.
date_parts = df['Date'].dt
df = df.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
    DayOfWeek=date_parts.dayofweek,
)

In [8]:
# List the columns as they stand before the categorical ones are one-hot encoded.
columns_before_encoding = df.columns
print("Columns before pd.get_dummies:\n", columns_before_encoding)

Columns before pd.get_dummies:
 Index(['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity',
       'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation',
       'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day', 'Year',
       'Month', 'Day', 'DayOfWeek'],
      dtype='object')


In [9]:
# Preview the first five rows, including the derived calendar columns.
df.head(n=5)

Unnamed: 0,Date,Rented_Bike_Count,Hour,Temperature,Humidity,Wind_speed,Visibility,Dew_point_temperature,Solar_Radiation,Rainfall,Snowfall,Seasons,Holiday,Functioning_Day,Year,Month,Day,DayOfWeek
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017,12,1,4
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017,12,1,4
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes,2017,12,1,4
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017,12,1,4
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes,2017,12,1,4


In [10]:
# One-hot encode the categorical columns, but only when all of them are still
# present in `df`; otherwise report which ones are absent and leave `df` as is.
expected_columns = ['Seasons', 'Holiday', 'Functioning_Day']
present_columns = set(df.columns)
missing_columns = [name for name in expected_columns if name not in present_columns]
if not missing_columns:
    # Each listed column is replaced by one indicator column per category.
    df = pd.get_dummies(df, columns=expected_columns)
else:
    print(f"Missing columns: {missing_columns}")

In [11]:
# List the columns again to see the indicator columns produced by the encoding.
columns_after_encoding = df.columns
print("Columns after pd.get_dummies:\n", columns_after_encoding)

Columns after pd.get_dummies:
 Index(['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity',
       'Wind_speed', 'Visibility', 'Dew_point_temperature', 'Solar_Radiation',
       'Rainfall', 'Snowfall', 'Year', 'Month', 'Day', 'DayOfWeek',
       'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
       'Holiday_Holiday', 'Holiday_No Holiday', 'Functioning_Day_No',
       'Functioning_Day_Yes'],
      dtype='object')


In [12]:
# Assemble the model inputs (X) and the target (y).
# The feature list is built from three groups; concatenation order matches the
# column order the models are trained on.
measurement_features = [
    'Hour', 'Temperature', 'Humidity', 'Wind_speed', 'Visibility',
    'Dew_point_temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
]
calendar_features = ['Year', 'Month', 'Day', 'DayOfWeek']
indicator_features = [
    'Seasons_Autumn', 'Seasons_Spring', 'Seasons_Summer', 'Seasons_Winter',
    'Holiday_Holiday', 'Holiday_No Holiday',
    'Functioning_Day_No', 'Functioning_Day_Yes',
]
features = measurement_features + calendar_features + indicator_features

X = df.loc[:, features]
y = df.loc[:, 'Rented_Bike_Count']

In [13]:
# Hold out 20% of the rows for validation; random_state=42 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts


In [14]:
# Fit an ordinary linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [15]:
# Use the fitted linear model to predict the target for the held-out rows.
y_pred = model.predict(X=X_val)

In [16]:
# Score the validation predictions with three regression metrics.
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

# Report each metric on its own line, in the order MAE, R2, MSE.
metric_report = (
    ("Mean Absolute Error: ", mae),
    ("R2 Score: ", r2),
    ("Mean Squared Error: ", mse),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean Absolute Error:  330.5708810592736
R2 Score:  0.5361985041868915
Mean Squared Error:  193241.14827407734


In [17]:
# Check sample predictions
# Print up to the first 10 validation rows side by side. Slicing both
# sequences (instead of indexing positions 0..9) means a validation set with
# fewer than 10 rows prints what it has rather than raising IndexError.
print("Sample Predictions vs Actual:")
for predicted, actual in zip(y_pred[:10], y_val.iloc[:10]):
    print(f"Predicted: {predicted}, Actual: {actual}")

Sample Predictions vs Actual:
Predicted: 897.9983222021256, Actual: 1728
Predicted: 1127.8277131770737, Actual: 822
Predicted: 1343.8756439820863, Actual: 658
Predicted: 1358.4145284397528, Actual: 2716
Predicted: 561.1641619249713, Actual: 1083
Predicted: 747.8093149573542, Actual: 636
Predicted: 1484.223426131066, Actual: 1537
Predicted: 1024.9719921050128, Actual: 712
Predicted: 888.6669264957309, Actual: 425
Predicted: 870.6557661562692, Actual: 594


Random Forest

In [20]:
# Random Forest regressor, trained and scored on the train/validation split
# created earlier for the linear regression (X_train, X_val, y_train, y_val),
# so both models are evaluated on the same validation rows.
# RandomForestRegressor is imported in the import cell at the top of the notebook.
# NOTE: y_pred, mae, r2 and mse are reassigned below and therefore no longer
# hold the linear regression results after this cell runs.

# Train a Random Forest regression model (random_state fixed for repeatable runs)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on validation set
y_pred = rf_model.predict(X_val)

# Evaluate the model
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2)
print("Mean Squared Error: ", mse)

# Check sample predictions
# Slicing keeps this safe when the validation set has fewer than 10 rows.
print("Sample Predictions vs Actual:")
for predicted, actual in zip(y_pred[:10], y_val.iloc[:10]):
    print(f"Predicted: {predicted}, Actual: {actual}")

Mean Absolute Error:  97.86505136986301
R2 Score:  0.9282501789253162
Mean Squared Error:  29894.29300702055
Sample Predictions vs Actual:
Predicted: 1773.26, Actual: 1728
Predicted: 725.26, Actual: 822
Predicted: 679.32, Actual: 658
Predicted: 2368.95, Actual: 2716
Predicted: 817.34, Actual: 1083
Predicted: 798.9, Actual: 636
Predicted: 1688.01, Actual: 1537
Predicted: 721.21, Actual: 712
Predicted: 482.97, Actual: 425
Predicted: 483.72, Actual: 594
