<a href="https://colab.research.google.com/github/Nova-Mok/Demand-for-shared-bikes/blob/main/Bike_Demand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import scipy.stats as stats

# here we will load the dataset
df = pd.read_csv('/content/sample_data/day.csv')  # Replace with your actual dataset path

print(df.head())
print(df.info())

# dataset for numerical columns
print(df.describe())

# Data Preprocessing
# Converting numerical codes to categorical for better readability and analysis
df['season'] = df['season'].map({1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'})
df['weathersit'] = df['weathersit'].map({
    1: 'clear',
    2: 'mist',
    3: 'light_rain',
    4: 'heavy_rain'
})
df['yr'] = df['yr'].map({0: '2018', 1: '2019'})

# Converting 'mnth' and 'weekday' to string to treat them as categorical variables
df['mnth'] = df['mnth'].astype(str)
df['weekday'] = df['weekday'].astype(str)

# dummy variables for categorical features
df = pd.get_dummies(df, drop_first=True)

# Extracting variable 'cnt'
y = df['cnt']

# Check for the columns before dropping them
columns_to_drop = ['instant', 'dteday', 'casual', 'registered', 'cnt']
df.drop(columns=[col for col in columns_to_drop if col in df.columns], axis=1, inplace=True)

X = df.apply(pd.to_numeric, errors='coerce')

# Check for missing or infinite values in the dataset
print("Missing values in the dataset:\n", X.isnull().sum())
print("Infinite values in the dataset:\n", np.isinf(X).sum())

# Handling missing or infinite values by filling with the mean
X.fillna(X.mean(), inplace=True)
X = X.astype(float)

# Check for high correlations among features
correlation_matrix = X.corr().abs()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.8)]
print("Highly correlated features:\n", high_correlation_features)

# Drop highly correlated features
X = X.drop(columns=high_correlation_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# printing R-squared score
r2 = r2_score(y_test, y_pred)
print(f'R-squared score on test data: {r2}')

# Residual Analysis
residuals = y_test - y_pred

# Plot residuals
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Residual Distribution')
plt.savefig('residual_distribution.png')
plt.close()

# Plot predicted vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Count')
plt.ylabel('Predicted Count')
plt.title('Actual vs Predicted Bike Demand')
plt.savefig('actual_vs_predicted.png')
plt.close()

# Check for multicollinearity using VIF after dropping highly correlated features
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("Variance Inflation Factors:")
print(vif_data)

# Output the coefficients of the model
coef_df = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print("Model Coefficients:")
print(coef_df)

# Q-Q Plot for Normality of Residuals
plt.figure(figsize=(10, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Q-Q Plot of Residuals')
plt.savefig('qq_plot.png')
plt.close()

# top 3 features contributing to bike demand
print("Top 3 features contributing to bike demand:")
print(coef_df.sort_values(by='Coefficient', ascending=False).head(3))

# plot
plt.figure(figsize=(12, 8))
coef_df['abs_coef'] = abs(coef_df['Coefficient'])
coef_df = coef_df.sort_values('abs_coef', ascending=False)
sns.barplot(x='abs_coef', y=coef_df.index, data=coef_df)
plt.title('Feature Importance')
plt.xlabel('Absolute Coefficient Value')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

   instant      dteday  season  yr  mnth  holiday  weekday  workingday  \
0        1  01-01-2018       1   0     1        0        6           0   
1        2  02-01-2018       1   0     1        0        0           0   
2        3  03-01-2018       1   0     1        0        1           1   
3        4  04-01-2018       1   0     1        0        2           1   
4        5  05-01-2018       1   0     1        0        3           1   

   weathersit       temp     atemp      hum  windspeed  casual  registered  \
0           2  14.110847  18.18125  80.5833  10.749882     331         654   
1           2  14.902598  17.68695  69.6087  16.652113     131         670   
2           1   8.050924   9.47025  43.7273  16.636703     120        1229   
3           1   8.200000  10.60610  59.0435  10.739832     108        1454   
4           1   9.305237  11.46350  43.6957  12.522300      82        1518   

    cnt  
0   985  
1   801  
2  1349  
3  1562  
4  1600  
<class 'pandas.core.frame.

  vif = 1. / (1. - r_squared_i)


Variance Inflation Factors:
                   Feature  VIF
0                  holiday  inf
1               workingday  inf
2                     temp  inf
3                      hum  inf
4                windspeed  inf
..                     ...  ...
752              weekday_4  inf
753              weekday_5  inf
754              weekday_6  inf
755  weathersit_light_rain  inf
756        weathersit_mist  inf

[757 rows x 2 columns]
Model Coefficients:
                       Coefficient
holiday                -292.080616
workingday              312.074401
temp                    110.412263
hum                     -16.327516
windspeed               -37.125197
...                            ...
weekday_4                45.547288
weekday_5               118.710683
weekday_6               249.559919
weathersit_light_rain -1707.279393
weathersit_mist        -500.897901

[757 rows x 1 columns]
Top 3 features contributing to bike demand:
                   Coefficient
dteday_04-07-2018  2389.3