In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Task 1: Feature Engineering
def preprocess_data(FloridaBikeRentals):
    df = pd.read_csv('FloridaBikeRentals.csv')
    # Rename columns
    df.columns = ['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity', 'Wind_Speed',
                  'Visibility', 'Dew_Point', 'Solar_Radiation', 'Rainfall', 'Snowfall',
                  'Seasons', 'Holiday', 'Functioning_Day']

    # Create one interaction feature
    df['Temp_Humidity'] = df['Temperature'] * df['Humidity']

    # Encode categorical variables
    df = pd.get_dummies(df, columns=['Seasons', 'Hour', 'Holiday', 'Functioning_Day'], drop_first=True)

    # Handle missing values for numerical columns
    numerical_cols = ['Temperature', 'Humidity', 'Wind_Speed', 'Visibility', 'Dew_Point',
                     'Solar_Radiation', 'Rainfall', 'Snowfall', 'Temp_Humidity']
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

    # Features and target
    X = df.drop(['Rented_Bike_Count', 'Date'], axis=1)
    y = df['Rented_Bike_Count']

    # Scale numerical features
    scaler = StandardScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    # Save processed dataset
    pd.concat([X, y], axis=1).to_csv('bike_rental_features.csv', index=False)

    return X, y, scaler

# Task 2 & 3: Model Building (Linear and Polynomial)
def train_models(X, y, scaler):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Linear Regression
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Polynomial features
    numerical_cols = ['Temperature', 'Humidity', 'Wind_Speed', 'Temp_Humidity']
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_train_poly = poly.fit_transform(X_train[numerical_cols])
    X_test_poly = poly.transform(X_test[numerical_cols])

    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)
    y_pred_poly = poly_model.predict(X_test_poly)

    results = {
        'Linear': {
            'MAE': mean_absolute_error(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
            'R2': r2_score(y_test, y_pred),
            'model': model
        },
        'Poly Linear': {
            'MAE': mean_absolute_error(y_test, y_pred_poly),
            'MSE': mean_squared_error(y_test, y_pred_poly),
            'R2': r2_score(y_test, y_pred_poly),
            'model': poly_model
        }
    }

    # Save best model
    best_model_name = min(results, key=lambda x: results[x]['MSE'])
    joblib.dump(results[best_model_name]['model'], 'best_bike_rental_model.pkl')

    return results, X_train, X_test, y_train, y_test

# Task 4: Model Evaluation
def evaluate_models(results, X, y):
    validation_results = {}
    for name, result in results.items():
        model = result['model']
        cv_mse = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error').mean()
        validation_results[name] = {
            'CV_MSE': cv_mse,
            'Test_MAE': result['MAE'],
            'Test_MSE': result['MSE'],
            'Test_R2': result['R2']
        }
    return validation_results
# Task 5: Reporting
def generate_report(results, validation_results, X):
    report = "# Bike Rental Prediction Report\n\n"
    report += "## Model Performance\n"
    for name, metrics in validation_results.items():
        report += f"### {name}\n"
        report += f"- MAE: {metrics['Test_MAE']:.2f}\n"
        report += f"- MSE: {metrics['Test_MSE']:.2f}\n"
        report += f"- R²: {metrics['Test_R2']:.2f}\n"
        report += f"- CV MSE: {metrics['CV_MSE']:.2f}\n\n"

    # Feature importance (for Linear Regression)
    report += "## Feature Importance\n"
    linear_model = results['Linear']['model']
    feature_names = X.columns
    coefficients = linear_model.coef_
    importance = sorted(zip(feature_names, coefficients), key=lambda x: abs(x[1]), reverse=True)[:3]
    for feature, coef in importance:
        report += f"- {feature}: {coef:.2f}\n"

    report += "\n## Insights\n"
    report += "- Temperature and humidity are key drivers.\n"
    report += "- Hour of day impacts rental demand.\n"

    report += "\n## Recommendations\n"
    report += "- Use weather forecasts for better predictions.\n"
    report += "- Add event data for demand spikes.\n"

    with open('bike_rental_report.md', 'w') as f:
        f.write(report)
    return report


# Main execution
def main():
    file_path = 'bike_rental_dataset.csv'
    X, y, scaler = preprocess_data(file_path)
    results, X_train, X_test, y_train, y_test = train_models(X, y, scaler)
    validation_results = evaluate_models(results, X, y)
    report = generate_report(results, validation_results, X)
    print(report)

if __name__ == '__main__':
    main()

# Bike Rental Prediction Report

## Model Performance
### Linear
- MAE: 278.73
- MSE: 137415.81
- R²: 0.67
- CV MSE: 218944.63

### Poly Linear
- MAE: 358.76
- MSE: 240401.18
- R²: 0.42
- CV MSE: 218944.63

## Feature Importance
- Functioning_Day_Yes: 946.32
- Hour_18: 764.09
- Hour_19: 517.19

## Insights
- Temperature and humidity are key drivers.
- Hour of day impacts rental demand.

## Recommendations
- Use weather forecasts for better predictions.
- Add event data for demand spikes.



#Bike Rental Prediction Summary

Model Performance:

*Linear Regression: MAE: 278.73, MSE: 137,415.81, R²: 0.67, CV MSE: 218,944.63

*Polynomial Linear: MAE: 358.76, MSE: 240,401.18, R²: 0.42, CV MSE: 218,944.63

Key Findings:

*Linear model outperforms polynomial model (lower MSE, higher R²).

*Functioning day and peak hours (6 PM, 7 PM) strongly drive rentals.

*Temperature and humidity significantly influence demand.

Recommendations:

*Use real-time weather data for accurate predictions.

*Incorporate event data to capture demand spikes.