In [2]:
# app.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer

# -----------------------------
# Load dataset
# -----------------------------
# "?" is used as a missing-value marker. Declaring it through `na_values`
# converts it to NaN while the file is being parsed, so a column that
# contains "?" can still be inferred as numeric. Replacing the marker after
# loading would leave such a column as strings (object dtype), which the
# one-hot encoder would then learn as string categories.
df = pd.read_csv("Dataset.csv", na_values="?")

# -----------------------------
# Define features and target
# -----------------------------
# Target: the `cnt` column.
y = df['cnt']
# Features: every remaining column except `instant`, `dteday`, `casual`
# and `registered`, which are not used as model inputs.
X = df.drop(columns=['cnt', 'instant', 'dteday', 'casual', 'registered'])

# -----------------------------
# Train-test split
# -----------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# Feature groups
# -----------------------------
# Columns sent through the categorical branch (impute, then one-hot encode).
categorical_cols = ['season', 'holiday', 'workingday', 'weathersit']
# Columns sent through the numeric branch (impute, then standardize).
numeric_cols = ['temp', 'atemp', 'hum', 'windspeed', 'yr', 'mnth', 'hr', 'weekday']

# -----------------------------
# Per-group preprocessing
# -----------------------------
# Numeric branch: fill missing values with the column mean, then scale.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode. Categories not seen during fitting are ignored
# rather than raising an error.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own column group; numeric output comes first.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

# -----------------------------
# End-to-end estimator
# -----------------------------
# Preprocessing and regressor are bundled so a single fit/predict call
# (and a single saved artifact) covers both.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42)),
])

# -----------------------------
# Fit on the training split
# -----------------------------
print("Training model...")
model.fit(X_train, y_train)

# -----------------------------
# Persist the fitted pipeline
# -----------------------------
joblib.dump(value=model, filename="model.pkl")
print("Model saved as 'model.pkl'.")

# -----------------------------
# Reload from disk and predict
# -----------------------------
# Predictions come from the reloaded artifact rather than the in-memory
# `model`, so the saved file itself is what gets evaluated.
print("Loading model and testing predictions...")
loaded_model = joblib.load(filename="model.pkl")
y_pred = loaded_model.predict(X_test)

# -----------------------------
# Hold-out metrics
# -----------------------------
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

print("\nModel Performance on Test Data:")
print("R² Score:", r2)
print("MAE:", mae)
print("RMSE:", rmse)

# -----------------------------
# Single-row prediction demo
# -----------------------------
# Keep the first test row as a one-row DataFrame so the pipeline still
# receives named columns.
example_input = X_test.head(1)
predicted_count = loaded_model.predict(example_input)[0]
# int() truncates the fractional part of the predicted count.
print("\nExample Prediction for first test row:", int(predicted_count))


Training model...
Model saved as 'model.pkl'.
Loading model and testing predictions...

Model Performance on Test Data:
R² Score: 0.9432398187158286
MAE: 25.026498177982354
RMSE: 42.394981871621766

Example Prediction for first test row: 376
