In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as SklearnLinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import zipfile
import os


In [31]:
def unzip_file(zip_path, extract_to):
    """
    Unzips a file to the specified directory.

    :param zip_path: Path to the zip file.
    :param extract_to: Directory where files should be extracted.
    """
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        print(f"Extracted all files to {extract_to}")

bike_sharing_zip = '/content/bike+sharing+dataset.zip'
dataset = 'datasets'
unzip_file(bike_sharing_zip, dataset)


Extracted all files to datasets


In [32]:
# Load datasets
day_df = pd.read_csv('/content/extracted_files/day.csv')
hour_df = pd.read_csv('/content/extracted_files/hour.csv')


In [33]:
# Extract features and target variable
X = hour_df[['temp', 'hum', 'hr', 'weekday']]
y = hour_df['cnt']

# Add interaction features
X['temp_hum_interaction'] = X['temp'] * X['hum']
X['hour_temp_interaction'] = X['hr'] * X['temp']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['temp_hum_interaction'] = X['temp'] * X['hum']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['hour_temp_interaction'] = X['hr'] * X['temp']


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [35]:
# Initialize and train the scikit-learn LinearRegression model
sklearn_model = SklearnLinearRegression()
sklearn_model.fit(X_train, y_train)


In [36]:
# Define the pipeline with StandardScaler and LinearRegression
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('model', SklearnLinearRegression())  # Linear Regression model
])

# Train the model using the pipeline
pipeline.fit(X_train, y_train)


In [37]:
# Make predictions
y_pred_pipeline = pipeline.predict(X_test)

# Evaluate performance
mse_pipeline = mean_squared_error(y_test, y_pred_pipeline)
r2_pipeline = r2_score(y_test, y_pred_pipeline)

print(f"Mean Squared Error (Pipeline): {mse_pipeline}")
print(f"R-squared (Pipeline): {r2_pipeline}")


Mean Squared Error (Pipeline): 20286.221077462953
R-squared (Pipeline): 0.3593570959203928


In [38]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform training data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add intercept term
X_train_with_intercept = np.c_[np.ones(X_train.shape[0]), X_train_scaled]
X_test_with_intercept = np.c_[np.ones(X_test.shape[0]), X_test_scaled]


In [39]:
# Initialize parameters
m, n = X_train_with_intercept.shape
theta = np.zeros(n)

# Cost function
def compute_cost(X, y, theta):
    m = len(y)
    predictions = X.dot(theta)
    cost = (1 / (2 * m)) * np.sum(np.square(predictions - y))
    return cost

# Gradient Descent
def gradient_descent(X, y, theta, learning_rate, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)

    for i in range(iterations):
        predictions = X.dot(theta)
        theta -= (1 / m) * learning_rate * (X.T.dot(predictions - y))
        cost_history[i] = compute_cost(X, y, theta)

    return theta, cost_history

# Set hyperparameters
learning_rate = 0.01
iterations = 1000

# Train model
theta, cost_history = gradient_descent(X_train_with_intercept, y_train, theta, learning_rate, iterations)


In [40]:
# Make predictions
y_pred_scratch = X_test_with_intercept.dot(theta)

# Evaluate performance
mse_scratch = mean_squared_error(y_test, y_pred_scratch)

# R-squared Calculation
y_mean = np.mean(y_test)
ss_total = np.sum((y_test - y_mean) ** 2)
ss_residual = np.sum((y_test - y_pred_scratch) ** 2)
r2_scratch = 1 - (ss_residual / ss_total)

print(f"Mean Squared Error (from scratch): {mse_scratch}")
print(f"R-squared (from scratch): {r2_scratch}")


Mean Squared Error (from scratch): 20372.635007063105
R-squared (from scratch): 0.35662812680383926
