In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


In [2]:

# Read the raw order records.
data = pd.read_csv('porter_data_1.csv')

# Derive the delivery duration in minutes from the two timestamp columns and
# swap the raw timestamps for that single derived column.
order_created = pd.to_datetime(data['created_at'])
order_delivered = pd.to_datetime(data['actual_delivery_time'])
data = (
    data
    .drop(columns=['created_at', 'actual_delivery_time'])
    .assign(time_taken_minutes=(order_delivered - order_created).dt.total_seconds() / 60)
)

# Predictors and response used by every model below.
# 'total_onshift_dashers' and 'total_busy_dashers' are currently left out of
# the feature set.
final_features = [
    'market_id',
    'store_primary_category',
    'order_protocol',
    'total_items',
    'num_distinct_items',
    'total_outstanding_orders',
    'distance',
]
X = data.loc[:, final_features]
y = data.loc[:, 'time_taken_minutes']

# --- Statsmodels OLS ---
# statsmodels does not add an intercept on its own, so prepend one explicitly.
X_const = sm.add_constant(X)
ols_model = sm.OLS(endog=y, exog=X_const).fit()
print("\n--- Statsmodels OLS Summary ---", ols_model.summary(), sep="\n")

def calculate_vif(X):
    """Compute the variance inflation factor (VIF) for every column of ``X``.

    ``variance_inflation_factor`` regresses one column of the matrix it is
    given on all the remaining columns and does not add an intercept itself.
    An intercept column is therefore added here before the computation (unless
    ``X`` already contains a constant column). Without it the auxiliary
    regressions are forced through the origin and the reported VIFs are
    distorted for any feature whose mean is not zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, one column per feature.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"Feature"`` (the column names of ``X``) and ``"VIF"``,
        with one row per column of ``X``. The helper intercept column is used
        only for the computation and is not reported.
    """
    design = sm.add_constant(X, has_constant="skip")
    # add_constant prepends the intercept, so the original columns are shifted
    # right by the number of columns that were actually added (0 or 1).
    offset = design.shape[1] - X.shape[1]
    # Materialise the array once instead of once per feature.
    design_values = design.values

    vif = pd.DataFrame()
    vif["Feature"] = X.columns
    vif["VIF"] = [
        variance_inflation_factor(design_values, offset + i)
        for i in range(X.shape[1])
    ]
    return vif

# Multicollinearity report for the selected predictors.
vif_data = calculate_vif(X)
print("\nVIF Values:", vif_data, sep="\n")

# --- Script 2: Scikit-learn Linear Regression ---

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = lr_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Adjusted R-squared penalises R-squared for the number of predictors:
# n = held-out observations, p = predictors.
n, p = X_test.shape
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print("\n--- Scikit-learn Linear Regression ---")
print(f"R-squared: {r2:.4f}")
print(f"Adjusted R-squared: {adjusted_r2:.4f}")

# --- End of Scripts ---


--- Statsmodels OLS Summary ---
                            OLS Regression Results                            
Dep. Variable:     time_taken_minutes   R-squared:                       0.779
Model:                            OLS   Adj. R-squared:                  0.779
Method:                 Least Squares   F-statistic:                 6.903e+04
Date:                Tue, 29 Apr 2025   Prob (F-statistic):               0.00
Time:                        10:12:01   Log-Likelihood:            -5.0905e+05
No. Observations:              175777   AIC:                         1.018e+06
Df Residuals:                  175767   BIC:                         1.018e+06
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------