## Part 4: Build a Regression Model

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Load the combined dataset.
# NOTE(review): this is an absolute, machine-specific Windows path; point DATA_PATH at
# your own copy of combined_data.csv (ideally relative to the project) before running.
DATA_PATH = r'C:\Users\Admin\Documents\LighthouseLabs\combined_data.csv'
df = pd.read_csv(DATA_PATH)

# 'number_of_bikes' is the dependent variable
# and the columns listed in FEATURE_COLS are the independent variables
FEATURE_COLS = ['Distance_y', 'Rating_y', 'Review Count', 'Price_y']
TARGET_COL = 'number_of_bikes'

# Keep only rows where the target and every feature are present. OLS does no
# missing-value handling by default, so a NaN in the inputs would break or poison
# the fit. On complete data this is a no-op and the split below is unchanged.
# `df` itself is left untouched for later cells.
model_data = df[FEATURE_COLS + [TARGET_COL]].dropna()
n_dropped = len(df) - len(model_data)
if n_dropped:
    print(f'Dropped {n_dropped} of {len(df)} rows with missing values before modelling')

X = model_data[FEATURE_COLS]
y = model_data[TARGET_COL]

# Splitting the data into training (80%) and testing (20%) sets; the fixed
# random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitting an OLS model. has_constant='add' always adds the intercept column;
# the default ('skip') silently leaves it out when a feature is already constant.
model = sm.OLS(y_train, sm.add_constant(X_train, has_constant='add')).fit()

# Show the regression statistics (the cell's last expression is rendered as output)
model.summary()

## Model Output and Interpretation

In [None]:
# Displaying the regression results
print(model.summary())

# Predictions on the held-out test set.
# has_constant='add' forces the intercept column to be added even if a feature
# happens to be constant within the test split; the default ('skip') would omit it
# and the test matrix would no longer match the fitted coefficients.
# Selecting by the model's own exog_names then guarantees the columns (and their
# order) are exactly the ones the model was fitted on.
X_test_design = sm.add_constant(X_test, has_constant='add')[model.model.exog_names]
predictions = model.predict(X_test_design)

# Side-by-side view of actual vs. predicted values for the first few test rows
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison.head())

## Stretch: Turn the Regression Model into a Classification Model

In [None]:
# Derive a binary target from the continuous bike count: a station is labelled
# 'high availability' (1) when it has at least `threshold` bikes, otherwise 0.

threshold = 15  # arbitrary cut-off, chosen purely for illustration
df['high_availability'] = df['number_of_bikes'].ge(threshold).astype(int)

# 'high_availability' can now serve as the dependent variable in a classification model
# ... (rest of the classification model code)