In [9]:
import pandas as pd
import statsmodels.api as sm

# Load the combined data
merged_data = pd.read_csv('mergeddata.csv')

# Select relevant columns for modeling
selected_columns = [
    "Shortest Yelp Distance from Station",  # Updated column name
    "Empty_slots"  # Updated column name
]

# Create a subset of the DataFrame with the selected columns
data = merged_data[selected_columns]

# Data Preprocessing
# Make a copy of the data to avoid modifying the original DataFrame
data = data.copy()

# Check for missing values and drop rows with missing values
data.dropna(inplace=True)

# Split the data into independent variables (X) and the target variable (y)
X = data[["Shortest Yelp Distance from Station"]]  # Updated column name
y = data["Empty_slots"]  # Updated column name

# Add a constant term to the independent variables for the intercept
X = sm.add_constant(X)

# Create and train a linear regression model using statsmodels
model = sm.OLS(y, X).fit()

# Print a summary of the regression model
print(model.summary())




                            OLS Regression Results                            
Dep. Variable:            Empty_slots   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.064
Method:                 Least Squares   F-statistic:                     3.243
Date:                Thu, 07 Sep 2023   Prob (F-statistic):             0.0811
Time:                        01:14:44   Log-Likelihood:                -122.76
No. Observations:                  34   AIC:                             249.5
Df Residuals:                      32   BIC:                             252.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
co

The R-squared value of the model is 0.092, indicating that the model explains a limited amount of the variability in the data. The Adj. R-squared (Adj. R²) is 0.064, suggesting that adding more relevant variables or exploring other factors may be necessary to improve the model's explanatory power. Additionally, the p-value of 0.0811 implies that the model is not highly statistically significant, indicating that "Shortest Yelp Distance from Station" may not be a strong predictor of the number of empty slots.

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load your dataset (mergeddata.csv) here
merged_df_citybik_fsq_yelp = pd.read_csv('mergeddata.csv')

# Define the classes: Class 1 (Low Availability) and Class 2 (High Availability)
merged_df_citybik_fsq_yelp['Availability'] = (merged_df_citybik_fsq_yelp['Empty_slots'] <= 5).astype(int)

# Create a copy of the DataFrame to avoid the SettingWithCopyWarning
data = merged_df_citybik_fsq_yelp.copy()

# Select relevant columns for modeling
selected_columns = ["Shortest Yelp Distance from Station", "Availability"]
data = data[selected_columns]

# Data Preprocessing
data.dropna(inplace=True)

# Split the data into training and testing sets
X = data[["Shortest Yelp Distance from Station"]]
y = data["Availability"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the classification model
accuracy = accuracy_score(y_test, y_pred)

# Set zero_division=1 to avoid UndefinedMetricWarning
classification_rep = classification_report(y_test, y_pred, zero_division=1)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

# Now use this trained classification model for prediction
# For example:
new_data = pd.DataFrame({
    "Shortest Yelp Distance from Station": [25],
})
predicted_class = model.predict(new_data)
print("Predicted Class:", predicted_class)



Accuracy: 0.5714285714285714
Classification Report:
               precision    recall  f1-score   support

           0       0.57      1.00      0.73         4
           1       1.00      0.00      0.00         3

    accuracy                           0.57         7
   macro avg       0.79      0.50      0.36         7
weighted avg       0.76      0.57      0.42         7

Predicted Class: [0]
