In [None]:
# Import the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Assume 'df_clean' is your prepared DataFrame from the EDA.
# If you are starting in a new notebook, you would load and clean it first.

# --- Step 1: Define Features (X) and Target (y) ---
# Target vector: the engineered 'result' column.
y = df_clean["result"]

# Columns that must not reach the model as features:
#   - 'result' is the target itself;
#   - 'score_home' / 'score_away' are what the result is calculated from, so
#     keeping them would leak the answer;
#   - 'schedule_date' and 'stadium' are left out to keep this model simple.
features_to_drop = [
    "result",
    "score_home",
    "score_away",
    "schedule_date",
    "stadium",
]

# Feature matrix: every remaining column of the prepared DataFrame.
X = df_clean.drop(features_to_drop, axis="columns")


# --- Step 2: Convert Text Columns to Numbers (One-Hot Encoding) ---
# Text columns (the team names, and schedule_week, which is text as well) are
# expanded into indicator columns. No column list is passed, so pandas decides
# on its own which columns to encode.
print("Preparing data... (This may take a moment)")
X_encoded = pd.get_dummies(data=X)


# --- Step 3: Split the Data into Training and Testing Sets ---
# Hold back 20% of the rows for testing and train on the remaining 80%.
# Fixing random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)


# --- Step 4: Train the Logistic Regression Model ---
# max_iter=1000 gives the solver a generous iteration budget to converge;
# random_state=42 keeps the fit reproducible.
# fit() returns the fitted estimator, so construction and training are chained.
print("Training the model...")
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)
print("Model training complete.")


# --- Step 5: Evaluate the Model's Performance ---
# Accuracy (in percent) the model has to beat: the "Home-Field Advantage"
# baseline. Defined once so the printed figure and the comparison below can
# never disagree.
# NOTE(review): presumably measured during the earlier EDA -- confirm the value.
baseline_accuracy_pct = 57.0

# Predict on the held-out test rows, which were not used for fitting.
predictions = model.predict(X_test)

# Share of test rows whose predicted outcome matches the real one (0.0 - 1.0).
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100

print("\n--- Model Evaluation ---")
print(f"Our model's accuracy is: {accuracy_pct:.2f}%")
print(f"Baseline to beat (Home-Field Advantage): {baseline_accuracy_pct:.2f}%")

# Strictly greater: merely matching the baseline does not count as a success.
if accuracy_pct > baseline_accuracy_pct:
    print("\nSuccess! Our model is better than the baseline.")
else:
    print("\nOur model did not outperform the baseline.")
