In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load data
df = pd.read_csv("../data/cleaned_olympics_data.csv")


In [2]:
# Convert Medal to binary: 1 = Medal, 0 = No Medal (though all rows here have medals)
# We'll instead predict type of medal: Gold/Silver/Bronze

# Encode Medal as target
le_medal = LabelEncoder()
df['Medal_encoded'] = le_medal.fit_transform(df['Medal'])  # Gold=1, Bronze=0, Silver=2 (order may vary)

# Encode Gender, Country, Sport
df['Gender_encoded'] = LabelEncoder().fit_transform(df['Gender'])
df['Country_encoded'] = LabelEncoder().fit_transform(df['Country'])
df['Sport_encoded'] = LabelEncoder().fit_transform(df['Sport'])


In [3]:
# Define features and target
X = df[['Gender_encoded', 'Country_encoded', 'Sport_encoded']]
y = df['Medal_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Initialize and train model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [5]:
# Confusion Matrix and Classification Report
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le_medal.classes_))


Confusion Matrix:
 [[640 371  16]
 [591 433  19]
 [557 418  19]]

Classification Report:
               precision    recall  f1-score   support

      Bronze       0.36      0.62      0.45      1027
        Gold       0.35      0.42      0.38      1043
      Silver       0.35      0.02      0.04       994

    accuracy                           0.36      3064
   macro avg       0.35      0.35      0.29      3064
weighted avg       0.35      0.36      0.29      3064

