# 1.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the data from the CSV file
data = pd.read_csv('./materials/athletes.csv')

In [3]:
# Check for missing values in each column
# data.head()
print(data.isnull().sum())

id               0
name             0
nationality      0
sex              0
dob              1
height         330
weight         659
sport            0
gold             0
silver           0
bronze           0
dtype: int64


In [4]:
# Drop rows where 'dob' is missing
data.dropna(subset=['dob'], inplace=True)

# Fill missing 'height' values with the mean height and assign it back to the 'height' column
data['height'] = data['height'].fillna(data['height'].mean())

# Fill missing 'weight' values with the median weight and assign it back to the 'weight' column
data['weight'] = data['weight'].fillna(data['weight'].median())

In [5]:
# Convert categorical variables to numeric using one-hot encoding
data = pd.get_dummies(data)

In [6]:
# Display the first few rows of the processed dataset to verify the changes
print(data.head())

          id  height  weight  gold  silver  bronze  name_A Jesus Garcia  \
0  736041664    1.72    64.0     0       0       0                 True   
1  532037425    1.68    56.0     0       0       0                False   
2  435962603    1.98    79.0     0       0       1                False   
3  521041435    1.83    80.0     0       0       0                False   
4   33922579    1.81    71.0     0       0       0                False   

   name_A Lam Shin  name_Aaron Brown  name_Aaron Cook  ...  \
0            False             False            False  ...   
1             True             False            False  ...   
2            False              True            False  ...   
3            False             False             True  ...   
4            False             False            False  ...   

   sport_rugby sevens  sport_sailing  sport_shooting  sport_table tennis  \
0               False          False           False               False   
1               False   

# 2.

In [7]:
# Create a new column 'won_medal' to indicate whether an athlete has won at least one medal
data['won_medal'] = (data['gold'] + data['silver'] + data['bronze'] > 0).astype(int)

In [8]:
# Features (excluding the target column and other non-predictive columns like 'id' and 'name')
X = data.drop(['won_medal', 'id', 'gold', 'silver', 'bronze'], axis=1)

# New target column
y = data['won_medal']

In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Verify the split (train & test)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Training set size: 9229 samples
Test set size: 2308 samples


# 3.

In [11]:
# Get logistic regression model to teach
from sklearn.linear_model import LogisticRegression

In [12]:
# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)  # Increasing max_iter for convergence, adjust as needed

In [13]:
# Train the model with the training data
log_reg.fit(X_train, y_train)

In [14]:
# Make predictions on the test data
y_pred = log_reg.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Calculate the ROC-AUC score
roc_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])
print(f"ROC-AUC: {roc_auc}")

Accuracy: 0.83578856152513
ROC-AUC: 0.7550296136128607


![ROC-AUC](https://miro.medium.com/v2/resize:fit:1358/1*Bgc9QOjhnL70g2SQxyj6hQ.png)