In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [8]:
df = pd.read_csv(r"C:\Users\saksh\OneDrive\Documents\Fall 2023\ML\MS Course Project\ML\MS Course Project\cleaned_medal_data.csv")

In [9]:
# Preprocess the data
# Convert categorical variables to numerical using Label Encoding
le = LabelEncoder()
df['Medal'] = le.fit_transform(df['Medal'])
df['region'] = le.fit_transform(df['region'])

In [10]:
df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,region,notes,Bronze,Gold,Silver
0,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Individual All-Around,0,36,,True,False,False
1,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,1,36,,False,True,False
2,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Horse Vault,1,36,,False,True,False
3,17,Paavo Johannes Aaltonen,M,28.0,175.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Pommelled Horse,1,36,,False,True,False
4,17,Paavo Johannes Aaltonen,M,32.0,175.0,64.0,Finland,FIN,1952 Summer,1952,Summer,Helsinki,Gymnastics,Gymnastics Men's Team All-Around,0,36,,True,False,False


In [16]:
# Select relevant features
features = ['Age', 'Height', 'Weight', 'Year', 'region']

X = df[features]
y = df['Medal']

In [17]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=45)

In [18]:
# Create an SVM model
model = SVC(kernel='linear')

In [19]:
# Train the model
model.fit(X_train, y_train)

In [20]:
# Make predictions
predictions = model.predict(X_test)

In [21]:
# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

Model Accuracy: 36.56%


In [26]:
# Calculate mean squared error (MSE)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Square Error: {mse:.2f}')


Mean Square Error: 1.11


In [29]:
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 1.05


In [30]:
# Print classification report
print('\nClassification Report:')
print(classification_report(y_test, predictions))


Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.52      0.43      4299
           1       0.37      0.56      0.44      4302
           2       0.47      0.00      0.00      4077

    accuracy                           0.37     12678
   macro avg       0.40      0.36      0.29     12678
weighted avg       0.40      0.37      0.30     12678



In [31]:
# Print confusion matrix
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, predictions))


Confusion Matrix:
[[2221 2073    5]
 [1892 2406    4]
 [2002 2067    8]]


In [33]:
new_data_encoded = pd.get_dummies(df[features])

In [34]:
N_predictions = model.predict(new_data_encoded)

In [35]:
N_predictions.tolist()

[0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,


In [36]:
predictions_with_countries = pd.DataFrame({
    'Country': df['region'],  # Replace 'Country' with the actual column name in your 'new_data' DataFrame
    'Prediction': N_predictions
})

In [37]:
# Convert predictions to a list
predictions_list = predictions_with_countries.values.tolist()