# USA unlikely to medal

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Read the four input tables in one pass: the encoded training frame (`df`),
# the encoded Olympic-candidate frame (`olymp`), the name lookup table
# (`hbnames`) and the un-encoded 2022-23 results (`hbolymp`).
df, olymp, hbnames, hbolymp = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Combine_Data/men/hb_encoded.csv',
        '../../Data/cleandata22-23/encoded_m_olympics_hb.csv',
        '../../Data/cleandata22-23/encoded_m_olympics_hbnames.csv',
        '../../Data/cleandata22-23/men22_23.csv',
    )
)

In [3]:
# Print the column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rank         381 non-null    int64  
 1   D            381 non-null    float64
 2   E            381 non-null    float64
 3   ND           381 non-null    float64
 4   Total        381 non-null    float64
 5   year         381 non-null    int64  
 6   medal        381 non-null    int64  
 7   Name         381 non-null    int64  
 8   Nation       381 non-null    int64  
 9   round_final  381 non-null    bool   
 10  round_qual   381 non-null    bool   
dtypes: bool(2), float64(4), int64(5)
memory usage: 27.7 KB


In [4]:
# Compare several off-the-shelf classifiers on one hold-out split of `df`,
# using the `medal` column as the label and every other column as a feature.
y = df['medal']
X = df.drop(columns=['medal'])

# NOTE(review): `Rank` stays in the feature set; if `medal` is derived from the
# rank this leaks the label into the inputs -- confirm before trusting scores.

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize classifiers. The stochastic estimators get a fixed seed so that
# re-running this cell reproduces the same numbers.
classifiers = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(random_state=42),
}

# Classifiers that are fitted behind a StandardScaler.
needs_scaling = {"SVM", "K-Nearest Neighbors", "Neural Network"}

results = {}

# Iterate through each classifier
for name, estimator in classifiers.items():
    # Wrap the estimator in a scaling pipeline where required
    if name in needs_scaling:
        clf = make_pipeline(StandardScaler(), estimator)
    else:
        clf = estimator

    # Fit on the training data and predict the held-out rows
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Evaluate the model's performance. zero_division=0 reports 0.0 for a class
    # that is never predicted (the same value as the default) without emitting
    # an undefined-metric warning for every such classifier.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results[name] = {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": conf_matrix,
    }

# Print the results
for name, result in results.items():
    print(f"Classifier: {name}")
    print(f"Accuracy: {result['accuracy']:.4f}")
    print(f"Classification Report:\n{result['classification_report']}")
    print(f"Confusion Matrix:\n{result['confusion_matrix']}\n")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier: Random Forest
Accuracy: 0.9740
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        73
           1       1.00      0.50      0.67         4

    accuracy                           0.97        77
   macro avg       0.99      0.75      0.83        77
weighted avg       0.97      0.97      0.97        77

Confusion Matrix:
[[73  0]
 [ 2  2]]

Classifier: AdaBoost
Accuracy: 0.9870
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        73
           1       1.00      0.75      0.86         4

    accuracy                           0.99        77
   macro avg       0.99      0.88      0.93        77
weighted avg       0.99      0.99      0.99        77

Confusion Matrix:
[[73  0]
 [ 1  3]]

Classifier: SVM
Accuracy: 0.9610
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.0



Classifier: Decision Tree
Accuracy: 0.9870

Classifier: AdaBoost
Accuracy: 0.9870

In [5]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,Rank,D,E,ND,Total,year,Name,Nation,round_final,round_qual
78,20,4.5,8.7,0.0,13.2,2019,84,26,False,True
265,149,4.9,7.233,0.0,12.133,2019,154,9,False,True
93,35,4.4,8.1,0.0,12.5,2019,98,4,False,True
377,19,5.1,7.4,0.0,24.6,2020,12,74,True,False
272,156,4.4,7.6,0.0,12.0,2019,239,68,False,True


In [8]:
# Preview the first five rows of the Olympic-candidate frame.
olymp.head(n=5)


Unnamed: 0,Rank,D,E,ND,Total,year,Name,Nation,round_AAfinal,round_TeamFinal,round_TeamQual,round_final,round_qual
0,18.0,3.9,8.266,0.0,12.166,2022,234,63,False,False,False,False,True
1,18.0,3.9,8.266,0.0,12.166,2022,234,63,False,False,False,False,True
2,39.0,4.6,6.7,0.0,11.3,2023,234,63,False,False,False,False,True
3,44.0,4.6,8.166,0.0,12.766,2023,234,63,False,False,False,False,True
4,10.0,5.3,8.133,0.0,13.433,2022,512,28,True,False,False,False,False


In [7]:
# Map alternative column spellings onto the names used by the training frame
# (D, E, ND, Nation). Keys that are absent from `olymp` are ignored by rename.
column_aliases = {
    'D Score': 'D',
    'E Score': 'E',
    'Pen.': 'ND',
    'nation': 'Nation',
}
olymp = olymp.rename(columns=column_aliases)

In [9]:
# Drop the round indicators that the training frame does not have.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
olymp = olymp.drop(
    columns=['round_TeamFinal', 'round_AAfinal', 'round_TeamQual'],
    errors='ignore',
)

In [10]:
# Refit AdaBoost on its own with a 12 % hold-out split and report its metrics.
y = df['medal']
X = df.drop(columns=['medal'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.12, random_state=42)

# Initialize the AdaBoost classifier with a fixed seed so the fitted model, and
# every prediction made from it later in the notebook, is reproducible.
# NOTE(review): an earlier note recorded learning_rate=0.31 as yielding an
# accuracy of 1.000 while the code uses 0.41 -- confirm the intended value.
# The rate appears to have been chosen by watching hold-out accuracy, in which
# case the score below is an optimistic estimate.
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.41, random_state=42)

# Fit the model to the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results
print("AdaBoost Classifier")
print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{conf_matrix}")

AdaBoost Classifier
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00         4

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46

Confusion Matrix:
[[42  0]
 [ 0  4]]


In [11]:
# Predict medal / no-medal for every row of the Olympic-candidate frame.
# Selecting the training columns explicitly keeps this cell re-runnable after
# the `ypred` column has been added to `olymp`, and guarantees the features
# are passed in the order the model was fitted with.
y_pred = clf.predict(olymp[X_train.columns])


In [12]:
# Store the predictions next to the rows they belong to. The Series is built
# on olymp's own index so the assignment is positional even if the frame's
# index is not a default 0..n-1 range (index alignment would otherwise
# introduce NaNs).
ypreds = pd.Series(y_pred, index=olymp.index)
olymp['ypred'] = ypreds


In [13]:
# Count how many rows were predicted for each class.
prediction_counts = olymp['ypred'].value_counts()
prediction_counts

ypred
0    2338
1       6
Name: count, dtype: int64

In [14]:
# Keep only the rows whose predicted class is 1.
is_predicted_medal = olymp['ypred'].eq(1)
filtered_df = olymp.loc[is_predicted_medal]

In [15]:
# Distinct encoded athlete ids among the positively predicted rows.
filtered_df.loc[:, 'Name'].unique()

array([731, 276, 643, 671])

In [16]:
# Encoded athlete ids with a positive prediction, read from `filtered_df`
# rather than copied by hand from an earlier output, so the lookup stays
# correct whenever the model or the data changes.
name_encoded_values = filtered_df['Name'].unique().tolist()

# Filter the DataFrame to get the corresponding "Name" values
result = hbnames.loc[hbnames['Name_encoded'].isin(name_encoded_values), 'Name']


In [18]:
# Distinct athlete names returned by the lookup.
pd.unique(result)

array(['yumin abbadini', 'ilias georgiou', 'shohei kawakami', 'tin srbic'],
      dtype=object)

In [19]:
# Lower-case the athlete names in the results table before matching on them.
hbolymp = hbolymp.assign(Name=lambda frame: frame["Name"].str.lower())

In [20]:
# Athlete names with a positive prediction, decoded through the lookup table
# instead of being pasted from an earlier output, so the list follows the
# model's current predictions.
predicted_ids = filtered_df['Name'].unique()
names_to_filter = (
    hbnames.loc[hbnames['Name_encoded'].isin(predicted_ids), 'Name']
    .unique()
    .tolist()
)

# Filter the DataFrame to get rows with the specified "Name" values.
# Note that `result` is rebound here from a Series of names to a DataFrame.
result = hbolymp[hbolymp['Name'].isin(names_to_filter)]

In [21]:
# Number of result rows per country among the selected athletes.
country_counts = result['Country'].value_counts()
country_counts

Country
ITA    55
CYP    32
JPN    24
CRO    20
Name: count, dtype: int64