## Resources:
- https://thegymter.net/nation-database/
- https://www.kaggle.com/code/jlove5/olympic-data-women-s-gymnastics/notebook
- https://themedalcount.com/data-hub/

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


import random
from itertools import combinations

In [2]:
df = pd.read_csv('data_2017_2021.csv')

### Feature Engineering & Clean Data:

    - drop duplicates
    - combine first and last names and drop originals
    - fill missing scores
    - create total score (combining all scores)
    - create gold medal
    

In [3]:
df.drop_duplicates(inplace=True)
df['Name'] = df['FirstName'] + " " + df['LastName']
df = df.drop(columns=["LastName", "FirstName"])
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
fill_columns = ['Rank', 'D_Score', 'E_Score', 'Score']
df[fill_columns] = df[fill_columns].fillna(0)
df['Total_Score'] = df['D_Score'] + df['E_Score'] - df['Penalty']
gold_medal_count = df[df['Rank'] == 1].groupby(['Name']).size().reset_index(name='Gold_Medals')
df = df.merge(gold_medal_count, on=['Name'], how='left')
df['Gold_Medals'] = df['Gold_Medals'].fillna(0)

### Define team composition rules


In [4]:
team_size = 5
max_athletes_per_country = 3

### Select athletes from the USA


In [5]:
usa_athletes = df[df['Country'] == 'USA']


### Dynamic programming approach (Knapsack Problem)


In [6]:
def knapsack(items, limit):
    n = len(items)
    dp = np.zeros((n + 1, limit + 1))

    for i in range(1, n + 1):
        for w in range(1, limit + 1):
            name, score = items[i - 1]
            if score <= w:
                dp[i][w] = max(dp[i - 1][w], dp[i - 1][w - score] + score)
            else:
                dp[i][w] = dp[i - 1][w]

    return dp[n][limit]

### Create a list of athlete scores as tuples (name, score, country)


In [7]:
athlete_scores = [(name, score, country) for name, score, country in usa_athletes[['Name', 'Total_Score', 'Country']].values]


### Define optimization criteria


In [8]:
optimization_criteria = ['Total_Score', 'Gold_Medals']

### Create a function to select the best team based on criteria


In [9]:
def select_best_team(df, criteria, num_athletes):
    if criteria == 'Total_Score':
        best_team = df.nlargest(num_athletes, 'Total_Score')
    elif criteria == 'Gold_Medals':
        best_team = df.nlargest(num_athletes, 'Gold_Medals')
    return best_team

### Define the number of events


In [10]:
num_events = 4

### Store results for different team compositions and criteria


In [11]:
results = []

### Iterate through optimization criteria


In [12]:
for criteria in optimization_criteria:
    
    # Select athletes based on criteria and composition rules
    selected_team = select_best_team(df, criteria, num_athletes=team_size)
    
    # Simulate medal count for each event
    event_medal_count = {event: {'Gold': 0, 'Silver': 0,
                                 'Bronze': 0} for event in range(1, num_events + 1)}
    for event in range(1, num_events + 1):
        event_scores = {}
        for gymnast, _, _ in selected_team[['Name', 'Total_Score', 
                                            'Country']].values:
            
            # Simulate a random score between 13 and 16
            score = random.uniform(13, 16)
            event_scores[gymnast] = score
        
        sorted_gymnasts = sorted(event_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Distribute medals
        for i, (gymnast, score) in enumerate(sorted_gymnasts):
            if i == 0:
                medal = 'Gold'
            elif i == 1:
                medal = 'Silver'
            elif i == 2:
                medal = 'Bronze'
            else:
                break
            
            event_medal_count[event][medal] += 1
    
    # Store the results for this criteria and team composition
    results.append({'Criteria': criteria, 'Team': selected_team,
                    'Medal_Count': event_medal_count})


### Analyze and print the results


In [13]:

for result in results:
    print(f"Team Composition based on {result['Criteria']}:\n")
    print(result['Team'][['Name', 'Country', 'Total_Score', 'Gold_Medals']])
    print("\nMedal Count in Different Events:")
    for event, medals in result['Medal_Count'].items():
        print(f"Event {event}: Gold - {medals['Gold']}, Silver - {medals['Silver']}, Bronze - {medals['Bronze']}")
    print("\n")


Team Composition based on Total_Score:

              Name Country  Total_Score  Gold_Medals
36  Rebeca ANDRADE     BRA       15.300          4.0
39  Rebeca ANDRADE     BRA       15.166          4.0
62    Simone BILES     USA       15.100          2.0
60    Simone BILES     USA       14.966          2.0
61    Simone BILES     USA       14.966          2.0

Medal Count in Different Events:
Event 1: Gold - 1, Silver - 1, Bronze - 0
Event 2: Gold - 1, Silver - 1, Bronze - 0
Event 3: Gold - 1, Silver - 1, Bronze - 0
Event 4: Gold - 1, Silver - 1, Bronze - 0


Team Composition based on Gold_Medals:

              Name Country  Total_Score  Gold_Medals
26  Rebeca ANDRADE     BRA          NaN          4.0
27  Rebeca ANDRADE     BRA          NaN          4.0
28  Rebeca ANDRADE     BRA          NaN          4.0
29  Rebeca ANDRADE     BRA       13.666          4.0
30  Rebeca ANDRADE     BRA          NaN          4.0

Medal Count in Different Events:
Event 1: Gold - 1, Silver - 0, Bronze - 0
Even

In [14]:

    
# Create a list of dictionaries to store the results
final_results = []
for result in results:
    criteria = result['Criteria']
    for event, medals in result['Medal_Count'].items():
        for medal, count in medals.items():
            final_results.append({'Criteria': criteria, 'Event': event, 'Medal': medal, 'Count': count})

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(final_results)

# Display the results DataFrame
print(results_df)


       Criteria  Event   Medal  Count
0   Total_Score      1    Gold      1
1   Total_Score      1  Silver      1
2   Total_Score      1  Bronze      0
3   Total_Score      2    Gold      1
4   Total_Score      2  Silver      1
5   Total_Score      2  Bronze      0
6   Total_Score      3    Gold      1
7   Total_Score      3  Silver      1
8   Total_Score      3  Bronze      0
9   Total_Score      4    Gold      1
10  Total_Score      4  Silver      1
11  Total_Score      4  Bronze      0
12  Gold_Medals      1    Gold      1
13  Gold_Medals      1  Silver      0
14  Gold_Medals      1  Bronze      0
15  Gold_Medals      2    Gold      1
16  Gold_Medals      2  Silver      0
17  Gold_Medals      2  Bronze      0
18  Gold_Medals      3    Gold      1
19  Gold_Medals      3  Silver      0
20  Gold_Medals      3  Bronze      0
21  Gold_Medals      4    Gold      1
22  Gold_Medals      4  Silver      0
23  Gold_Medals      4  Bronze      0


In [15]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Criteria  24 non-null     object
 1   Event     24 non-null     int64 
 2   Medal     24 non-null     object
 3   Count     24 non-null     int64 
dtypes: int64(2), object(2)
memory usage: 896.0+ bytes


In [16]:
results_df.corr()

Unnamed: 0,Event,Count
Event,1.0,-1.075765e-16
Count,-1.075765e-16,1.0
