# [120 Years of Olympics](https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results)

In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
athelete_events_loc = "./120-years-of-olympic-history-athletes-and-results/athlete_events.csv"
athelete_medals_loc = "./120-years-of-olympic-history-athletes-and-results/athlele_medals.csv"
regions_loc = "./120-years-of-olympic-history-athletes-and-results/noc_regions.csv"

The file athlete_events.csv contains 271116 rows and 15 columns. Each row corresponds to an individual athlete competing in an individual Olympic event (athlete-events).

The file noc_regions.csv contains key value pairs for each National Olympic Committee corresponding to the NOC column in athlete_events.csv. NOCs are responsible for organizing their people's participation in the Olympic Games. They may nominate cities within their respective areas as candidates for future Olympic Games. NOCs also promote the development of athletes and training of coaches and officials at a national level within their geographies.

In [3]:
athevent_df = pd.read_csv(athelete_events_loc)
regions_df = pd.read_csv(regions_loc)

ID - Unique number for each athlete

Name - Athlete's name

Sex - M or F

Age - Integer

Height - In centimeters

Weight - In kilograms

Team - Team name

NOC - National Olympic Committee 3-letter code

Games - Year and season

Year - Integer

Season - Summer or Winter

City - Host city

Sport - Sport

Event - Event

Medal - Gold, Silver, Bronze, or NA

In [4]:
athevent_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [5]:
athevent_df.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


In [6]:
# top ten countries with medals
athevent_df["Team"].value_counts()[:10]

United States    17847
France           11988
Great Britain    11404
Italy            10260
Germany           9326
Canada            9279
Japan             8289
Sweden            8052
Australia         7513
Hungary           6547
Name: Team, dtype: int64

In [7]:
# top ten National Olympic Committees with medals
athevent_df["NOC"].value_counts()[:10]

USA    18853
FRA    12758
GBR    12256
ITA    10715
GER     9830
CAN     9733
JPN     8444
SWE     8339
AUS     7638
HUN     6607
Name: NOC, dtype: int64

In [8]:
# the number of teams to have ever competed in the olympics
athevent_df["Team"].nunique()

1184

In [9]:
# number of sports played ever
print "The number of sports that have been recognized by the Olympics: " + str(athevent_df["Sport"].nunique())

The number of sports that have been recognized by the Olympics: 66


In [10]:
# number of events played ever
print "The number of unique events at the Olympics: " + str(athevent_df["Event"].nunique())

The number of unique events at the Olympics: 765


In [None]:
sport_events = defaultdict(set)
for index, row in athevent_df.iterrows():
    sport_events[row['Sport']].add(row['Event'])

In [33]:
sport_events

defaultdict(set,
            {'Aeronautics': {'Aeronautics Mixed Aeronautics'},
             'Alpine Skiing': {"Alpine Skiing Men's Combined",
              "Alpine Skiing Men's Downhill",
              "Alpine Skiing Men's Giant Slalom",
              "Alpine Skiing Men's Slalom",
              "Alpine Skiing Men's Super G",
              "Alpine Skiing Women's Combined",
              "Alpine Skiing Women's Downhill",
              "Alpine Skiing Women's Giant Slalom",
              "Alpine Skiing Women's Slalom",
              "Alpine Skiing Women's Super G"},
             'Alpinism': {'Alpinism Mixed Alpinism'},
             'Archery': {"Archery Men's Au Chapelet, 33 metres",
              "Archery Men's Au Chapelet, 50 metres",
              "Archery Men's Au Cordon Dore, 33 metres",
              "Archery Men's Au Cordon Dore, 50 metres",
              "Archery Men's Championnat Du Monde",
              "Archery Men's Continental Style",
              "Archery Men's Double Americ

In [14]:
# number of different cities that have hosted the olympic
print "The number of cities that have hosted the Olympics: " + str(athevent_df["City"].nunique())

The number of cities that have hosted the Olympics: 42


In [17]:
# retain only medal win entries
athevent_df.dropna(subset=["Medal"]).to_csv(athelete_medals_loc)

In [36]:
winners = athevent_df.dropna(subset=["Medal"])
winners[winners["Name"] == "Heikki Ilmari Savolainen"].sort_values("Year")

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
211674,106296,Heikki Ilmari Savolainen,M,20.0,172.0,64.0,Finland,FIN,1928 Summer,1928,Summer,Amsterdam,Gymnastics,Gymnastics Men's Pommelled Horse,Bronze
211675,106296,Heikki Ilmari Savolainen,M,24.0,172.0,64.0,Finland,FIN,1932 Summer,1932,Summer,Los Angeles,Gymnastics,Gymnastics Men's Individual All-Around,Bronze
211676,106296,Heikki Ilmari Savolainen,M,24.0,172.0,64.0,Finland,FIN,1932 Summer,1932,Summer,Los Angeles,Gymnastics,Gymnastics Men's Team All-Around,Bronze
211679,106296,Heikki Ilmari Savolainen,M,24.0,172.0,64.0,Finland,FIN,1932 Summer,1932,Summer,Los Angeles,Gymnastics,Gymnastics Men's Parallel Bars,Bronze
211680,106296,Heikki Ilmari Savolainen,M,24.0,172.0,64.0,Finland,FIN,1932 Summer,1932,Summer,Los Angeles,Gymnastics,Gymnastics Men's Horizontal Bar,Silver
211684,106296,Heikki Ilmari Savolainen,M,28.0,172.0,64.0,Finland,FIN,1936 Summer,1936,Summer,Berlin,Gymnastics,Gymnastics Men's Team All-Around,Bronze
211692,106296,Heikki Ilmari Savolainen,M,40.0,172.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Team All-Around,Gold
211698,106296,Heikki Ilmari Savolainen,M,40.0,172.0,64.0,Finland,FIN,1948 Summer,1948,Summer,London,Gymnastics,Gymnastics Men's Pommelled Horse,Gold
211700,106296,Heikki Ilmari Savolainen,M,44.0,172.0,64.0,Finland,FIN,1952 Summer,1952,Summer,Helsinki,Gymnastics,Gymnastics Men's Team All-Around,Bronze


In [29]:
gold_winners = (list(athevent_df[athevent_df["Medal"] == "Gold"]["Name"]))
gwinners_df = athevent_df[athevent_df["Name"].isin(gold_winners)]

In [31]:
gwinners_df.sort_values("Name")

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
4015,2316,A. Albert,M,,,,Union des Socits Franais de Sports Athletiques,FRA,1900 Summer,1900,Summer,Paris,Rugby,Rugby Men's Rugby,Gold
6587,3684,Aage Jrgen Christian Andersen,M,22.0,,,Denmark,DEN,1906 Summer,1906,Summer,Athina,Football,Football Men's Football,Gold
72184,36724,Aage Valdemar Harald Frandsen,M,29.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Gymnastics,"Gymnastics Men's Team All-Around, Free System",Gold
122784,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,17.0,183.0,85.0,Netherlands,NED,1964 Summer,1964,Summer,Tokyo,Swimming,Swimming Women's 100 metres Butterfly,Silver
122785,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,17.0,183.0,85.0,Netherlands,NED,1964 Summer,1964,Summer,Tokyo,Swimming,Swimming Women's 4 x 100 metres Medley Relay,Silver
122786,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 100 metres Butterfly,
122787,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 200 metres Butterfly,Gold
122788,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 4 x 100 metres Medley Relay,
245549,122961,"Aale Maria Tynni (-Pirinen, -Haavio)",F,34.0,,,Finland,FIN,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Literature, Lyric Works",Gold
170518,85699,Aaron Nguimbat,M,22.0,,,Cameroon,CMR,2000 Summer,2000,Summer,Sydney,Football,Football Men's Football,Gold


In [33]:
gwinners_df["Medal"].value_counts()

Gold      13372
Silver     3012
Bronze     2329
Name: Medal, dtype: int64

In [34]:
gwinners_df["Name"].value_counts()

Heikki Ilmari Savolainen                                   39
Takashi Ono                                                33
Alfrd (Arnold-) Hajs (Guttmann-)                           32
Andreas Wecker                                             32
Jean Lucien Nicolas Jacoby                                 32
Michael Fred Phelps, II                                    30
Karl Tore William Thoresson                                30
Oksana Aleksandrovna Chusovitina                           29
Ole Einar Bjrndalen                                        27
Yang Wei                                                   26
Gustaf Eric Carlberg                                       26
Aleksandr Vladimirovich Popov                              26
Fabian Hambchen                                            26
Lars Jrgen Madsen                                          26
Gabriella Paruzzi                                          25
Georg "Georges" Miez                                       25
Klaus Ks

# SVM Training

### Different Models
- general shitty one
- different SVMs for different events
- summer vs winter


properties: 
- competed before as a boolean versus number of times they've competed before

#### General First Take
properties:
    [
     athlete has competed before,
     which NCO,                       # one hot encoded,
    ]
    

#### SVM per Event Type
properties:
    [
     height,
     weight,
     age,
     NCO,
     competed before,
    ]
     