# [120 Years of Olympics](https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results)

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [2]:
athelete_events_loc = "./120-years-of-olympic-history-athletes-and-results/athlete_events.csv"
athelete_medals_loc = "./120-years-of-olympic-history-athletes-and-results/athlele_medals.csv"
regions_loc = "./120-years-of-olympic-history-athletes-and-results/noc_regions.csv"
ww2endyear = 1984
cent21st = 2000

medals = ["Gold", "Silver", "Bronze"]

The file athlete_events.csv contains 271116 rows and 15 columns. Each row corresponds to an individual athlete competing in an individual Olympic event (athlete-events).

The file noc_regions.csv contains key value pairs for each National Olympic Committee corresponding to the NOC column in athlete_events.csv. NOCs are responsible for organizing their people's participation in the Olympic Games. They may nominate cities within their respective areas as candidates for future Olympic Games. NOCs also promote the development of athletes and training of coaches and officials at a national level within their geographies.

In [3]:
athevent_df = pd.read_csv(athelete_events_loc)
regions_df = pd.read_csv(regions_loc)

ID - Unique number for each athlete

Name - Athlete's name

Sex - M or F

Age - Integer

Height - In centimeters

Weight - In kilograms

Team - Team name

NOC - National Olympic Committee 3-letter code

Games - Year and season

Year - Integer

Season - Summer or Winter

City - Host city

Sport - Sport

Event - Event

Medal - Gold, Silver, Bronze, or NA

In [4]:
athevent_df.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,


In [5]:
athevent_df.describe()

Unnamed: 0,ID,Age,Height,Weight,Year
count,271116.0,261642.0,210945.0,208241.0,271116.0
mean,68248.954396,25.556898,175.33897,70.702393,1978.37848
std,39022.286345,6.393561,10.518462,14.34802,29.877632
min,1.0,10.0,127.0,25.0,1896.0
25%,34643.0,21.0,168.0,60.0,1960.0
50%,68205.0,24.0,175.0,70.0,1988.0
75%,102097.25,28.0,183.0,79.0,2002.0
max,135571.0,97.0,226.0,214.0,2016.0


In [4]:
cityToCountry = { 
    "Athens":"Greece",
    "Paris":"France",
    "St Louis":"United States",
    "London":"Great Britain",
    "Stockholm":"Sweden",
    "Berlin":"Germany",
    "Antwerp":"Belgium",
    "Chamonix":"France",
    "Paris":"France",
    "St Moritz":"Switzerland",
    "Amsterdam":"Netherlands",
    "Lake Placid":"United States",
    "Los Angeles":"United States",
    "Garmisch Partenkirchen":"Germany",
    "Sapporo":"Japan",
    "Garmisch Partenkirchen":"Germany",
    "Tokyo":"Japan",
    "Helsinki":"Finland",
    "Cortina Ampezzo":"Italy",
    "London":"Great Britain",
    "St Moritz":"Switzerland",
    "London":"Great Britain",
    "Oslo":"Norway",
    "Helsinki":"Finland",
    "Cortina Ampezzo":"Italy",
    "Melbourne":"Australia",
    "Stockholm":"Sweden",
    "Squaw Valley":"United States",
    "Rome":"Italy",
    "Innsbruck":"Austria",
    "Tokyo":"Japan",
    "Grenoble":"France",
    "Mexico City":"Mexico",
    "Sapporo":"Japan",
    "Munich":"West Germany",
    "Innsbruck":"Austria",
    "Montreal":"Canada",
    "Lake Placid":"United States",
    "Moscow":"Soviet Union",
    "Sarajevo":"Yugoslavia",
    "Los Angeles":"United States",
    "Calgary":"Canada",
    "Seoul":"South Korea",
    "Albertville":"France",
    "Barcelona":"Spain",
    "Lillehammer":"Norway",
    "Atlanta":"United States",
    "Nagano":"Japan",
    "Sydney":"Australia",
    "Salt Lake City":"United States",
    "Athens":"Greece",
    "Turin":"Italy",
    "Beijing":"China",
    "Vancouver":"Canada",
    "Singapore":"Singapore",
    "Innsbruck":"Austria",
    "London":"Great Britain",
    "Sochi":"Russia",
    "Nanjing":"China",
    "Lillehammer":"Norway",
    "Rio":"Brazil",
    "Athina":"Greece",
    "St. Louis":"United States",
    "Antwerpen":"Belgium",
    "Sankt Moritz":"Switzerland",
    "Garmisch-Partenkirchen":"Germany",
    "Cortina d'Ampezzo":"Italy",
    "Roma":"Italy",
    "Moskva":"Russia",
    "Torino":"Italy",
    "Rio de Janeiro":"Brazil",
}

In [5]:
cities = cityToCountry.keys()
for city in athevent_df["City"].unique():
    if city not in cities:
        print city

In [6]:
countries = athevent_df["Team"].unique()
for city, country in cityToCountry.items():
    if country not in countries:
        print city, country

In [565]:
print len(athevent_df["City"].unique())
athevent_df["City"].unique()

42


array(['Athina', 'Paris', 'St. Louis', 'London', 'Stockholm', 'Antwerpen',
       'Chamonix', 'Amsterdam', 'Sankt Moritz', 'Los Angeles',
       'Lake Placid', 'Berlin', 'Garmisch-Partenkirchen', 'Helsinki',
       'Oslo', 'Melbourne', "Cortina d'Ampezzo", 'Roma', 'Squaw Valley',
       'Tokyo', 'Innsbruck', 'Grenoble', 'Mexico City', 'Munich',
       'Sapporo', 'Montreal', 'Moskva', 'Sarajevo', 'Seoul', 'Calgary',
       'Barcelona', 'Albertville', 'Lillehammer', 'Atlanta', 'Nagano',
       'Sydney', 'Salt Lake City', 'Torino', 'Beijing', 'Vancouver',
       'Sochi', 'Rio de Janeiro'], dtype=object)

In [7]:
yearsGuessed = []
for year in athevent_df["Year"].unique():
    for index, item in athevent_df[athevent_df["Year"] == year].iterrows():
        if item["Team"] == cityToCountry[item["City"]]:
            yearsGuessed.append(year)
            print year, item["City"], item["Team"], cityToCountry[item["City"]]
            break

1992 Barcelona Spain Spain
2012 London Great Britain Great Britain
1920 Antwerpen Belgium Belgium
1900 Paris France France
1988 Seoul South Korea South Korea
1994 Lillehammer Norway Norway
1932 Los Angeles United States United States
2002 Salt Lake City United States United States
1952 Helsinki Finland Finland
1980 Lake Placid United States United States
2000 Sydney Australia Australia
1996 Atlanta United States United States
1912 Stockholm Sweden Sweden
1924 Paris France France
2014 Sochi Russia Russia
1948 London Great Britain Great Britain
1998 Nagano Japan Japan
2006 Torino Italy Italy
2008 Beijing China China
2016 Rio de Janeiro Brazil Brazil
2004 Athina Greece Greece
1960 Squaw Valley United States United States
1964 Tokyo Japan Japan
1984 Los Angeles United States United States
1968 Mexico City Mexico Mexico
1972 Munich West Germany West Germany
1936 Berlin Germany Germany
1956 Melbourne Australia Australia
1928 Amsterdam Netherlands Netherlands
1976 Montreal Canada Canada
2010 

In [8]:
for year in athevent_df["Year"].unique():
    if year not in yearsGuessed:
        print year

In [9]:
# top ten countries with medals
athevent_df["Team"].value_counts()

United States         17847
France                11988
Great Britain         11404
Italy                 10260
Germany                9326
Canada                 9279
Japan                  8289
Sweden                 8052
Australia              7513
Hungary                6547
Poland                 6143
Switzerland            5844
Netherlands            5718
Soviet Union           5535
Finland                5379
Spain                  5224
China                  4975
Russia                 4922
Austria                4866
Norway                 4708
Czechoslovakia         4352
South Korea            4344
Romania                4303
Brazil                 3772
Belgium                3687
Bulgaria               3518
Denmark                3424
West Germany           3199
Argentina              3199
Greece                 2976
                      ...  
Nadine                    1
Crabe I-2                 1
Crabe I-3                 1
Carabinier-15             1
Mainz               

In [10]:
# top ten National Olympic Committees with medals
athevent_df["NOC"].value_counts()[:10]

USA    18853
FRA    12758
GBR    12256
ITA    10715
GER     9830
CAN     9733
JPN     8444
SWE     8339
AUS     7638
HUN     6607
Name: NOC, dtype: int64

In [11]:
# the number of teams to have ever competed in the olympics
athevent_df["Team"].nunique()

1184

In [12]:
# number of sports played ever
print "The number of sports that have been recognized by the Olympics: " + str(athevent_df["Sport"].nunique())

The number of sports that have been recognized by the Olympics: 66


In [13]:
# number of events played ever
print "The number of unique events at the Olympics: " + str(athevent_df["Event"].nunique())

The number of unique events at the Olympics: 765


In [14]:
gwinners_df["Medal"].value_counts()

NameError: name 'gwinners_df' is not defined

In [15]:
gwinners_df["Name"].value_counts()

NameError: name 'gwinners_df' is not defined

In [16]:
# number of different cities that have hosted the olympic
print "The number of cities that have hosted the Olympics: " + str(athevent_df["City"].nunique())

The number of cities that have hosted the Olympics: 42


## Data Manipulation

In [17]:
sport_events = defaultdict(set)
for index, row in athevent_df.iterrows():
    sport_events[row['Sport']].add(row['Event'])

In [18]:
sport_events

defaultdict(set,
            {'Aeronautics': {'Aeronautics Mixed Aeronautics'},
             'Alpine Skiing': {"Alpine Skiing Men's Combined",
              "Alpine Skiing Men's Downhill",
              "Alpine Skiing Men's Giant Slalom",
              "Alpine Skiing Men's Slalom",
              "Alpine Skiing Men's Super G",
              "Alpine Skiing Women's Combined",
              "Alpine Skiing Women's Downhill",
              "Alpine Skiing Women's Giant Slalom",
              "Alpine Skiing Women's Slalom",
              "Alpine Skiing Women's Super G"},
             'Alpinism': {'Alpinism Mixed Alpinism'},
             'Archery': {"Archery Men's Au Chapelet, 33 metres",
              "Archery Men's Au Chapelet, 50 metres",
              "Archery Men's Au Cordon Dore, 33 metres",
              "Archery Men's Au Cordon Dore, 50 metres",
              "Archery Men's Championnat Du Monde",
              "Archery Men's Continental Style",
              "Archery Men's Double Americ

In [19]:
# retain only medal win entries
athevent_df.dropna(subset=["Medal"]).to_csv(athelete_medals_loc)

In [20]:
winners = athevent_df.dropna(subset=["Medal"])
# winners[winners["Name"] == "Heikki Ilmari Savolainen"].sort_values("Year")

In [21]:
gold_winners = (list(athevent_df[athevent_df["Medal"] == "Gold"]["Name"]))
gwinners_df = athevent_df[athevent_df["Name"].isin(gold_winners)]

In [22]:
gwinners_df.sort_values("Name")

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
4015,2316,A. Albert,M,,,,Union des Socits Franais de Sports Athletiques,FRA,1900 Summer,1900,Summer,Paris,Rugby,Rugby Men's Rugby,Gold
6587,3684,Aage Jrgen Christian Andersen,M,22.0,,,Denmark,DEN,1906 Summer,1906,Summer,Athina,Football,Football Men's Football,Gold
72184,36724,Aage Valdemar Harald Frandsen,M,29.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Gymnastics,"Gymnastics Men's Team All-Around, Free System",Gold
122784,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,17.0,183.0,85.0,Netherlands,NED,1964 Summer,1964,Summer,Tokyo,Swimming,Swimming Women's 100 metres Butterfly,Silver
122785,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,17.0,183.0,85.0,Netherlands,NED,1964 Summer,1964,Summer,Tokyo,Swimming,Swimming Women's 4 x 100 metres Medley Relay,Silver
122786,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 100 metres Butterfly,
122787,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 200 metres Butterfly,Gold
122788,62062,"Aagje ""Ada"" Kok (-van der Linden)",F,21.0,183.0,85.0,Netherlands,NED,1968 Summer,1968,Summer,Mexico City,Swimming,Swimming Women's 4 x 100 metres Medley Relay,
245549,122961,"Aale Maria Tynni (-Pirinen, -Haavio)",F,34.0,,,Finland,FIN,1948 Summer,1948,Summer,London,Art Competitions,"Art Competitions Mixed Literature, Lyric Works",Gold
170518,85699,Aaron Nguimbat,M,22.0,,,Cameroon,CMR,2000 Summer,2000,Summer,Sydney,Football,Football Men's Football,Gold


In [23]:
# create list of each competing team
team_list = list(athevent_df["Team"].value_counts().index)
year_list = set(athevent_df["Year"].sort_values())
sport_list = list(athevent_df["Sport"].value_counts().index)

# SVM Training

### Different Models
- general shitty one
- different SVMs for different events
- summer vs winter


properties: 
- competed before as a boolean versus number of times they've competed before

#### General First Take
properties:
    [
     not their first olympics,
     which NCO,                       # one hot encoded,
    ]
    

#### SVM per Event Type
properties:
    [
     height,
     weight,
     age,
     NCO,
     competed before,
    ]
     

In [24]:
athevent_df = athevent_df.sort_values("Year")

In [25]:
win_col = [] # list to use for a new column to determine whether the ath won a medal in the event
sex_col = [] # list to use for a new column to determine sex; M = 0 F = 1
athfist_list = defaultdict(int) # dict of ath first olympics
first_col = []
prevwin_col = []
for index, row in athevent_df.iterrows():
    name = row["Name"]
    # determine if they won a medal
    if (row["Medal"] in medals):
        win_col.append(1)
    else:
        win_col.append(0)
    
    # determine the sex
    if (row["Sex"] == "F"):
        sex_col.append(1)
    else:
        sex_col.append(0)
    
    # determine if they have competed before
    if (athfist_list[name] == 0):
        first_col.append(0)
        athfist_list[name] = row["Year"]
    else:
        if(athfist_list[name] == row["Year"]):
            first_col.append(0)
        else:
            first_col.append(1) 
    

In [26]:
athevent_df["sexb"] = sex_col
athevent_df["repeat"] = first_col
athevent_df["win"] = win_col

In [27]:
len(athevent_df.index)

271116

## create training and validation sets

### training/validation sets for first run through

In [28]:
golds = defaultdict(int)
silvers = defaultdict(int)
bronzes = defaultdict(int)

medalCounts = [golds, silvers, bronzes]
for index, row in athevent_df.iterrows():
    name = row["Name"]
    unique_id = name + row["Sport"]
    
    medal = row["Medal"]
    
    for i in range(len(medals)):
        if medals[i] == medal:
            medalCounts[i][unique_id] += 1


In [29]:
def feat(datum):
    feat = []
    team = [0] * len(team_list)
    team[team_list.index(datum["Team"])] = 1
    
    unique_id = datum["Name"] + datum["Sport"]
    home_adv  = 1 if datum["Team"] == cityToCountry[datum["City"]] else 0
    
    #feat.append(datum["repeat"])
    #feat.append(datum["Height"])
    #feat.append(datum["Weight"])
    #feat.append(datum["Age"])
    feat.extend(team)
    feat.append(medalCounts[0][unique_id])
    feat.append(medalCounts[1][unique_id])
    feat.append(medalCounts[2][unique_id])
    feat.append(home_adv)
    feat.append(1)
    return feat

In [30]:
train_df     = athevent_df[(athevent_df["Year"] > ww2endyear) & (athevent_df["Year"] < cent21st)]
valid_df     = athevent_df[(athevent_df["Year"] >= cent21st)]
valid2000_df = athevent_df[(athevent_df["Year"] == cent21st)]

In [31]:
train_df = train_df.dropna(subset=["Height", "Weight", "Age"])
valid_df = valid_df.dropna(subset=["Height", "Weight", "Age"])
valid2000_df = valid2000_df.dropna(subset=["Height", "Weight", "Age"])

### training/validation sets for prediction 2016 without knowledge of 2016

In [32]:
def medalsUpToYear(year):
    goldsyear = defaultdict(int)
    silversyear = defaultdict(int)
    bronzesyear = defaultdict(int)
    
    medalCountsYear = [goldsyear, silversyear, bronzesyear]
    for index, row in athevent_df[athevent_df["Year"] < year].iterrows():
        name = row["Name"]
        unique_id = name + row["Sport"]
        
        medal = row["Medal"]
        
        for i in range(len(medals)):
            if medals[i] == medal:
                medalCountsYear[i][unique_id] += 1
    return medalCountsYear


In [33]:
def medalsUpToYearGivenPrevious(previousCounts, previousYear):
    medalCountsYear = previousCounts
    for index, row in athevent_df[athevent_df["Year"] == previousYear].iterrows():
        name = row["Name"]
        unique_id = name + row["Sport"]
        
        medal = row["Medal"]
        
        for i in range(len(medals)):
            if medals[i] == medal:
                medalCountsYear[i][unique_id] += 1
    return medalCountsYear


In [34]:
def featyear(datum, medalCountsYear):
    feat = []
    team = [0] * len(team_list)
    team[team_list.index(datum["Team"])] = 1
    
    unique_id = datum["Name"] + datum["Sport"]
    home_adv  = 1 if datum["Team"] == cityToCountry[datum["City"]] else 0
    
    feat.append(datum["repeat"])
    feat.append(datum["Height"])
    feat.append(datum["Weight"])
    feat.append(datum["Age"])
    feat.extend(team)
    feat.append(medalCountsYear[0][unique_id])
    feat.append(medalCountsYear[1][unique_id])
    feat.append(medalCountsYear[2][unique_id])
    feat.append(home_adv)
    feat.append(1)
    return feat

In [35]:
medalCounts2016 = medalsUpToYear(2016)

In [36]:
# testing that medal counts do actually change
print sorted(medalCounts[0].items(), key=lambda x: x[1])[::-1][0]
print sorted(medalCounts2016[0].items(), key=lambda x: x[1])[::-1][0]

('Michael Fred Phelps, IISwimming', 23)
('Michael Fred Phelps, IISwimming', 18)


In [37]:
train2016_df     = athevent_df[(athevent_df["Year"] > ww2endyear) & (athevent_df["Year"] < 2016)]
valid2016_df     = athevent_df[(athevent_df["Year"] >= 2016)]

In [38]:
train2016_df = train2016_df.dropna(subset=["Height", "Weight", "Age"])
valid2016_df = valid2016_df.dropna(subset=["Height", "Weight", "Age"])

#### shitty cells im too lazy to delete

In [39]:
len(train_df.index) 

45072

In [40]:
train_df["Year"].unique()

array([1988, 1992, 1994, 1996, 1998])

In [41]:
len(train_df.index) 

45072

### functions for pretty printing

In [42]:
def printAccuracy(year, sport, event, l):
    [ accur, base_accur, cf_mtrx, p_cision ] = l
    print sport, event
    print year, "Accuracy: ", accur
    print "     Baseline: ", base_accur
    print "    Precision: ", p_cision
    print "           TN: ", cf_mtrx[0][0], '\t', "FP: ", cf_mtrx[0][1]
    print "           FN: ", cf_mtrx[1][0], '\t', "TP: ", cf_mtrx[1][1]
    print

In [88]:
def sportAccuracy(sport, accuracies):
    for year in sorted(accuracies.keys()):
        if not accuracies[year][sport].keys():
            continue
        accuracy_year = 0.0
        baseline_year = 0.0
        cf_mtrx = [[0,0],[0,0]]
        precision = 0.0
        for event in accuracies[year][sport].keys():
            if not len(accuracies[year][sport][event]): continue
            accuracy_year += accuracies[year][sport][event][0]
            baseline_year += accuracies[year][sport][event][1]
            cf_mtrx       += accuracies[year][sport][event][2]
            precision     += accuracies[year][sport][event][3]
        print year, "Accuracy: ", accuracy_year / len(accuracies[year][sport].keys())
        print "     Baseline: ", baseline_year / len(accuracies[year][sport].keys())
        print "    Precision: ", precision / len(accuracies[year][sport].keys())
        print "           TN: ", cf_mtrx[0][0], '\t', "FP: ", cf_mtrx[0][1]
        print "           FN: ", cf_mtrx[1][0], '\t', "TP: ", cf_mtrx[1][1]
        print

### train SVM with data from WW2 to 1996

In [44]:
sport_svm = defaultdict(int) 
for sport in train_df["Sport"].unique():
    temptrain_df = train_df[train_df["Sport"] == sport]
    train_X = [feat(row) for index, row in temptrain_df.iterrows()]
    train_y = temptrain_df["win"]
    clf = LinearSVC(C=.1)
    cur_clf = clf.fit(train_X, train_y)
    sport_svm[sport] = cur_clf

### train SVM with data from WW2 to 2012

In [45]:
sport_svm2016 = defaultdict(int) 
for sport in train2016_df["Sport"].unique():
    temptrain2016_df = train2016_df[train2016_df["Sport"] == sport]
    train2016_X = [featyear(row, medalCounts2016) for index, row in temptrain2016_df.iterrows()]
    train2016_y = temptrain2016_df["win"]
    clf = LinearSVC(C=.1)
    cur_clf = clf.fit(train2016_X, train2016_y)
    sport_svm2016[sport] = cur_clf



### predict for years 2000 to 2016

In [46]:
sports_inter = list(train_df["Sport"].unique()) or list(valid_df["Sport"].unique())
accuracies = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
sport_conf =  defaultdict(int)  
avg_accur_sum, avg_baseacc_sum = 0, 0

better, better_sports, worse_sports, better_events, worse_events = 0.0, set(), set(), set(), set()
num_events = 0

for year in valid_df["Year"].unique():
    print year,
    temp_year_df = valid_df[valid_df["Year"] == year]
    for sport in sports_inter:
        temp_sports_year_df = temp_year_df[temp_year_df["Sport"] == sport]
        for event in temp_sports_year_df["Event"].unique():
            num_events += 1
            
            cur_clf      = sport_svm[sport]
            tempvalid_df = temp_sports_year_df[temp_sports_year_df["Event"] == event]
            
            valid_X = [feat(row) for index, row in tempvalid_df.iterrows()]
            
            if (len(valid_X) == 0):
                continue
            valid_y = tempvalid_df["win"]

            decisions = cur_clf.decision_function(valid_X)
            highest_confidence = [ x[1] for x in sorted(zip(decisions, range(len(decisions))))[::-1][:sum(valid_y)]]
            predictions = [ 1 if i in highest_confidence else 0 for i, x in enumerate(decisions) ]
            
            accur = accuracy_score(valid_y, predictions)
            
            base_y = [0] * len(valid_y)
            base_accur = accuracy_score(valid_y, base_y)
            
            avg_accur_sum += accur
            avg_baseacc_sum += base_accur

            if (accur > base_accur): 
                better+=1
                better_sports.add(sport)
                better_events.add(event)
            else:
                worse_sports.add(sport)
                worse_events.add(event)
                
            accuracies[year][sport][event] = [accur, base_accur, confusion_matrix(valid_y, predictions), precision_score(valid_y, predictions)]
print "done"

2000 2002 2004 2006 2008 2010 2012 2014 2016 done


In [47]:
avg_accur = avg_accur_sum / num_events 
avg_baseacc = avg_baseacc_sum / num_events
print "Overall  Accuracy of SVMs:", avg_accur
print "Baseline Accuracy of SVMs:", avg_baseacc
print "SVMs  better  than BLines:", better, "out of", num_events, "=", better / num_events
print
print "Sports with all events better than baseline:\n", list(better_sports - worse_sports)
print "Sports with all events worse than baseline:\n", list(worse_sports - better_sports) if worse_sports - better_sports else "None"

Overall  Accuracy of SVMs: 0.9030661347337419
Baseline Accuracy of SVMs: 0.8560552313822649
SVMs  better  than BLines: 1164.0 out of 1785 = 0.652100840336

Sports with all events better than baseline:
['Ice Hockey', 'Football', 'Baseball', 'Hockey', 'Softball']
Sports with all events worse than baseline:
None


In [468]:
# sportAccuracy("Gymnastics", accuracies)

In [469]:
# printAccuracy(2016, "Basketball", "Basketball Men's Basketball", accuracies[2016]["Basketball"]["Basketball Men's Basketball"])

Overall  Accuracy of SVMs: 0.9056740354896968
Baseline Accuracy of SVMs: 0.8560552313822636
SVMs  better  than BLines: 1173.0 out of 1785 = 0.657142857143

Sports with all events better than baseline:
['Ice Hockey', 'Basketball', 'Football', 'Baseball', 'Hockey', 'Softball', 'Luge']
Sports with all events worse than baseline:
None


no height weight age or home adv
Overall  Accuracy of SVMs: 0.9054699023117135
Baseline Accuracy of SVMs: 0.8560552313822636
SVMs  better  than BLines: 1174.0 out of 1785 = 0.657703081232

Sports with all events better than baseline:
['Ice Hockey', 'Basketball', 'Football', 'Baseball', 'Hockey', 'Softball', 'Luge']
Sports with all events worse than baseline:
None

with all the data, and with height and weight as features
Overall  Accuracy of SVMs: 0.9120422542971776
Baseline Accuracy of SVMs: 0.8560552313822636
SVMs  better  than BLines: 1233.0 out of 1785 = 0.690756302521

Sports with all events better than baseline:
['Ice Hockey', 'Basketball', 'Football', 'Volleyball', 'Baseball', 'Hockey', 'Softball', 'Handball', 'Water Polo']
Sports with all events worse than baseline:
None

### predict for 2016 without 2016 data

In [488]:
sport_svm2016

defaultdict(int, {'Athletics': 0})




In [48]:
sports_inter2016 = list(train2016_df["Sport"].unique()) or list(valid2016_df["Sport"].unique())
accuracies2016   = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
sport_conf2016   = defaultdict(int)  

avg_accur_sum2016, avg_baseacc_sum2016, num_events2016 = 0, 0, 0
better2016, better2016_sports, worse2016_sports, better2016_events, worse2016_events = 0.0, set(), set(), set(), set()

for year in valid2016_df["Year"].unique():
    print year,
    temp_year_df = valid2016_df[valid2016_df["Year"] == year]
    for sport in sports_inter2016:
        temp_sports_year_df = temp_year_df[temp_year_df["Sport"] == sport]
        for event in temp_sports_year_df["Event"].unique():
            num_events2016 += 1
            
            cur_clf      = sport_svm2016[sport]
            tempvalid2016_df = temp_sports_year_df[temp_sports_year_df["Event"] == event]
            
            valid2016_X = [featyear(row, medalCounts2016) for index, row in tempvalid2016_df.iterrows()]
            
            if (len(valid2016_X) == 0):
                continue
            valid2016_y = tempvalid2016_df["win"]

            decisions = cur_clf.decision_function(valid2016_X)
            highest_confidence = [ x[1] for x in sorted(zip(decisions, range(len(decisions))))[::-1][:sum(valid2016_y)]]
            predictions = [ 1 if i in highest_confidence else 0 for i, x in enumerate(decisions) ]
            
            accur = accuracy_score(valid2016_y, predictions)
            
            base_y = [0] * len(valid2016_y)
            base_accur = accuracy_score(valid2016_y, base_y)
            
            avg_accur_sum2016 += accur
            avg_baseacc_sum2016 += base_accur

            if (accur > base_accur): 
                better2016+=1
                better2016_sports.add(sport)
                better2016_events.add(event)
            else:
                worse2016_sports.add(sport)
                worse2016_events.add(event)
                
            accuracies2016[year][sport][event] = [accur, base_accur, confusion_matrix(valid2016_y, predictions), precision_score(valid2016_y, predictions)]
print "done"

2016 done


In [49]:
avg_accur = avg_accur_sum2016 / num_events2016
avg_baseacc = avg_baseacc_sum2016 / num_events2016
print "Overall  Accuracy of SVMs:", avg_accur
print "Baseline Accuracy of SVMs:", avg_baseacc
print "SVMs  better2016  than BLines:", better2016, "out of", num_events2016, "=", better2016 / num_events2016
print
print "Sports with all events better2016 than baseline:\n", list(better2016_sports - worse2016_sports)
print "Sports with all events worse2016 than baseline:\n", list(worse2016_sports - better2016_sports) if worse2016_sports - better2016_sports else "None"

Overall  Accuracy of SVMs: 0.8278588343658476
Baseline Accuracy of SVMs: 0.8473941213353378
SVMs  better2016  than BLines: 88.0 out of 302 = 0.291390728477

Sports with all events better2016 than baseline:
['Synchronized Swimming', 'Rhythmic Gymnastics']
Sports with all events worse2016 than baseline:
['Trampolining', 'Modern Pentathlon', 'Boxing', 'Handball', 'Volleyball']


In [50]:
sportAccuracy("Synchronized Swimming", accuracies2016)

2016 Accuracy:  0.844047619047619
     Baseline:  0.7517857142857143
    Precision:  0.6794871794871795
           TN:  76 	FP:  10
           FN:  10 	TP:  22



In [51]:
sportAccuracy("Rhythmic Gymnastics", accuracies2016)

2016 Accuracy:  0.8615384615384616
     Baseline:  0.8351648351648351
    Precision:  0.6
           TN:  70 	FP:  8
           FN:  8 	TP:  10



In [509]:
for sport in sorted(sports_inter2016):
    print sport
    sportAccuracy(sport, accuracies2016)

Alpine Skiing
Archery
2016 Accuracy:  0.8432539682539683
     Baseline:  0.8511904761904762
    Precision:  0.41666666666666663
           TN:  161 	FP:  13
           FN:  13 	TP:  11

Athletics
2016 Accuracy:  0.8992355716994948
     Baseline:  0.9170327095551685
    Precision:  0.37722152690863575
           TN:  2096 	FP:  114
           FN:  114 	TP:  74

Badminton
2016 Accuracy:  0.8143350311262025
     Baseline:  0.8582427843803057
    Precision:  0.3466666666666667
           TN:  132 	FP:  15
           FN:  15 	TP:  8

Baseball
Basketball
2016 Accuracy:  0.7281493953773152
     Baseline:  0.742844022654217
    Precision:  0.4722222222222222
           TN:  170 	FP:  38
           FN:  38 	TP:  34

Beach Volleyball
2016 Accuracy:  0.875
     Baseline:  0.875
    Precision:  0.5
           TN:  78 	FP:  6
           FN:  6 	TP:  6

Biathlon
Bobsleigh
Boxing
2016 Accuracy:  0.6954563954563954
     Baseline:  0.7906375106375106
    Precision:  0.25
           TN:  182 	FP:  38
  

## Predict for year with year-4 data

In [52]:
predict_years = [2000, 2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016]

In [53]:
train_year_df = {}
valid_year_df = {}

for year in predict_years:
    train_year_df[year] = athevent_df[(athevent_df["Year"] > ww2endyear) & (athevent_df["Year"] < year)].dropna(subset=["Height", "Weight", "Age"])
    valid_year_df[year] = athevent_df[(athevent_df["Year"] == year)].dropna(subset=["Height", "Weight", "Age"])

In [54]:
# sports_inter_year = list(train_year_df["Sport"].unique()) or list(valid_year_df["Sport"].unique())
# accuracies_year   = defaultdict(defaultdict(lambda: defaultdict(lambda: defaultdict(list))))

In [70]:
import copy




In [71]:
# faster but not finishd implementing
medalCountsYears = {}
for i, year in enumerate(predict_years):
    print year,
    if i == 0:
        medalCountsYears[year] = medalsUpToYear(year)
        continue
    
    medalCountsYears[year] = medalsUpToYearGivenPrevious(copy.deepcopy(medalCountsYears[predict_years[i-1]]), predict_years[i-1])

2000 2002 2004 2006 2008 2010 2012 2014 2016


In [None]:
# medalCountsYears = {}
# for year in predict_years:
#     print year,
#     medalCountsYears[year] = medalsUpToYear(year)

In [74]:
# testing that medal counts do actually change
print sorted(medalCountsYears[2012][0].items(), key=lambda x: x[1])[::-1][0]
print sorted(medalCountsYears[2016][0].items(), key=lambda x: x[1])[::-1][0]

('Michael Fred Phelps, IISwimming', 14)
('Michael Fred Phelps, IISwimming', 18)


In [525]:
train_year_df[year]["Sport"][:3]

90816      Athletics
217936      Swimming
149204    Gymnastics
Name: Sport, dtype: object

In [129]:
import numpy
import scipy.optimize
import random
from math import exp
from math import log
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.utils.fixes import signature




In [131]:
sport_svm_years = defaultdict(int) 
sport_lr_years = {}
for year in predict_years:
    print year,
    sport_svm_year = defaultdict(int)
    sport_lr_year  = {}

    for sport in train_year_df[year]["Sport"].unique():
        temptrain_year_df = train_year_df[year][train_year_df[year]["Sport"] == sport]
        train_year_X = [featyear(row, medalCountsYears[year]) for index, row in temptrain_year_df.iterrows()]
        train_year_y = list(temptrain_year_df["win"])
        sport_lr_year[sport] = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(train_year_X, train_year_y)
        clf = LinearSVC(C=.1)
        cur_clf = clf.fit(train_year_X, train_year_y)
        sport_svm_year[sport] = cur_clf
    sport_svm_years[year] = sport_svm_year
    sport_lr_years[year] = sport_lr_year

2000 2002 2004 2006 2008 2010 2012 2014 2016


In [132]:
##################################################
# Validation pipeline                            #
##################################################

# for year in predict_years:
#     lam = 1.0
#     temptrain_year_df = train_year_df[year][train_year_df[year]["Sport"] == sport]
#     X_train = [featyear(row, medalCountsYears[year]) for index, row in temptrain_year_df.iterrows()]
#     y_train = temptrain_year_df["win"]
    
    # sport_lr_years = train(lam, X_train, X_train, y_train)
    # acc_train    = performance(theta, X_train, y_train)
    # acc_validate = performance(theta, X_validate, y_validate)
    # acc_test     = performance(theta, X_test, y_test)

    # print("lambda = " + str(lam))
    # print("\taccuracy train    = " + str(acc_train))
    # print("\taccuracy validate = " + str(acc_validate))
    # print("\taccuracy test     = " + str(acc_test))

In [154]:
accuracies_years      = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: list)))
accuracies_years_lr   = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: list)))

avg_accur_sum_years, avg_baseacc_sum_years, num_events_years = defaultdict(float), defaultdict(float), defaultdict(float)
better_years, better_years_sports, worse_years_sports, better_years_events, worse_years_events = defaultdict(float), defaultdict(set), defaultdict(set), defaultdict(set), defaultdict(set)

avg_accur_sum_years_lr, avg_baseacc_sum_years_lr, num_events_years_lr = defaultdict(float), defaultdict(float), defaultdict(float)
better_years_lr, better_years_sports_lr, worse_years_sports_lr, better_years_events_lr, worse_years_events_lr = defaultdict(float), defaultdict(set), defaultdict(set), defaultdict(set), defaultdict(set)

for year in predict_years:
    sports_inter_years = list(train_year_df[year]["Sport"].unique()) or list(valid_year_df[year]["Sport"].unique())
    temp_year_df = valid_year_df[year]
    print year,
    
    for sport in sports_inter_years:
        temp_sports_year_df = temp_year_df[temp_year_df["Sport"] == sport]
        for event in temp_sports_year_df["Event"].unique():
            num_events_years[year] += 1
            
            cur_clf = sport_svm_years[year][sport]
            lr_clf  = sport_lr_years[year][sport]
            
            tempvalid_years_df = temp_sports_year_df[temp_sports_year_df["Event"] == event]
            
            valid_years_X = [featyear(row, medalCountsYears[year]) for index, row in tempvalid_years_df.iterrows()]
            
            if (len(valid_years_X) == 0):
                continue
            valid_years_y = list(tempvalid_years_df["win"])

            
            # svm
            decisions = cur_clf.decision_function(valid_years_X)
            highest_confidence = [ x[1] for x in sorted(zip(decisions, range(len(decisions))))[::-1][:sum(valid_years_y)]]
            predictions = [ 1 if i in highest_confidence else 0 for i, x in enumerate(decisions) ]
            
            accur = accuracy_score(valid_years_y, predictions)

            base_y = [0] * len(valid_years_y)
            base_accur = accuracy_score(valid_years_y, base_y)
            
            avg_accur_sum_years[year] += accur
            avg_baseacc_sum_years[year] += base_accur

            if (accur > base_accur): 
                better_years[year] +=1
                better_years_sports[year].add(sport)
                better_years_events[year].add(event)
            else:
                worse_years_sports[year].add(sport)
                worse_years_events[year].add(event)
                
            accuracies_years[year][sport][event] = [accur, base_accur, confusion_matrix(valid_years_y, predictions), precision_score(valid_years_y, predictions)]
            # end svm
            
            ####################################################################
            
            # log. reg.
            decisions_lr = [x[0] for x in lr_clf.predict_proba(valid_years_X)]
            highest_confidence_lr = [ x[1] for x in sorted(zip(decisions_lr, range(len(decisions_lr))))[::-1][:sum(valid_years_y)] ]
            predictions_lr = [ 1 if i in highest_confidence_lr else 0 for i, x in enumerate(decisions) ]
            
            accur_lr = accuracy_score(valid_years_y, predictions_lr)
            
            avg_accur_sum_years_lr[year] += accur
            avg_baseacc_sum_years_lr[year] += base_accur

            if (accur_lr > base_accur): 
                better_years_lr[year] +=1
                better_years_sports_lr[year].add(sport)
                better_years_events_lr[year].add(event)
            else:
                worse_years_sports_lr[year].add(sport)
                worse_years_events_lr[year].add(event)
                
            accuracies_years_lr[year][sport][event] = [accur_lr, base_accur, confusion_matrix(valid_years_y, predictions_lr), precision_score(valid_years_y, predictions_lr)]
            # end log. reg.
            
    #        print
    #        print sum(valid_years_y)
    #        print highest_confidence
    #        print decisions_lr
    #        print highest_confidence_lr
    #        break
    #    break
    #break
print "done"

2000 2002 2004 2006 2008 2010 2012 2014 2016 done


In [155]:
for year in predict_years:

    avg_accur = avg_accur_sum_years[year] / num_events_years[year]
    avg_baseacc = avg_baseacc_sum_years[year] / num_events_years[year]
    print year, avg_accur
    print "\tOverall  Accuracy of SVMs:", avg_accur
    print "\tBaseline Accuracy of SVMs:", avg_baseacc
    print "\tSVMs  better_years  than BLines:", better_years[year], "out of", num_events_years[year], "=", better_years[year] / num_events_years[year]
    print
    print "\tSports with all events better_years than baseline:\n", "\t", list(better_years_sports[year] - worse_years_sports[year])
    print "\tSports with all events worse_years than baseline:\n", "\t",list(worse_years_sports[year] - better_years_sports[year]) if worse_years_sports[year] - better_years_sports[year] else "None"
    print
    print

2000 0.8306894231813867
	Overall  Accuracy of SVMs: 0.8306894231813867
	Baseline Accuracy of SVMs: 0.8552436188108772
	SVMs  better_years  than BLines: 83.0 out of 288.0 = 0.288194444444

	Sports with all events better_years than baseline:
	['Handball', 'Basketball', 'Baseball', 'Hockey', 'Softball']
	Sports with all events worse_years than baseline:
	['Water Polo', 'Tennis', 'Rhythmic Gymnastics', 'Volleyball', 'Table Tennis']


2002 0.8641179298825596
	Overall  Accuracy of SVMs: 0.8641179298825596
	Baseline Accuracy of SVMs: 0.8859974857322885
	SVMs  better_years  than BLines: 14.0 out of 76.0 = 0.184210526316

	Sports with all events better_years than baseline:
	[]
	Sports with all events worse_years than baseline:
	['Snowboarding', 'Alpine Skiing', 'Ski Jumping', 'Nordic Combined', 'Speed Skating']


2004 0.8251651192932976
	Overall  Accuracy of SVMs: 0.8251651192932976
	Baseline Accuracy of SVMs: 0.8522857682243186
	SVMs  better_years  than BLines: 76.0 out of 301.0 = 0.2524916943

In [156]:
for year in predict_years:

    avg_accur = avg_accur_sum_years_lr[year] / num_events_years[year]
    avg_baseacc = avg_baseacc_sum_years_lr[year] / num_events_years[year]
    print year, avg_accur
    print "\tOverall  Accuracy of SVMs:", avg_accur
    print "\tBaseline Accuracy of SVMs:", avg_baseacc
    print "\tLRs  better_years  than BLines:", better_years_lr[year], "out of", num_events_years[year], "=", better_years_lr[year] / num_events_years[year]
    print
    print "\tSports with all events better_years than baseline:\n", "\t", list(better_years_sports_lr[year] - worse_years_sports_lr[year])
    print "\tSports with all events worse_years than baseline:\n", "\t",list(worse_years_sports_lr[year] - better_years_sports_lr[year]) if worse_years_sports_lr[year] - better_years_sports_lr[year] else "None"
    print
    print

2000 0.8306894231813867
	Overall  Accuracy of SVMs: 0.8306894231813867
	Baseline Accuracy of SVMs: 0.8552436188108772
	SVMs  better_years  than BLines: 7.0 out of 288.0 = 0.0243055555556

	Sports with all events better_years than baseline:
	['Water Polo']
	Sports with all events worse_years than baseline:
	['Basketball', 'Cycling', 'Fencing', 'Boxing', 'Beach Volleyball', 'Hockey', 'Rhythmic Gymnastics', 'Synchronized Swimming', 'Modern Pentathlon', 'Football', 'Baseball', 'Softball', 'Archery', 'Tennis', 'Judo', 'Sailing', 'Table Tennis', 'Volleyball', 'Athletics', 'Canoeing', 'Wrestling', 'Badminton', 'Gymnastics', 'Handball', 'Diving', 'Equestrianism']


2002 0.8641179298825596
	Overall  Accuracy of SVMs: 0.8641179298825596
	Baseline Accuracy of SVMs: 0.8859974857322885
	SVMs  better_years  than BLines: 0.0 out of 76.0 = 0.0

	Sports with all events better_years than baseline:
	[]
	Sports with all events worse_years than baseline:
	['Ice Hockey', 'Curling', 'Speed Skating', 'Figure 

In [157]:
sportAccuracy("Ice Hockey", accuracies_years)

2002 Accuracy:  0.717948717948718
     Baseline:  0.7019230769230769
    Precision:  0.5106060606060606
           TN:  280 	FP:  62
           FN:  62 	TP:  64

2006 Accuracy:  0.7644060283687943
     Baseline:  0.686613475177305
    Precision:  0.5933098591549295
           TN:  256 	FP:  55
           FN:  55 	TP:  76

2010 Accuracy:  0.7620192307692307
     Baseline:  0.682451923076923
    Precision:  0.6094138102334823
           TN:  243 	FP:  50
           FN:  50 	TP:  77

2014 Accuracy:  0.7793693093493227
     Baseline:  0.6887297357317343
    Precision:  0.6459775602769158
           TN:  267 	FP:  46
           FN:  46 	TP:  84



In [158]:
sportAccuracy("Ice Hockey", accuracies_years_lr)

2002 Accuracy:  0.5576923076923077
     Baseline:  0.7019230769230769
    Precision:  0.2545454545454545
           TN:  248 	FP:  94
           FN:  94 	TP:  32

2006 Accuracy:  0.4042996453900709
     Baseline:  0.686613475177305
    Precision:  0.05762910798122065
           TN:  188 	FP:  123
           FN:  123 	TP:  8

2010 Accuracy:  0.40865384615384615
     Baseline:  0.682451923076923
    Precision:  0.05737704918032787
           TN:  173 	FP:  120
           FN:  120 	TP:  7

2014 Accuracy:  0.4977126360204308
     Baseline:  0.6887297357317343
    Precision:  0.16101694915254236
           TN:  202 	FP:  111
           FN:  111 	TP:  19



In [94]:
sportAccuracy("Synchronized Swimming", accuracies_years)

2000 Accuracy:  0.7572463768115942
     Baseline:  0.7563405797101449
    Precision:  0.44666666666666666
           TN:  71 	FP:  15
           FN:  15 	TP:  16

2004 Accuracy:  0.786231884057971
     Baseline:  0.7418478260869565
    Precision:  0.5
           TN:  71 	FP:  13
           FN:  13 	TP:  20

2008 Accuracy:  0.6642156862745098
     Baseline:  0.7463235294117647
    Precision:  0.4487179487179487
           TN:  62 	FP:  22
           FN:  22 	TP:  10

2012 Accuracy:  0.8154538634658665
     Baseline:  0.7286196549137285
    Precision:  0.7166666666666667
           TN:  63 	FP:  11
           FN:  11 	TP:  20

2016 Accuracy:  0.844047619047619
     Baseline:  0.7517857142857143
    Precision:  0.6794871794871795
           TN:  76 	FP:  10
           FN:  10 	TP:  22



In [148]:
sportAccuracy("Synchronized Swimming", accuracies_years_lr)

2000 Accuracy:  0.75
     Baseline:  0.7563405797101449
    Precision:  0.5
           TN:  84 	FP:  2
           FN:  29 	TP:  2

2004 Accuracy:  0.7065217391304348
     Baseline:  0.7418478260869565
    Precision:  0.0
           TN:  80 	FP:  4
           FN:  33 	TP:  0

2008 Accuracy:  0.7107843137254902
     Baseline:  0.7463235294117647
    Precision:  0.0
           TN:  80 	FP:  4
           FN:  32 	TP:  0

2012 Accuracy:  0.6892348087021756
     Baseline:  0.7286196549137285
    Precision:  0.0
           TN:  70 	FP:  4
           FN:  31 	TP:  0

2016 Accuracy:  0.7375
     Baseline:  0.7517857142857143
    Precision:  0.25
           TN:  83 	FP:  3
           FN:  31 	TP:  1



In [159]:
sportAccuracy("Water Polo", accuracies_years)

2000 Accuracy:  0.43790849673202614
     Baseline:  0.6258169934640523
    Precision:  0.20614035087719296
           TN:  93 	FP:  61
           FN:  61 	TP:  16

2004 Accuracy:  0.6748746303201749
     Baseline:  0.6885045647421886
    Precision:  0.5
           TN:  141 	FP:  38
           FN:  38 	TP:  38

2008 Accuracy:  0.5617742242528079
     Baseline:  0.6880830001903675
    Precision:  0.3228744939271255
           TN:  127 	FP:  52
           FN:  52 	TP:  25

2012 Accuracy:  0.6103731815306768
     Baseline:  0.692820999367489
    Precision:  0.34511434511434513
           TN:  131 	FP:  50
           FN:  50 	TP:  26

2016 Accuracy:  0.6718281718281718
     Baseline:  0.6858766233766234
    Precision:  0.5
           TN:  141 	FP:  39
           FN:  39 	TP:  39



In [160]:
sportAccuracy("Water Polo", accuracies_years_lr)

2000 Accuracy:  0.6905480140774258
     Baseline:  0.6258169934640523
    Precision:  0.5978407557354926
           TN:  123 	FP:  31
           FN:  31 	TP:  46

2004 Accuracy:  0.532017487463032
     Baseline:  0.6885045647421886
    Precision:  0.21052631578947367
           TN:  119 	FP:  60
           FN:  60 	TP:  16

2008 Accuracy:  0.6093660765276985
     Baseline:  0.6880830001903675
    Precision:  0.32793522267206476
           TN:  127 	FP:  52
           FN:  52 	TP:  25

2012 Accuracy:  0.5453510436432638
     Baseline:  0.692820999367489
    Precision:  0.23977823977823978
           TN:  123 	FP:  58
           FN:  58 	TP:  18

2016 Accuracy:  0.532092907092907
     Baseline:  0.6858766233766234
    Precision:  0.21794871794871795
           TN:  119 	FP:  61
           FN:  61 	TP:  17



In [91]:
for event in accuracies_years[2012]["Synchronized Swimming"].keys():
    print event
    print accuracies_years[2012]["Synchronized Swimming"][event]
    print len(accuracies_years[2012]["Synchronized Swimming"][event])

Synchronized Swimming
defaultdict(<type 'list'>, {0: []})
1
Synchronized Swimming Women's Team
[0.6774193548387096, 0.5967741935483871, array([[27, 10],
       [10, 15]]), 0.6]
4
Synchronized Swimming Women's Duet
[0.9534883720930233, 0.8604651162790697, array([[36,  1],
       [ 1,  5]]), 0.8333333333333334]
4


All features but the home adv

2000 0.8309957202680263
	Overall  Accuracy of SVMs: 0.8309957202680263
	Baseline Accuracy of SVMs: 0.8552436188108766
	SVMs  better_years  than BLines: 79.0 out of 288.0 = 0.274305555556

	Sports with all events better_years than baseline:
	['Basketball', 'Baseball', 'Softball']
	Sports with all events worse_years than baseline:
	['Rhythmic Gymnastics', 'Tennis', 'Table Tennis', 'Hockey', 'Water Polo', 'Shooting']


2002 0.8621158451644497
	Overall  Accuracy of SVMs: 0.8621158451644497
	Baseline Accuracy of SVMs: 0.8859974857322885
	SVMs  better_years  than BLines: 13.0 out of 76.0 = 0.171052631579

	Sports with all events better_years than baseline:
	[]
	Sports with all events worse_years than baseline:
	['Cross Country Skiing', 'Speed Skating', 'Ski Jumping', 'Alpine Skiing', 'Bobsleigh', 'Nordic Combined', 'Snowboarding']


2004 0.8255137126383294
	Overall  Accuracy of SVMs: 0.8255137126383294
	Baseline Accuracy of SVMs: 0.8522857682243186
	SVMs  better_years  than BLines: 77.0 out of 301.0 = 0.255813953488

	Sports with all events better_years than baseline:
	['Baseball', 'Softball']
	Sports with all events worse_years than baseline:
	['Archery', 'Triathlon', 'Tennis', 'Cycling', 'Modern Pentathlon', 'Boxing', 'Volleyball', 'Beach Volleyball']


2006 0.8579617373613412
	Overall  Accuracy of SVMs: 0.8579617373613412
	Baseline Accuracy of SVMs: 0.8799451767461103
	SVMs  better_years  than BLines: 16.0 out of 84.0 = 0.190476190476

	Sports with all events better_years than baseline:
	[]
	Sports with all events worse_years than baseline:
	['Snowboarding', 'Freestyle Skiing', 'Ski Jumping', 'Bobsleigh']


2008 0.8137011523779988
	Overall  Accuracy of SVMs: 0.8137011523779988
	Baseline Accuracy of SVMs: 0.8470257182149252
	SVMs  better_years  than BLines: 73.0 out of 302.0 = 0.241721854305

	Sports with all events better_years than baseline:
	['Basketball', 'Baseball', 'Softball']
	Sports with all events worse_years than baseline:
	['Taekwondo', 'Tennis', 'Judo', 'Modern Pentathlon', 'Football', 'Volleyball', 'Trampolining', 'Hockey', 'Handball']


2010 0.8561508647802287
	Overall  Accuracy of SVMs: 0.8561508647802287
	Baseline Accuracy of SVMs: 0.8827199653973857
	SVMs  better_years  than BLines: 18.0 out of 86.0 = 0.209302325581

	Sports with all events better_years than baseline:
	['Ice Hockey', 'Curling']
	Sports with all events worse_years than baseline:
	['Figure Skating', 'Ski Jumping', 'Skeleton', 'Nordic Combined']


2012 0.8150935760673514
	Overall  Accuracy of SVMs: 0.8150935760673514
	Baseline Accuracy of SVMs: 0.8410457501357544
	SVMs  better_years  than BLines: 78.0 out of 302.0 = 0.258278145695

	Sports with all events better_years than baseline:
	['Synchronized Swimming', 'Trampolining']
	Sports with all events worse_years than baseline:
	['Triathlon', 'Tennis', 'Modern Pentathlon', 'Football', 'Beach Volleyball', 'Hockey', 'Handball', 'Water Polo']


2014 0.846471016890049
	Overall  Accuracy of SVMs: 0.846471016890049
	Baseline Accuracy of SVMs: 0.8748817702282319
	SVMs  better_years  than BLines: 20.0 out of 98.0 = 0.204081632653

	Sports with all events better_years than baseline:
	['Ice Hockey', 'Luge']
	Sports with all events worse_years than baseline:
	['Bobsleigh', 'Ski Jumping', 'Alpine Skiing']


2016 0.8255873070535269
	Overall  Accuracy of SVMs: 0.8255873070535269
	Baseline Accuracy of SVMs: 0.8473941213353381
	SVMs  better_years  than BLines: 85.0 out of 302.0 = 0.281456953642

	Sports with all events better_years than baseline:
	['Synchronized Swimming']
	Sports with all events worse_years than baseline:
	['Boxing', 'Modern Pentathlon', 'Football', 'Volleyball', 'Trampolining', 'Hockey', 'Handball', 'Tennis']




In [555]:
years_precisions = defaultdict(float)
for year in predict_years:
    year_precisions = defaultdict(float)
    for sport in accuracies_years[year].keys():
        sport_precisions = defaultdict(float)
        for event in accuracies_years[year][sport].keys():
            sport_precisions[sport] += accuracies_years[year][sport][event][3]
        
        year_precisions[sport] += sum(sport_precisions.values()) / len(accuracies_years[year][sport].keys())
    years_precisions[year] += sum(year_precisions.values()) / len(accuracies_years[year].keys())

years_precisions

defaultdict(float,
            {2000: 0.4036165595829695,
             2002: 0.3483707264957266,
             2004: 0.38474456923633604,
             2006: 0.37200464897647995,
             2008: 0.3994022818384305,
             2010: 0.32198660628988496,
             2012: 0.3941750853147585,
             2014: 0.3414458280967971,
             2016: 0.4002086093494274})

In [556]:
years_precisions = defaultdict(float)
for year in predict_years:
    print year
    year_cf = [[0,0],[0,0]]
    year_precisions = defaultdict(float)
    for sport in accuracies_years[year].keys():
        sport_precisions = defaultdict(float)
        for event in accuracies_years[year][sport].keys():
            sport_precisions[sport] += accuracies_years[year][sport][event][3]
            year_cf += accuracies_years[year][sport][event][2]
        year_precisions[sport] += sum(sport_precisions.values()) / len(accuracies_years[year][sport].keys())
    years_precisions[year] += sum(year_precisions.values()) / len(accuracies_years[year].keys())
    print year_cf
    print

years_precisions

2000
[[10399  1102]
 [ 1102   856]]

2002
[[3278  273]
 [ 273  198]]

2004
[[10261  1138]
 [ 1138   862]]

2006
[[3540  299]
 [ 299  227]]

2008
[[10183  1184]
 [ 1184   851]]

2010
[[3576  287]
 [ 287  228]]

2012
[[9494 1115]
 [1115  800]]

2014
[[3770  333]
 [ 333  237]]

2016
[[9996 1106]
 [1106  830]]



defaultdict(float,
            {2000: 0.4036165595829695,
             2002: 0.3483707264957266,
             2004: 0.38474456923633604,
             2006: 0.37200464897647995,
             2008: 0.3994022818384305,
             2010: 0.32198660628988496,
             2012: 0.3941750853147585,
             2014: 0.3414458280967971,
             2016: 0.4002086093494274})

## Medal Wins Predictor

In [446]:
# print(train_y)

# cells im scared to delete

In [345]:
for sport in sports_inter:
    if(sport != "Gymnastics"):
        continue
    cur_clf = sport_svm[sport]
    tempvalid_df = valid_df[valid_df["Sport"] == sport]
    
    valid_X = [feat(row) for index, row in tempvalid_df.iterrows()]
    if (len(valid_X) == 0):
        continue
    valid_y = tempvalid_df["win"]
    predictions = cur_clf.decision_function(valid_X)
    accur = accuracy_score(valid_y, predictions)
    
    base_y = [0] * len(valid_y)
    base_accur = accuracy_score(valid_y, base_y)
    
    
    avg_accur += accur
    avg_baseacc += base_accur
    print "Accuracy for " + sport + ":\t"
    print str(accur) + " vs baseline: " + str(base_accur) + " "+ str(accur > base_accur) 
    print confusion_matrix(valid_y, predictions) #/ [[len(train_y)]*2]*2
    print precision_score(valid_y, predictions)
    if (accur > base_accur): better+=1
    #print "For " + sport + " better than base: " + str(accur > base_accur)
    
avg_accur = avg_accur / len(sports_inter) 
avg_baseacc = avg_baseacc / len(sports_inter)
print
print avg_accur
print base_accur
print better / len(sports_inter)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [318]:
sports_inter = list(train_df["Sport"].unique()) or list(valid2000_df["Sport"].unique())
sport_conf =  defaultdict(int)  
avg_accur = 0
avg_baseacc = 0
better = 0.0
for sport in sports_inter:
    if(sport != "Gymnastics"):
        continue
    cur_clf = sport_svm[sport]
    tempvalid2000_df = valid2000_df[valid2000_df["Sport"] == sport]
    
    valid_X = [feat(row) for index, row in tempvalid2000_df.iterrows()]
    if (len(valid_X) == 0):
        continue
    valid_y = tempvalid2000_df["win"]
    predictions = cur_clf.predict(valid_X)
    accur = accuracy_score(valid_y, predictions)
    
    base_y = [0] * len(valid_y)
    base_accur = accuracy_score(valid_y, base_y)
    
    
    avg_accur += accur
    avg_baseacc += base_accur
    print "Accuracy for " + sport + ":\t"
    print str(accur) + " vs baseline: " + str(base_accur) + " "+ str(accur > base_accur) 
    print confusion_matrix(valid_y, predictions) #/ [[len(train_y)]*2]*2
    print precision_score(valid_y, predictions)
    if (accur > base_accur): better+=1
    #print "For " + sport + " better than base: " + str(accur > base_accur)
    
avg_accur = avg_accur / len(sports_inter) 
avg_baseacc = avg_baseacc / len(sports_inter)
print
print avg_accur
print base_accur
print better / len(sports_inter)

Accuracy for Gymnastics:	
0.941747572815534 vs baseline: 0.9382171226831421 True
[[1061    2]
 [  64    6]]
0.75

0.0209277238403452
0.9382171226831421
0.0222222222222


In [311]:
sports_inter = list(train_df["Sport"].unique()) or list(valid_df["Sport"].unique())
sport_conf =  defaultdict(int)  
avg_accur = 0
avg_baseacc = 0
better = 0.0
for sport in sports_inter:
    if(sport != "Gymnastics"):
        continue
    cur_clf = sport_svm[sport]
    tempvalid_df = valid_df[valid_df["Sport"] == sport]
    
    valid_X = [feat(row) for index, row in tempvalid_df.iterrows()]
    if (len(valid_X) == 0):
        continue
    valid_y = tempvalid_df["win"]
    predictions = cur_clf.predict(valid_X)
    accur = accuracy_score(valid_y, predictions)
    
    base_y = [0] * len(valid_y)
    base_accur = accuracy_score(valid_y, base_y)
    
    
    avg_accur += accur
    avg_baseacc += base_accur
    print "Accuracy for " + sport + ":\t"
    print str(accur) + " vs baseline: " + str(base_accur) + " "+ str(accur > base_accur) 
    print confusion_matrix(valid_y, predictions) #/ [[len(train_y)]*2]*2
    print precision_score(valid_y, predictions)
    if (accur > base_accur): better+=1
    #print "For " + sport + " better than base: " + str(accur > base_accur)
    
avg_accur = avg_accur / len(sports_inter) 
avg_baseacc = avg_baseacc / len(sports_inter)
print
print avg_accur
print base_accur
print better / len(sports_inter)

Accuracy for Gymnastics:	
0.9298029556650246 vs baseline: 0.930008210180624 False
[[4524    7]
 [ 335    6]]
0.46153846153846156

0.020662287903667212
0.930008210180624
0.0


In [None]:
sports_train = list(train_df["Sport"].unique())
avg_accur = 0
avg_baseacc = 0
better = 0.0
for sport in sports_train:
    cur_clf = sport_svm[sport]
    temptrain_df = train_df[train_df["Sport"] == sport]
    train_X = [feat(row) for index, row in temptrain_df.iterrows()]
    train_y = temptrain_df["win"]
    predictions = cur_clf.predict(train_X)
    accur = accuracy_score(train_y, predictions)
    
    base_y = [0] * len(train_y)
    base_accur = accuracy_score(train_y, base_y)
    
    avg_accur += accur
    avg_baseacc += base_accur
    print "Accuracy for " + sport + ":\t"
    print str(accur) + " vs baseline: " + str(base_accur) + " "+ str(accur > base_accur)
    print confusion_matrix(train_y, predictions) # / [[[len(train_y)]*2]*2]
    if (accur > base_accur): better+=1
    #print "For " + sport + " better than base: " + str(accur > base_accur)

avg_accur = avg_accur / len(sports_inter) 
avg_baseacc = avg_baseacc / len(sports_inter)
print
print avg_accur
print base_accur
print better / len(sports_inter)