In [None]:
import pandas as pd
import requests
import wikipedia as wp

In [None]:
# Load the Rio 2016 results workbook and keep only the columns used below.
results = pd.read_excel('Results_Rio_2016.xlsx').filter(
    items=['Sport', 'Discipline', 'Event', 'Phase', 'Names', 'Gender', 'Rank', 'Results'],
    axis=1,
)

In [None]:
def reduce_string(string):
    """Cut article text at the references heading and flatten it to one line.

    Everything from the first '== References ==' marker onwards is dropped.
    In the remaining text each newline becomes a single space, while carriage
    returns and '=' characters (heading markup) are removed.
    """
    body, _, _ = string.partition('== References ==')
    # One pass over the text: '\n' -> ' ', '\r' and '=' deleted.
    return body.translate(str.maketrans({'\n': ' ', '\r': None, '=': None}))

In [None]:
def count(string):
    """Return the number of whitespace-separated words in ``string``.

    The previous implementation counted runs of space characters, which is
    the number of gaps rather than the number of words: "hello world"
    yielded 1, and leading or trailing spaces were counted as extra words.
    Splitting on whitespace counts the words themselves and also treats tabs
    and newlines as separators.

    Parameters
    ----------
    string : str
        Text to measure.

    Returns
    -------
    int
        Number of words; 0 for an empty or whitespace-only string.
    """
    # str.split() with no argument collapses consecutive whitespace and
    # ignores leading/trailing whitespace.
    return len(string.split())

In [None]:
def count_words(name_short, keyword):
    """Return the word count of the Wikipedia article for ``name_short``.

    Lookup strategy:
      1. Request the page titled ``name_short``.
      2. If that page does not exist, retry through ``count_words_long_name``
         (title ``name_short (keyword)``).
      3. If the title is ambiguous, take the first disambiguation option that
         contains ``keyword`` and request that page instead.

    Parameters
    ----------
    name_short : str
        Article title to look up.
    keyword : str
        Qualifier used to resolve missing or ambiguous titles (e.g. 'diver').
        An empty keyword disables step 3 because it cannot single out an
        option.

    Returns
    -------
    int
        ``count(reduce_string(page.content))`` for the resolved page, or 0
        when no page could be resolved.
    """
    wordcount = 0
    article = None
    try:
        article = wp.page(name_short)

    except wp.PageError:
        print('Page Error:' + name_short)
        article = count_words_long_name(name_short, keyword)

    except wp.DisambiguationError as ambiguous:
        # args[1] is used as the list of candidate titles, as before.
        candidates = [option for option in ambiguous.args[1] if keyword in option]

        if keyword and candidates:
            # Use a single candidate: concatenating several option titles
            # (the old ''.join) never produces a valid page title.
            title = candidates[0].replace(' ', '_').replace('-', '_').replace('_(', ' (')
            print(title)

            try:
                article = wp.page(title)
            except wp.DisambiguationError as de_2:
                print('DisambiguationError: ' + str(de_2.args))
            except wp.PageError as pe_2:
                # Listed before WikipediaException, its base class; otherwise
                # this handler can never run.
                print('Page Error: ' + str(pe_2.args))
            except wp.WikipediaException as we:
                print('Wikipedia Exception: ' + str(we.args))
        else:
            # Nothing to disambiguate with; report it and fall through to 0.
            print('DisambiguationError: ' + name_short)

    if article is not None:
        wordcount = count(reduce_string(article.content))

    return wordcount

In [None]:
def count_words_long_name(name_short, keyword):
    """Fetch the Wikipedia page titled ``'<name_short> (<keyword>)'``.

    Despite its name this function does not count anything: it returns the
    page object so that the caller can measure its content.

    Parameters
    ----------
    name_short : str
        Base article title.
    keyword : str
        Qualifier appended in parentheses.

    Returns
    -------
    object or None
        Whatever ``wp.page`` returns, or None when the lookup raised one of
        the handled Wikipedia exceptions (a message is printed in that case).
    """
    article = None
    try:
        article = wp.page(name_short + ' (' + keyword + ')')
    except wp.DisambiguationError:
        print("DisambiguationError:" + name_short)
    except wp.PageError:
        # Must come before WikipediaException (its base class); in the old
        # order this handler was unreachable.
        print('Page Error: ' + name_short)
    except wp.WikipediaException:
        print('Wikipedia Exception: ' + name_short)
    return article

In [None]:
def add_name_column(Frame):
    """Add a zero-filled ``Wiki_Count`` column and normalise ``Names`` in place.

    Names are expected in the form ``'Last, First'``. They are rewritten as
    ``First Last``, spaces and hyphens are turned into underscores, and every
    word is title-cased. The same (mutated) frame is returned.
    """
    # Wiki_Count starts at 0 for every row.
    Frame['Wiki_Count'] = 0

    # Split once on ', ' and swap the halves: "Last, First" -> "First Last".
    name_parts = Frame['Names'].str.split(', ')
    reordered = name_parts.str[1] + ' ' + name_parts.str[0]

    # Underscores replace spaces and hyphens, then each word is title-cased.
    Frame['Names'] = reordered.str.replace(' ', '_').str.replace('-', '_').str.title()
    return Frame

In [None]:
def clean_data(Frame):
    """Deduplicate rows by name, renumber them, and normalise the names.

    The frame is modified in place and also returned: only the first row per
    ``Names`` value is kept, the index is reset to 0..n-1, and
    ``add_name_column`` then adds ``Wiki_Count`` and rewrites ``Names``.
    """
    # One row per athlete, keeping the first occurrence.
    Frame.drop_duplicates(subset=['Names'], keep='first', inplace=True)
    # Discard the old index so rows are numbered consecutively again.
    Frame.reset_index(inplace=True, drop=True)

    return add_name_column(Frame)

In [None]:
def filter_event(discipline, event):
    """Return the cleaned rows of the global ``results`` for one event.

    Rows are selected where ``Discipline`` equals ``discipline`` and ``Event``
    equals ``event``; the selection is then passed through ``clean_data``.
    """
    is_discipline = results['Discipline'] == discipline
    is_event = results['Event'] == event

    # Wrap the selection in its own DataFrame before it is modified in place.
    selection = pd.DataFrame(results.loc[is_discipline & is_event])
    return clean_data(selection)


In [None]:
def get_wikipedia_count(Competition, keyword):
    """Fill ``Wiki_Count`` with the article word count for every row.

    For each value in ``Names`` the word count is obtained from
    ``count_words(name, keyword)`` and stored in the same row. Rows are read
    and written by position, so the result is correct for any index (the
    previous version read by position but wrote by label, which only lined up
    for a 0..n-1 index and could otherwise append new rows).

    Parameters
    ----------
    Competition : pandas.DataFrame
        Frame with a ``Names`` column; modified in place.
    keyword : str
        Qualifier forwarded to ``count_words``.

    Returns
    -------
    pandas.DataFrame
        The same ``Competition`` object.
    """
    # The old label-based write created the column on demand; keep that.
    if 'Wiki_Count' not in Competition.columns:
        Competition['Wiki_Count'] = 0
    count_column = Competition.columns.get_loc('Wiki_Count')

    for position, name_short in enumerate(Competition['Names'].values):
        # Progress indicator: one number per processed row.
        print(position, end=" ")
        Competition.iloc[position, count_column] = count_words(name_short, keyword)
    print('Done')
    return Competition


In [None]:
# 100m results; ambiguous names get the qualified title used for the lookup
Athletes_100m = filter_event('Athletics', '100m')

for dataset_name, wiki_title in {
    "Richard_Thompson": "Richard_Thompson (sprinter)",
    "Aaron_Brown": "Aaron_Brown (sprinter)",
}.items():
    Athletes_100m.loc[Athletes_100m['Names'] == dataset_name, 'Names'] = wiki_title


# get Wikipedia article count
get_wikipedia_count(Athletes_100m, 'athlete')

In [None]:
# 10m platform diving results, with one name corrected before the lookup
Diving_10m = filter_event('Diving', '10m platform')
is_obrien = Diving_10m['Names'] == "Brittany_Obrien"
Diving_10m.loc[is_obrien, 'Names'] = 'Brittany_O_Brien'

# get Wikipedia article count
get_wikipedia_count(Diving_10m, 'diver')


In [None]:
Archery = filter_event('Archery', 'Individual Competition')

# name in the dataset -> name to look up on Wikipedia
archery_renames = {
    "Bonchan_Ku": 'Ku_Bon_Chan',
    "Seungyun_Lee": 'Lee_Seung_Yun',
    "Chun_Heng_Wei": 'Wei_Chun_Heng',
    "Yu_Xing": 'Xing_Yu',
    "Andres_Pila_Solano": 'Andres_Pila',
    "Bobae_Ki": 'Ki_Bo_Bae',
    "Misun_Choi": 'Choi_Mi_Sun',
    "Hui_Cao": 'Cao_Hui',
    "Robert_Elder": 'Rob_Elder',
}
for dataset_name, wiki_title in archery_renames.items():
    Archery.loc[Archery['Names'] == dataset_name, 'Names'] = wiki_title

# get article length
get_wikipedia_count(Archery, 'archer')


In [None]:
Pole_Vault = filter_event('Athletics', 'pole vault')

# name in the dataset -> name to look up on Wikipedia
for dataset_name, wiki_title in {
    "Augusto_De_Oliveira": 'Augusto_Dutra_de_Oliveira',
    "Mengqian_Ren": 'Ren_Mengqian',
    "Ling_Li": 'Li_Ling',
}.items():
    Pole_Vault.loc[Pole_Vault['Names'] == dataset_name, 'Names'] = wiki_title

get_wikipedia_count(Pole_Vault, 'athlete')

# Overwrite one count using an explicitly qualified article title.
ivan_horvat_count = count_words('Ivan_Horvat (pole_vaulter)', '')
Pole_Vault.loc[Pole_Vault['Names'] == "Ivan_Horvat", 'Wiki_Count'] = ivan_horvat_count



In [None]:
Modern_Pentathlon = filter_event('Modern Pentathlon', 'Individual competition')

# name in the dataset -> name to look up on Wikipedia
for dataset_name, wiki_title in {
    "Arthur_Lanigan_Okeeffe": "Arthur_Lanigan_O'Keeffe",
    "Yane_Marcia_Marques": "Yane Marques",
}.items():
    Modern_Pentathlon.loc[Modern_Pentathlon['Names'] == dataset_name, 'Names'] = wiki_title

get_wikipedia_count(Modern_Pentathlon, 'athlete')

In [None]:
Fencing_Epee = filter_event('Fencing', 'épée individual')

# name in the dataset -> name to look up on Wikipedia
for dataset_name, wiki_title in {
    "Francisco_A._Limardo_Gascon": "Francisco_Limardo",
    "Anatolii_Herey": "Anatoliy_Herey",
    "Jinsun_Jung": "Jung_Jin_Sun",
}.items():
    Fencing_Epee.loc[Fencing_Epee['Names'] == dataset_name, 'Names'] = wiki_title

get_wikipedia_count(Fencing_Epee, 'fencer')

# Overwrite one count using an explicitly qualified article title.
silvio_fernandez_count = count_words('Silvio Fernández (fencer born 1979)', '')
Fencing_Epee.loc[Fencing_Epee['Names'] == "Silvio_Fernandez", 'Wiki_Count'] = silvio_fernandez_count


In [None]:
Swimming_100m_freestyle = filter_event('Swimming', '100m freestyle')

# name in the dataset -> name to look up on Wikipedia
for dataset_name, wiki_title in {
    "Arseth_Heather": "Heather_Arseth",
    "Taehwan_Park": "Park_Tae_Hwan",
}.items():
    Swimming_100m_freestyle.loc[Swimming_100m_freestyle['Names'] == dataset_name, 'Names'] = wiki_title

get_wikipedia_count(Swimming_100m_freestyle, 'swimmer')

In [None]:
Cycling_Road = filter_event('Cycling Road', 'individual road race')

# name in the dataset -> name to look up on Wikipedia
cycling_renames = {
    "Daniel_Martin": "Dan_Martin",
    "Andrey_Amador_Bakkazakova": "Andrey_Amador",
    "Manuel_Rodas_Ochoa": "Manuel_Rodas",
    "Yousef_Mirza_Banihammad": "Yousif_Mirza",
    "Jose_Luis_Rodriguez": "José_Luis_Rodríguez_Aguilar",
    "Joonyong_Seo": "Seo_Joon_Yong",
    "Okcheol_Kim": "Kim_Ok_Cheol",
    "Nelson_Filipe_S._Simoes_Oliveira": "Nelson_Oliveira",
    "Andrii_Khripta": "Andriy_Khripta",
    "Audrey_Cordon": "Audrey_Cordon_Ragot",
}
for dataset_name, wiki_title in cycling_renames.items():
    Cycling_Road.loc[Cycling_Road['Names'] == dataset_name, 'Names'] = wiki_title

get_wikipedia_count(Cycling_Road, 'cyclist')

# Overwrite selected counts using explicitly chosen article titles.
for athlete, article_title in {
    "Alessandro_De_Marchi": 'Alessandro_De_Marchi_(Cyclist)',
    "Dan_Martin": 'Dan Martin (cyclist)',
    "Ann_Sophie_Duyck": 'Ann_Sophie_Duyck',
}.items():
    Cycling_Road.loc[Cycling_Road['Names'] == athlete, 'Wiki_Count'] = count_words(article_title, '')

# NOTE(review): hard-coded count; its origin is not shown in this notebook.
Cycling_Road.loc[Cycling_Road['Names'] == "Zac_Williams", 'Wiki_Count'] = 80




In [None]:
# Write every event frame to its own worksheet of a single workbook.
# The context manager saves and closes the file even if a write fails;
# ExcelWriter.save() no longer exists in pandas 2.x. sheet_name is passed by
# keyword because the positional form is deprecated.
with pd.ExcelWriter('results_with_wiki_count.xlsx') as writer:
    Diving_10m.to_excel(writer, sheet_name='Diving_10m')
    Athletes_100m.to_excel(writer, sheet_name='Athletes_100m')
    Archery.to_excel(writer, sheet_name='Archery')
    Pole_Vault.to_excel(writer, sheet_name='Pole_Vault')
    Modern_Pentathlon.to_excel(writer, sheet_name='Modern_Pentathlon')
    Fencing_Epee.to_excel(writer, sheet_name='Fencing_Epee')
    Swimming_100m_freestyle.to_excel(writer, sheet_name='Swimming_100m_Freestyle')
    Cycling_Road.to_excel(writer, sheet_name='Cycling_Road')