In [1]:
import pandas as pd
import requests
import wikipedia as wp

In [2]:
results = pd.read_excel('Results_Rio_2016.xlsx')
results = pd.DataFrame(results)
results = results.filter(['Sport', 'Discipline', 'Event', 'Phase', 'Names', 'Gender', 'Rank', 'Results'], axis=1)

In [36]:
def reduce_string(string):
    string = string.split('== References ==', 1)[0]
    return string.replace('\n', ' ').replace('\r', '').replace("==", "").replace("=", "")

In [37]:
def count(string):
    counter = 0
    saw_space = False
    for char in string:
        if char == " ":
            if not saw_space:
                counter += 1
            saw_space = True
        else:
            saw_space = False
    return counter

In [38]:
def count_words(name_short, keyword):    
    wordcount = 0
    article = None
    try:
        article = wp.page(name_short)
        
    except wp.PageError as pe:
        print('Page Error:' + name_short)        
        
        article = count_words_long_name(name_short, keyword)
            
    except wp.DisambiguationError as de_1:        
        matching = [s for s in de_1.args[1] if keyword in s]
        string = ''.join(matching)        
        string = string.replace(' ', '_')
        string = string.replace('-', '_')        
        string = string.replace('_(', ' (')
        print(string)
        
        try:            
            article = wp.page(string)
        except wp.DisambiguationError as de_2:
            print('DisambiguationError: ' + str(de_2.args))
        except wp.WikipediaException as we:            
            print('Wikipedia Exception: ' + str(we.args))
            
        except wp.PageError as pe_2:
            print('Page Error: ' + str(pe_2.args))       
                
    if article != None:
        article = article.content        
        wordcount = count(reduce_string(article))
    
    return wordcount

In [39]:
def count_words_long_name(name_short, keyword):
    article = None
    try:
        article = wp.page(name_short + ' (' + keyword + ')')
    except wp.DisambiguationError as de_1:
        print("DisambiguationError:" + name_short)
    except wp.WikipediaException as we:
        print('Wikipedia Exception: ' + name_short)
    except wp.PageError as pe_2:
            print('Page Error: ' + name_short)
    return article

In [40]:
def add_name_column(Frame):
    # add Wiki_Count column with initial value of 1
    Frame['Wiki_Count'] = 0    
    # change order of names to firstName_lastName
    Frame['Names'] = Frame['Names'].str.split(', ').str[1] + ' ' + Frame['Names'].str.split(', ').str[0]
    # add underscores and capitalize first letter of name
    Frame['Names'] = Frame['Names'].str.replace(' ', '_')
    Frame['Names'] = Frame['Names'].str.replace('-', '_')
    Frame['Names'] = Frame['Names'].str.title()
    return Frame

In [41]:
def clean_data(Frame):
    # remove duplicate entries and reset index
    Frame.drop_duplicates(['Names'], keep='first', inplace=True)
    Frame.reset_index(drop=True, inplace = True)
    
    # add column for Wiki_Count and edit names
    Frame = add_name_column(Frame)
    return Frame

In [42]:
def filter_event(discipline, event):
    Frame = results.loc[(results['Discipline'] == discipline)
                             & (results['Event'] == event)]                     
    Frame = pd.DataFrame(Frame)
    Frame = clean_data(Frame)  
       
    return Frame


In [43]:
def get_wikipedia_count(Competition, keyword):    
    for x in range(len(Competition.index)):   
        print(x, end=" ")
        name_short = Competition['Names'].values[x]     
        name_long = name_short + '_({})'.format(keyword)        
        Competition.loc[x,'Wiki_Count'] = count_words(name_short, keyword) 
    print('Done')
    return Competition      


In [49]:
# get diving results and correct name
Athletes_100m = filter_event('Athletics', '100m')

Athletes_100m.loc[Athletes_100m['Names'] == "Richard_Thompson", 'Names'] = "Richard_Thompson (sprinter)"
Athletes_100m.loc[Athletes_100m['Names'] == "Aaron_Brown", 'Names'] = "Aaron_Brown (sprinter)"


# get Wikipedia article count
get_wikipedia_count(Athletes_100m, 'athlete')

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 Adrian_Griffith (athlete)
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 Done


Unnamed: 0,Sport,Discipline,Event,Phase,Names,Gender,Rank,Results,Wiki_Count
0,Athletics,Athletics,100m,Final ranking,Usain_Bolt,Men,1.0,9.81,10628
1,Athletics,Athletics,100m,Final ranking,Justin_Gatlin,Men,2.0,9.89,2445
2,Athletics,Athletics,100m,Final ranking,Andre_De_Grasse,Men,3.0,9.91,1534
3,Athletics,Athletics,100m,Final ranking,Yohan_Blake,Men,4.0,9.93,1805
4,Athletics,Athletics,100m,Final ranking,Akani_Simbine,Men,5.0,9.94,156
5,Athletics,Athletics,100m,Final ranking,Ben_Youssef_Meite,Men,6.0,9.96,69
6,Athletics,Athletics,100m,Final ranking,Jimmy_Vicaut,Men,7.0,10.04,384
7,Athletics,Athletics,100m,Final ranking,Trayvon_Bromell,Men,8.0,10.06,1653
8,Athletics,Athletics,100m,Semifinal,Jak_Ali_Harvey,Men,4.0,10.03,63
9,Athletics,Athletics,100m,Semifinal,Nickel_Ashmeade,Men,5.0,10.05,590


In [44]:
# get diving results and correct name
Diving_10m = filter_event('Diving', '10m platform')
Diving_10m.loc[Diving_10m['Names'] == "Brittany_Obrien", 'Names'] = 'Brittany_O_Brien'

# get Wikipedia article count
get_wikipedia_count(Diving_10m, 'diver')


0 1 Germán_Sánchez (diver)
2 3 4 5 6 7 8 9 Iván_García (diver)
10 11 12 13 14 James_Connor (diver)
15 16 17 18 19 20 21 22 23 24 25 26 27 Page Error:Mohab_Ishak
Wikipedia Exception: Mohab_Ishak
28 29 30 31 32 33 34 35 Kim_Un_hyang (diver)
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 Done


Unnamed: 0,Sport,Discipline,Event,Phase,Names,Gender,Rank,Results,Wiki_Count
0,Aquatics,Diving,10m platform,Final Ranking,Chen_Aisen,Men,1.0,585.3,509
1,Aquatics,Diving,10m platform,Final Ranking,German_Sanchez,Men,2.0,532.7,792
2,Aquatics,Diving,10m platform,Final Ranking,David_Boudia,Men,3.0,525.25,771
3,Aquatics,Diving,10m platform,Final Ranking,Benjamin_Auffret,Men,4.0,507.35,30
4,Aquatics,Diving,10m platform,Final Ranking,Martin_Wolfram,Men,5.0,492.9,103
5,Aquatics,Diving,10m platform,Final Ranking,Bo_Qiu,Men,6.0,488.2,569
6,Aquatics,Diving,10m platform,Final Ranking,Rafael_Quintero,Men,7.0,485.35,281
7,Aquatics,Diving,10m platform,Final Ranking,Victor_Minibaev,Men,8.0,481.6,29
8,Aquatics,Diving,10m platform,Final Ranking,Sascha_Klein,Men,9.0,424.15,68
9,Aquatics,Diving,10m platform,Final Ranking,Ivan_Garcia,Men,10.0,418.95,507


In [12]:
Archery = filter_event('Archery', 'Individual Competition')

# correct names
Archery.loc[Archery['Names'] == "Bonchan_Ku", 'Names'] = 'Ku_Bon_Chan'
Archery.loc[Archery['Names'] == "Seungyun_Lee", 'Names'] = 'Lee_Seung_Yun'
Archery.loc[Archery['Names'] == "Chun_Heng_Wei", 'Names'] = 'Wei_Chun_Heng'
Archery.loc[Archery['Names'] == "Yu_Xing", 'Names'] = 'Xing_Yu'
Archery.loc[Archery['Names'] == "Andres_Pila_Solano", 'Names'] = 'Andres_Pila'
Archery.loc[Archery['Names'] == "Bobae_Ki", 'Names'] = 'Ki_Bo_Bae'
Archery.loc[Archery['Names'] == "Misun_Choi", 'Names'] = 'Choi_Mi_Sun'
Archery.loc[Archery['Names'] == "Hui_Cao", 'Names'] = 'Cao_Hui'
Archery.loc[Archery['Names'] == "Robert_Elder", 'Names'] = 'Rob_Elder'

# get article length
get_wikipedia_count(Archery, 'archer')


0 1 2 3 4 5 6 7 8 9 Antonio_Fernández (archer)
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 Page Error:Ali_El_Gharrari
Wikipedia Exception: Ali_El_Gharrari
51 52 53 54 55 56 57 58 59 60 Arne_Jensen (archer)
61 62 63 64 65 66 67 68 69 70 71 Page Error:Choi_Mi_Sun
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 Veronika_Marchenko (archer)
95 96 97 98 99 100 101 102 103 104 105 Natalia_Sánchez (archer)
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 Yuki_Hayashi (archer)
122 123 124 125 126 127 Done


Unnamed: 0,Sport,Discipline,Event,Phase,Names,Gender,Rank,Results,Wiki_Count
0,Archery,Archery,Individual Competition,Final Ranking,Ku_Bon_Chan,Men,1.0,,223
1,Archery,Archery,Individual Competition,Final Ranking,Jean_Charles_Valladont,Men,2.0,,116
2,Archery,Archery,Individual Competition,Final Ranking,Brady_Ellison,Men,3.0,,385
3,Archery,Archery,Individual Competition,Final Ranking,Sjef_Van_Den_Berg,Men,4.0,,432
4,Archery,Archery,Individual Competition,Final Ranking,Taylor_Worth,Men,5.0,,482
5,Archery,Archery,Individual Competition,Final Ranking,Mauro_Nespoli,Men,6.0,,567
6,Archery,Archery,Individual Competition,Final Ranking,Lee_Seung_Yun,Men,7.0,,191
7,Archery,Archery,Individual Competition,Final Ranking,Takaharu_Furukawa,Men,8.0,,279
8,Archery,Archery,Individual Competition,Final Ranking,Ricardo_Soto,Men,9.0,,119
9,Archery,Archery,Individual Competition,Final Ranking,Antonio_Fernandez,Men,9.0,,329


In [52]:
Pole_Vault = filter_event('Athletics', 'pole vault')
Pole_Vault.loc[Pole_Vault['Names'] == "Augusto_De_Oliveira", 'Names'] = 'Augusto_Dutra_de_Oliveira'
Pole_Vault.loc[Pole_Vault['Names'] == "Mengqian_Ren", 'Names'] = 'Ren_Mengqian'
Pole_Vault.loc[Pole_Vault['Names'] == "Ling_Li", 'Names'] = 'Li_Ling'

get_wikipedia_count(Pole_Vault, 'athlete')

Pole_Vault.loc[Pole_Vault['Names'] == "Ivan_Horvat", 'Wiki_Count'] = count_words('Ivan_Horvat (pole_vaulter)', '')



0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 Paweł_Wojciechowski (athlete)
26 Robert_Renner (athlete)
27 
Wikipedia Exception: ('The "srsearch" parameter must be set.',)
28 29 Logan_Cunningham (athlete)
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 Done


In [14]:
Modern_Pentathlon = filter_event('Modern Pentathlon', 'Individual competition')
Modern_Pentathlon.loc[Modern_Pentathlon['Names'] == "Arthur_Lanigan_Okeeffe", 'Names'] = "Arthur_Lanigan_O'Keeffe"
Modern_Pentathlon.loc[Modern_Pentathlon['Names'] == "Yane_Marcia_Marques", 'Names'] = "Yane Marques"

get_wikipedia_count(Modern_Pentathlon, 'athlete')

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 Chen_Qian (pentathlete)
Done


Unnamed: 0,Sport,Discipline,Event,Phase,Names,Gender,Rank,Results,Wiki_Count
0,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Alexander_Lesun,Men,1.0,1479,2987
1,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Pavlo_Tymoshchenko,Men,2.0,1472,63
2,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Ismael_Hernandez_Uscanga,Men,3.0,1468,546
3,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Valentin_Prades,Men,4.0,1467,39
4,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Riccardo_De_Luca,Men,5.0,1467,49
5,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Patrick_Dogue,Men,6.0,1463,51
6,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Max_Esposito,Men,7.0,1462,393
7,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Arthur_Lanigan_O'Keeffe,Men,8.0,1457,72
8,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,David_Svoboda,Men,9.0,1452,84
9,Modern Pentathlon,Modern Pentathlon,Individual competition,Final ranking,Joseph_Choong,Men,10.0,1451,312


In [16]:
Fencing_Epee = filter_event('Fencing', 'épée individual')
Fencing_Epee.loc[Fencing_Epee['Names'] == "Francisco_A._Limardo_Gascon", 'Names'] = "Francisco_Limardo"
Fencing_Epee.loc[Fencing_Epee['Names'] == "Anatolii_Herey", 'Names'] = "Anatoliy_Herey"
Fencing_Epee.loc[Fencing_Epee['Names'] == "Jinsun_Jung", 'Names'] = "Jung_Jin_Sun"

get_wikipedia_count(Fencing_Epee, 'fencer')
Fencing_Epee.loc[Fencing_Epee['Names'] == "Silvio_Fernandez", 'Wiki_Count'] = count_words('Silvio Fernández (fencer born 1979)', '')


0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 Silvio_Fernández (fencer_born_1946)Silvio_Fernández (fencer_born_1979)
Wikipedia Exception: ('silvia fernández fencer_born_1946 silvia fernández fencer_born_1979',)
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 Done


In [20]:
Swimming_100m_freestyle = filter_event('Swimming', '100m freestyle')
Swimming_100m_freestyle.loc[Swimming_100m_freestyle['Names'] == "Arseth_Heather", 'Names'] = "Heather_Arseth"
Swimming_100m_freestyle.loc[Swimming_100m_freestyle['Names'] == "Taehwan_Park", 'Names'] = "Park_Tae_Hwan"

get_wikipedia_count(Swimming_100m_freestyle, 'swimmer')

0 1 2 3 4 Duncan_Scott (swimmer)
5 6 7 8 Vladimir_Morozov (swimmer)
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 Shane_Ryan (swimmer)
29 Aleksandar_Nikolov (swimmer)
30 31 32 33 34 35 Dylan_Carter (swimmer)
36 37 38 39 40 41 42 43 44 45 46 47 48 Marius_Radu (swimmer)
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 Done


Unnamed: 0,Sport,Discipline,Event,Phase,Names,Gender,Rank,Results,Wiki_Count
0,Aquatics,Swimming,100m freestyle,Final Ranking,Kyle_Chalmers,Men,1.0,47.58,384
1,Aquatics,Swimming,100m freestyle,Final Ranking,Pieter_Timmers,Men,2.0,47.80,459
2,Aquatics,Swimming,100m freestyle,Final Ranking,Nathan_Adrian,Men,3.0,47.85,2305
3,Aquatics,Swimming,100m freestyle,Final Ranking,Santo_Condorelli,Men,4.0,47.88,256
4,Aquatics,Swimming,100m freestyle,Final Ranking,Duncan_Scott,Men,5.0,48.01,888
5,Aquatics,Swimming,100m freestyle,Final Ranking,Caeleb_Dressel,Men,6.0,48.02,1177
6,Aquatics,Swimming,100m freestyle,Final Ranking,Cameron_Mcevoy,Men,7.0,48.12,386
7,Aquatics,Swimming,100m freestyle,Final Ranking,Marcelo_Chierighini,Men,8.0,48.41,809
8,Aquatics,Swimming,100m freestyle,Semifinal,Vladimir_Morozov,Men,4.0,48.26,428
9,Aquatics,Swimming,100m freestyle,Semifinal,Sebastiaan_Verschuren,Men,5.0,48.28,229


In [26]:
Cycling_Road = filter_event('Cycling Road', 'individual road race')

Cycling_Road.loc[Cycling_Road['Names'] == "Daniel_Martin", 'Names'] = "Dan_Martin"
Cycling_Road.loc[Cycling_Road['Names'] == "Andrey_Amador_Bakkazakova", 'Names'] = "Andrey_Amador"
Cycling_Road.loc[Cycling_Road['Names'] == "Manuel_Rodas_Ochoa", 'Names'] = "Manuel_Rodas"
Cycling_Road.loc[Cycling_Road['Names'] == "Yousef_Mirza_Banihammad", 'Names'] = "Yousif_Mirza"
Cycling_Road.loc[Cycling_Road['Names'] == "Jose_Luis_Rodriguez", 'Names'] = "José_Luis_Rodríguez_Aguilar"
Cycling_Road.loc[Cycling_Road['Names'] == "Joonyong_Seo", 'Names'] = "Seo_Joon_Yong"
Cycling_Road.loc[Cycling_Road['Names'] == "Okcheol_Kim", 'Names'] = "Kim_Ok_Cheol"
Cycling_Road.loc[Cycling_Road['Names'] == "Nelson_Filipe_S._Simoes_Oliveira", 'Names'] = "Nelson_Oliveira"
Cycling_Road.loc[Cycling_Road['Names'] == "Andrii_Khripta", 'Names'] = "Andriy_Khripta"
Cycling_Road.loc[Cycling_Road['Names'] == "Audrey_Cordon", 'Names'] = "Audrey_Cordon_Ragot"

get_wikipedia_count(Cycling_Road, 'cyclist')
Cycling_Road.loc[Cycling_Road['Names'] == "Alessandro_De_Marchi", 'Wiki_Count'] = count_words('Alessandro_De_Marchi_(Cyclist)', '')
Cycling_Road.loc[Cycling_Road['Names'] == "Dan_Martin", 'Wiki_Count'] = count_words('Dan Martin (cyclist)', '')
Cycling_Road.loc[Cycling_Road['Names'] == "Ann_Sophie_Duyck", 'Wiki_Count'] = count_words('Ann_Sophie_Duyck', '')
Cycling_Road.loc[Cycling_Road['Names'] == "Zac_Williams", 'Wiki_Count'] = 80




0 1 2 3 4 5 6 7 8 9 10 11 12 
Wikipedia Exception: ('The "srsearch" parameter must be set.',)
13 14 Adam_Yates (cyclist)
15 16 17 18 19 20 21 22 23 24 Simon_Clarke (cyclist)
25 26 27 28 29 30 31 32 George_Bennett (cyclist)
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 Michael_Woods (cyclist)
55 56 57 58 59 60 61 62 Alessandro_De_Marchi (cyclist)
Wikipedia Exception: ('alessandro de marchi cycling',)
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 Diego_Rosa (cyclist)
82 83 84 85 86 87 88 89 90 91 92 93 94 95 Daniel_Díaz (cyclist)
96 97 98 99 100 101 102 103 104 105 106 Stefan_Hristov (cyclist)
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 Tony_Martin (cyclist)
124 125 126 127 Page Error:Zac_Williams
Wikipedia Exception: Zac_Williams
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 17

In [56]:
writer = pd.ExcelWriter('results_with_wiki_count.xlsx')

Diving_10m.to_excel(writer, 'Diving_10m')
Athletes_100m.to_excel(writer, 'Athletes_100m')
Archery.to_excel(writer, 'Archery')
Pole_Vault.to_excel(writer, 'Pole_Vault')
Modern_Pentathlon.to_excel(writer, 'Modern_Pentathlon')
Fencing_Epee.to_excel(writer, 'Fencing Epee')
Swimming_100m_freestyle.to_excel(writer, 'Swimming_100m_Freestyle')
Cycling_Road.to_excel(writer, 'Cycling_Road')
writer.save()