In [None]:
import pandas as pd
import requests
import wikipedia as wp
import csv
from urllib.parse import unquote
from unidecode import unidecode

In [None]:
# Read the Wikipedia link list and build a lookup from each percent-decoded URL
# to its ASCII-transliterated, lower-cased form.
# Missing URL cells are dropped before the string cast; otherwise astype(str)
# would turn them into the literal text 'nan' and register that as a link.
wiki_links = pd.read_csv('wiki_links.csv').filter(['URL'])
wiki_links = wiki_links['URL'].dropna().astype(str).tolist()
wiki_links = [unquote(element) for element in wiki_links]

# Transliterate and lower-case in a single pass instead of rewriting the
# dictionary values in a second loop.
wiki_dict = {element: unidecode(element).lower() for element in wiki_links}

In [None]:
def clean_data(df):
    """Deduplicate a results frame by name and prepare it for the wiki lookup.

    Rows sharing the same 'Names' value are collapsed to their first
    occurrence, the index is renumbered from 0, and the frame is then passed
    to add_name_url_columns().

    Args:
        df: DataFrame with a 'Names' column.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Non-inplace calls hand back fresh frames, so the caller's object stays
    # untouched and later column assignments do not act on a slice.
    deduplicated = df.drop_duplicates(subset=['Names'], keep='first')
    deduplicated = deduplicated.reset_index(drop=True)
    return add_name_url_columns(deduplicated)

In [None]:
# iterate over the results data frame and find the URL in wiki_links that corresponds to each athlete's name
# add the URL to the results data frame
def get_wiki_links(df):
    """Fill the 'URL' column with the page title matched to each name.

    Each value in 'Names' is split on whitespace and handed to find_link().
    Only an unambiguous result (exactly one matching link) is used: the text
    after the link's last '/' is written to that row's 'URL'. Rows with zero
    or several matches are left as they are.

    Args:
        df: DataFrame with 'Names' and 'URL' columns. Modified in place.

    Returns:
        None. Prints 'Done' once every row has been visited.
    """
    # Iterate over (index label, name) pairs so the read and the .loc write
    # address the same row even when the index is not 0..n-1.
    for row_label, name in df['Names'].items():
        # A missing name (NaN) cannot be split into tokens; skip the row.
        if not isinstance(name, str):
            continue
        matches = find_link(name.split())
        if len(matches) == 1:
            # rsplit returns the whole string when the link contains no '/',
            # where rindex('/') would raise ValueError.
            df.loc[row_label, 'URL'] = matches[0].rsplit('/', 1)[-1]
    print('Done')

In [None]:
def find_link(name_list):
    """Return every link in wiki_dict whose normalised text contains all name parts.

    Args:
        name_list: list of name tokens; each one is tested as a substring of
            the values stored in the module-level wiki_dict.

    Returns:
        List of wiki_dict keys whose value contains every token. Empty when
        nothing matches or when name_list is empty.
    """
    # all() over no tokens is vacuously true and would match every link, so an
    # empty token list is treated as "no match".
    if not name_list:
        return []
    return [
        link
        for link, normalised in wiki_dict.items()
        if all(part in normalised for part in name_list)
    ]
    

In [None]:
def reduce_string(string):
    """Trim article text at the references heading and strip layout characters.

    Everything from the first '== References ==' marker onwards is discarded.
    In what remains, newlines become spaces, and carriage returns and '='
    characters are removed.
    """
    body, _, _ = string.partition('== References ==')
    # Applied in this order: line breaks first, then the heading markup.
    for old, new in (('\n', ' '), ('\r', ''), ("==", ""), ("=", "")):
        body = body.replace(old, new)
    return body

In [None]:
def count(string):
    """Return the number of runs of space characters in `string`.

    Several consecutive spaces count as a single run; only the ' ' character
    is considered.
    """
    # Pair every character with the one before it (None for the first), then
    # count the positions where a space follows a non-space.
    predecessors = [None, *string]
    return sum(
        1
        for before, here in zip(predecessors, string)
        if here == " " and before != " "
    )

In [None]:
def count_words(wiki_title):
    """Return the length measure of the Wikipedia article for `wiki_title`.

    Hyphens in the title are replaced by underscores and '_(' by ' ('. The
    page is requested with that title; if that raises PageError, it is
    requested once more with underscores replaced by spaces. The article
    content is passed through reduce_string() and count().

    Args:
        wiki_title: page title string. None, NaN and empty strings are
            accepted and yield 0 without any lookup.

    Returns:
        int: count(reduce_string(content)) of the page found, or 0 when no
        page could be loaded (a message is printed in that case).
    """
    # A missing URL can arrive as None or as a float NaN from the data frame;
    # neither has string methods and there is nothing to look up.
    if not isinstance(wiki_title, str) or not wiki_title.strip():
        return 0

    wiki_title = wiki_title.replace('-', '_').replace('_(', ' (')

    article = None
    # Try the underscore form first, then the spaced form as a fallback.
    candidates = (wiki_title, wiki_title.replace('_', ' '))
    for attempt, candidate in enumerate(candidates):
        try:
            article = wp.page(candidate)
            break
        except wp.PageError:
            # Only report once the fallback title has failed as well.
            if attempt == len(candidates) - 1:
                print('Page Error: ' + candidate)
        except wp.DisambiguationError:
            # An ambiguous title is not retried.
            print('DisambiguationError' + candidate)
            break

    if article is None:
        return 0
    return count(reduce_string(article.content))

In [None]:
def add_name_url_columns(df):
    """Add the lookup columns and normalise the 'Names' column.

    Adds 'Wiki_Count' (initialised to 0) and 'URL' (initialised to None),
    then removes commas from 'Names', turns hyphens into spaces and
    lower-cases the result.

    Args:
        df: DataFrame with a string 'Names' column. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    df['Wiki_Count'] = 0
    df['URL'] = None

    # regex=False pins plain-text replacement; the default for `regex`
    # differs between pandas versions.
    df['Names'] = (
        df['Names']
        .str.replace(',', '', regex=False)
        .str.replace('-', ' ', regex=False)
        .str.lower()
    )
    return df

In [None]:
def filter_event(discipline, event):
    """Select one event from the module-level `results` frame and clean it.

    Args:
        discipline: value to match against the 'Discipline' column.
        event: value to match against the 'Event' column.

    Returns:
        The result of clean_data() applied to the matching rows.
    """
    is_selected = (results['Discipline'] == discipline) & (results['Event'] == event)
    # Take an explicit copy so later column assignments work on an independent
    # frame rather than on a selection of `results`.
    event_rows = results.loc[is_selected].copy()
    return clean_data(event_rows)


In [None]:
def get_wikipedia_count(Competition, keyword):
    """Fill 'Wiki_Count' with count_words() of each row's 'URL'.

    Args:
        Competition: DataFrame with 'URL' and 'Wiki_Count' columns.
            Modified in place.
        keyword: not used by this function; kept so existing calls keep
            working.

    Returns:
        The same DataFrame object that was passed in. The running row number
        and a final 'Done' are printed as progress output.
    """
    # Pair each index label with its URL so the value is written back to the
    # row it was read from, whatever the index looks like.
    rows = zip(Competition.index, Competition['URL'].values)
    for position, (row_label, title) in enumerate(rows):
        print(position, end=" ")
        Competition.loc[row_label, 'Wiki_Count'] = count_words(title)
    print('Done')
    return Competition


In [None]:
# Load the Rio 2016 Olympic Games results from the Excel workbook and keep
# only the columns used in this notebook.
results = (
    pd.read_excel('Results_Rio_2016.xlsx')
    .filter(
        items=['Sport', 'Discipline', 'Event', 'Phase', 'Names', 'Gender', 'Rank', 'Results'],
        axis=1,
    )
)

In [None]:
# Athletics 100m: select the event and look up Wikipedia links automatically
Athletes_100m = filter_event('Athletics', '100m')
get_wiki_links(Athletes_100m)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "cambridge aska": "Asuka_Cambridge",
    "ismail md fakhri": "Mohamed_Fakhri_Ismail",
    "al harthi barakat mubarak": "Barakat_Al-Harthi",
    "kim kukyoung": "Kim_Kuk-young",
    "kitson kapririel": "Kitson_Kapiriel",
    "povkh olesya": "Olesya_Povh",
    "pohrebniak natalia": "Nataliya_Pohrebnyak",
    "bazolo lorene dorcas": "Lorène_Bazolo",
    "seavula sisila": "Sisilia_Seavula",
    "stuy khrystyna": "Hrystyna_Stuy",
    "bouele cecilia": "Marcelle_Bouele_Bondo",
    "al alawi mazoon": "Mazoon Al Alawi",
    "fisher andrew": "Andrew Fisher (sprinter)",
    "ogunlewe seye": "Seye Ogunlewe (athlete)",
}
for athlete_name, page_title in url_overrides.items():
    Athletes_100m.loc[Athletes_100m['Names'] == athlete_name, 'URL'] = page_title

# get Wikipedia article count
get_wikipedia_count(Athletes_100m, 'athlete')


In [None]:
# Diving 10m platform: select the event and look up Wikipedia links automatically
Diving_10m = filter_event('Diving', '10m platform')
get_wiki_links(Diving_10m)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "minibaev victor": "Viktor_Minibaev",
    "ren qian": "Ren_Qian",
    "woo haram": "Woo_Ha-ram",
    "pamg pandelela rinong": "Pandelela_Rinong",
    "prokopchuk iuliia": "Yulia_Prokopchuk",
    "obrien brittany": "Brittany_O'Brien",
    "krasnoshlyk ganna": "Hanna_Krasnoshlyk",
    "abdel salam maha": "Maha_Gouda",
    "kim kuk hyang": "Kim_Kuk-hyang_(diver)",
    "quintero rafael": "Rafael Quintero (diver)",
    "riendeau vincent": "Vincent Riendeau (diver)",
}
for athlete_name, page_title in url_overrides.items():
    Diving_10m.loc[Diving_10m['Names'] == athlete_name, 'URL'] = page_title

# get Wikipedia article count
get_wikipedia_count(Diving_10m, 'diver')

# Hand-entered count that replaces the computed value for this athlete.
# NOTE(review): the source of the figure 125 is not recorded here - confirm.
Diving_10m.loc[Diving_10m['Names'] == "kim kuk hyang", 'Wiki_Count'] = 125


In [None]:
# Archery individual competition: select the event and look up Wikipedia links
Archery = filter_event('Archery', 'Individual Competition')
get_wiki_links(Archery)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "ku bonchan": "Ku_Bon-chan",
    "lee seungyun": "Lee_Seung-yun",
    "rodriguez liebana juan ignacio": "Juan_Ignacio_Rodríguez",
    "puentes perez adrian andres": "Adrián_Puentes",
    "alvarino garcia miguel": "Miguel_Alvariño",
    "kim woojin": "Kim_Woo-jin",
    "nesteng baard": "Bård_Nesteng",
    "prilepov anton": "Anton_Prylepau",
    "rezende xavier daniel": "Daniel_Xavier",
    "dalmeida marcus": "Marcus_Vinicius_D'Almeida",
    "kouassi rene philippe": "Philippe_Kouassi",
    "pila solano andres": "Andrés_Pila",
    "elder robert": "Rob_Elder",
    "nor hasrin muhammad akmal": "Akmal_Nor_Hasrin",
    "gantugs jantsan": "Jantsangiin_Gantögs",
    "muktan jitbahadur": "Jit_Bahadur_Muktan",
    "sutherland gavin ben": "Gavin_Sutherland_(archer)",
    "chang hyejin": "Chang_Hye-jin",
    "ki bobae": "Ki_Bo-bae",
    "choi misun": "Choi_Mi-sun",
    "dashidorzhieva tuiana": "Tuyana_Dashidorzhieva",
    "karma karma": "Karma_(archer)",
    "thiffeault picard georcy": "Georcy-Stéphanie_Picard",
    "rendon ana maria": "Ana_Rendón",
    "camilo gonzalez yessica": "Yessica_Camilo",
    "lipiarska palka karina": "Karina_Lipiarska",
}
for athlete_name, page_title in url_overrides.items():
    Archery.loc[Archery['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Archery, 'archer')

In [None]:
# Athletics pole vault: select the event and look up Wikipedia links
Pole_Vault = filter_event('Athletics', 'pole vault')
get_wiki_links(Pole_Vault)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "filippidis konstadinos": "Konstantinos_Filippidis",
    "svard jacobsson melker": "Melker_Svärd_Jacobsson",
    "stefanidi ekaterini": "Katerina_Stefanidi",
    "bleasdale holly": "Holly_Bradshaw",
    "suhr jennifer": "Jenn_Suhr",
    "peinado robeilys": "Robeilys_Peinado",
    "li ling": "Li_Ling_(pole_vaulter)",
    "yakaltsevich iryna": "Iryna_Zhuk",
    "kyriakopoulou nikoleta": "Nikoleta_Kyriakopoulou",
}
for athlete_name, page_title in url_overrides.items():
    Pole_Vault.loc[Pole_Vault['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Pole_Vault, 'athlete')



In [None]:
# Modern pentathlon individual competition: select the event and look up Wikipedia links
Modern_Pentathlon = filter_event('Modern Pentathlon', 'Individual competition')
get_wiki_links(Modern_Pentathlon)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "lesun alexander": "Aleksander_Lesun",
    "hernandez uscanga ismael": "Ismael_Hernández_(pentathlete)",
    "lanigan okeeffe arthur": "Arthur_Lanigan-O'Keeffe",
    "choong joseph": "Joe_Choong",
    "jung jinhwa": "Jung_Jin-hwa",
    "cooke jamie": "James_Cooke_(pentathlete)",
    "jun woongtae": "Jun_Woong-tae",
    "nakonechnyi ruslans": "Ruslans_Nakoņečnijs",
    "iliashenko pavel": "Pavel_Ilyashenko",
    "kim sunwoo": "Kim_Sun-woo_(pentathlete)",
    "marques yane marcia": "Yane_Marques",
    "moya leydi laura": "Leydi_Moya",
}
for athlete_name, page_title in url_overrides.items():
    Modern_Pentathlon.loc[Modern_Pentathlon['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Modern_Pentathlon, 'athlete')

In [None]:
# Fencing épée individual: select the event and look up Wikipedia links
Fencing_Epee = filter_event('Fencing', 'épée individual')
get_wiki_links(Fencing_Epee)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "park sangyoung": "Park_Sang-young",
    "nikishin bogdan": "Bohdan_Nikishyn",
    "limardo gascon francisco a.": "Francisco_Limardo",
    "park kyoungdoo": "Park_Kyoung-doo",
    "limardo gascon ruben": "Rubén_Limardo",
    "herey anatolii": "Anatoliy_Herey",
    "rodriguez john edison": "Jhon_Édison_Rodríguez",
    "jung jinsun": "Jung_Jin-sun",
    "alshatti abdulaziz": "Abdulaziz_Al-Shatti",
    "choi injeong": "Choi_In-jeong",
    "nakano nozomi": "Nozomi_Satō",
    "branza ana maria": "Ana_Maria_Popescu",
    "kong man wai vivian": "Vivian_Kong",
}
for athlete_name, page_title in url_overrides.items():
    Fencing_Epee.loc[Fencing_Epee['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Fencing_Epee, 'fencer')


In [None]:
# Swimming 100m freestyle: select the event and look up Wikipedia links
Swimming_100m_freestyle = filter_event('Swimming', '100m freestyle')
get_wiki_links(Swimming_100m_freestyle)

# Manual page titles, keyed by the normalised name in the results frame;
# applied after the automatic lookup.
url_overrides = {
    "perez urena jhonny": "Jhonny_Pérez",
    "gunn sean michael": "Sean_Gunn_(swimmer)",
    "hockin brusquetti benjamin": "Ben_Hockin",
    "park taehwan": "Park_Tae-hwan",
    "gkolomeev kristian": "Kristian_Golomeev",
    "nilo nicolas": "Nicolas_Oliveira",
    "riveros schulz karen": "Karen_Riveros",
    "popova veronika": "Veronika_Andrusenko",
    "lovtcova natalia": "Nataliya_Lovtsova",
}
for athlete_name, page_title in url_overrides.items():
    Swimming_100m_freestyle.loc[Swimming_100m_freestyle['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Swimming_100m_freestyle, 'swimmer')

In [None]:
# Cycling individual road race: select the event
Cycling_Road = filter_event('Cycling Road', 'individual road race')

# Keep only the first 30 rows for each gender
Cycling_Road_Men = Cycling_Road.loc[Cycling_Road['Gender'] == 'Men'].iloc[0:30]
Cycling_Road_Women = Cycling_Road.loc[Cycling_Road['Gender'] == 'Women'].iloc[0:30]

# pd.concat replaces DataFrame.append (removed in pandas 2.0);
# ignore_index=True renumbers the combined rows from 0.
Cycling_Road = pd.concat([Cycling_Road_Men, Cycling_Road_Women], ignore_index=True)

# Run the automatic lookup first, as in the other event cells, so that the
# manual titles below take precedence instead of being overwritten by it.
get_wiki_links(Cycling_Road)

# Manual page titles, keyed by the normalised name in the results frame.
url_overrides = {
    "rodriguez oliver joaquim": "Joaquim_Rodríguez",
    "costa rui alberto faria": "Rui_Costa_(cyclist)",
    "froome christopher": "Chris_Froome",
    "chaves rubio jhoan esteban": "Esteban_Chaves",
    "valverde belmonte alejandro": "Alejandro_Valverde",
    "armitstead elizabeth": "Lizzie_Deignan",
    "moolman pasio ashleigh": "Ashleigh_Moolman",
    "jasinska malgorzta": "Małgorzata_Jasińska",
    "sierra canadilla arlenis": "Arlenis_Sierra",
    "na ahreum": "Na_Ah-reum",
}
for athlete_name, page_title in url_overrides.items():
    Cycling_Road.loc[Cycling_Road['Names'] == athlete_name, 'URL'] = page_title

# get wikipedia count
get_wikipedia_count(Cycling_Road, 'cyclist')


In [None]:
# Write every event frame to its own sheet of one workbook.
# The with-block saves and closes the file on exit (also when a write fails),
# replacing the explicit writer.save() call that pandas 2.0 removed.
with pd.ExcelWriter('results_with_wiki_count.xlsx') as writer:
    Diving_10m.to_excel(writer, sheet_name='Diving_10m')
    Athletes_100m.to_excel(writer, sheet_name='Athletes_100m')
    Archery.to_excel(writer, sheet_name='Archery')
    Pole_Vault.to_excel(writer, sheet_name='Pole_Vault')
    Modern_Pentathlon.to_excel(writer, sheet_name='Modern_Pentathlon')
    Fencing_Epee.to_excel(writer, sheet_name='Fencing_Epee')
    Swimming_100m_freestyle.to_excel(writer, sheet_name='Swimming_100m_Freestyle')
    Cycling_Road.to_excel(writer, sheet_name='Cycling_Road')