In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the eight competition sheets from the results workbook.
# The workbook is parsed once (read_excel returns a dict of DataFrames keyed by
# sheet name when given a list), instead of re-opening the file for every sheet.
_RESULTS_WORKBOOK = 'results_with_wiki_count.xlsx'
_SHEET_NAMES = [
    'Diving_10m',
    'Archery',
    'Fencing_Epee',
    'Modern_Pentathlon',
    'Pole_Vault',
    'Swimming_100m_Freestyle',
    'Cycling_Road',
    'Athletes_100m',
]
_sheets = pd.read_excel(_RESULTS_WORKBOOK, sheet_name=_SHEET_NAMES)

# One DataFrame per competition, named after its sheet.
Diving_10m = _sheets['Diving_10m']
Archery = _sheets['Archery']
Fencing_Epee = _sheets['Fencing_Epee']
Modern_Pentathlon = _sheets['Modern_Pentathlon']
Pole_Vault = _sheets['Pole_Vault']
Swimming_100m_Freestyle = _sheets['Swimming_100m_Freestyle']
Cycling_Road = _sheets['Cycling_Road']
Athletes_100m = _sheets['Athletes_100m']

In [None]:
def plot_means(y):
    """Draw the given values as blue bars, ticked "Women" and "Men".

    Args:
        y: bar heights; the first value is labelled "Women", the second "Men".
    """
    positions = range(len(y))
    plt.bar(positions, y, 1/2, color="blue")
    plt.xticks(positions, ["Women", "Men"])

In [None]:
def plot_median(y):
    """Draw the given values as red bars, ticked "Women" and "Men".

    Args:
        y: bar heights; the first value is labelled "Women", the second "Men".
    """
    positions = range(len(y))
    plt.bar(positions, y, 1/2, color="red")
    plt.xticks(positions, ["Women", "Men"])

In [None]:
# Two-sample t-test comparing the Wiki_Count of male and female athletes in a single competition sheet
#stats.ttest_ind(males, females)

def ttest(sportart,kategorie):
    '''Compare the Wiki_Count of men and women in one competition with a t-test.

    Prints the category label and, for each gender, the group size, mean and
    median of the 'Wiki_Count' column; shows the means and medians as two
    side-by-side bar charts; then runs an independent two-sample t-test.

    Args:
        sportart: DataFrame of one competition. Must contain a 'Gender' column
            (rows are selected where it equals 'Men' / 'Women') and a
            'Wiki_Count' column.
        kategorie: one of 'words', 'edits' or 'links'.
            NOTE(review): the category is only validated and printed; all
            statistics are computed on 'Wiki_Count' regardless of its value.
            Confirm whether it was meant to select a different column.

    Returns:
        statistic, pvalue : the result of stats.ttest_ind(men, women), i.e. the
        calculated t-statistic and the two-tailed p-value.

    Raises:
        KeyError: if kategorie is not a known category.
    '''
    # Known categories. The numbers look like column positions but are not
    # used anywhere in this function -- TODO confirm their intended purpose.
    kat={'words':4,'edits':5,'links':6}
    if kategorie not in kat:
        raise KeyError(kategorie)

    # create subgroups of male and female athletes
    men = sportart.loc[sportart['Gender'] == 'Men']
    women = sportart.loc[sportart['Gender'] == 'Women']

    print(kategorie)

    mean_women = women['Wiki_Count'].mean()
    mean_men = men['Wiki_Count'].mean()
    median_women = women['Wiki_Count'].median()
    median_men = men['Wiki_Count'].median()

    print('%d women, mean: %f, median: %f' % (len(women), mean_women, median_women) )
    print('%d men, mean: %f, median: %f' % (len(men), mean_men, median_men) )

    # left panel: means, right panel: medians
    plt.subplot(1, 2, 1)
    plot_means([mean_women, mean_men])
    plt.subplot(1, 2, 2)
    plot_median([median_women, median_men])

    plt.show()

    # conduct t test
    return stats.ttest_ind(men['Wiki_Count'], women['Wiki_Count'])

In [None]:
# kategorie may be 'words', 'edits' or 'links'
print(ttest(sportart=Athletes_100m, kategorie='words'))

In [None]:
def intify(stringseries):
    """Convert every element of an iterable to int and return them as a Series.

    Args:
        stringseries: iterable of values accepted by int() (e.g. a Series of
            numeric strings).

    Returns:
        A new pandas Series of the converted values. It carries a fresh
        default index (0..n-1); the index of the input is not preserved.

    Raises:
        ValueError: if an element cannot be parsed by int().
    """
    # Comprehension instead of an accumulator named `list`, which shadowed the builtin.
    return pd.Series([int(value) for value in stringseries])

In [None]:
def createDataFrame(data):
    """Build a DataFrame from a list of rows whose first row is the header.

    The first element of every row (header included) is dropped; the
    remaining header cells become the column names and the remaining cells of
    each later row become the data. The resulting frame uses the default
    integer index.

    Args:
        data: sequence of sliceable rows; data[0] is the header row.

    Returns:
        pandas DataFrame with one row per entry of data[1:].
    """
    # The dropped first column was previously collected into a list that was never used.
    rows = [line[1:] for line in data[1:]]
    return pd.DataFrame(data=rows, columns=data[0][1:])

In [None]:
# Build the combined athlete table and convert its count columns from strings to int.
# NOTE(review): `data` is not defined in the cells visible here -- presumably it is
# loaded elsewhere; confirm the notebook survives Restart & Run All.
allpeople = createDataFrame(data)
allpeople['Wordcount'] = intify(allpeople.Wordcount)
allpeople['Editcount'] = intify(allpeople.Editcount)
allpeople['Linkcount'] = intify(allpeople.Linkcount)

In [None]:
# Split the athletes by their Gender code: 'm', 'f', or '0' for unassigned.
allmen = allpeople.loc[allpeople['Gender'].eq('m')]
allwomen = allpeople.loc[allpeople['Gender'].eq('f')]
unassigned = allpeople.loc[allpeople['Gender'].eq('0')]

print(''.join([
    'Women: ', str(len(allwomen)),
    ' Men: ', str(len(allmen)),
    ' Unassigned: ', str(len(unassigned)),
]))

In [None]:
def extremes_per_sport(data, gender):
    """Largest and smallest Wordcount per sport for athletes of one gender.

    Args:
        data: DataFrame with 'Sport', 'Gender' and 'Wordcount' columns.
        gender: value of the 'Gender' column to keep (e.g. 'f' or 'm').

    Returns:
        DataFrame indexed by every sport occurring in `data` (all genders, in
        order of first appearance) with columns 'Max' and 'Min'. A sport with
        no athlete of the requested gender gets NaN in both columns.
    """
    sports = data.Sport.unique()
    of_gender = data.loc[data['Gender'] == gender]
    # Locals renamed so they no longer shadow the builtins max/min.
    highest = []
    lowest = []
    for sport in sports:
        # Filter once per sport and reuse the selection for both extremes.
        wordcounts = of_gender.loc[of_gender['Sport'] == sport].Wordcount
        highest.append(wordcounts.max())
        lowest.append(wordcounts.min())
    return pd.DataFrame({'Max': np.asarray(highest),
                         'Min': np.asarray(lowest)}, index=sports)

In [None]:
def plot_max(data, gender):
    """Show a horizontal bar chart of the maximum Wordcount per sport for one gender.

    Args:
        data: DataFrame with 'Sport', 'Gender' and 'Wordcount' columns.
        gender: value of the 'Gender' column to plot (e.g. 'f' or 'm').
    """
    sports = data.Sport.unique()
    positions = np.arange(len(sports))
    largest = np.asarray(extremes_per_sport(data, gender)['Max'])

    # one green bar per sport, labelled with the sport name
    plt.barh(positions, largest, align='center', alpha=0.5, color='green')
    plt.yticks(positions, sports)
    plt.xlabel('Wordcount')
    plt.title('Max Wordcount')

    plt.show()

In [None]:
# Maximum Wordcount per sport, women
plot_max(data=allpeople, gender='f')

In [None]:
# Maximum Wordcount per sport, men
plot_max(data=allpeople, gender='m')

In [None]:
def plot_min(data, gender):
    """Show a horizontal bar chart of the minimum Wordcount per sport for one gender.

    Args:
        data: DataFrame with 'Sport', 'Gender' and 'Wordcount' columns.
        gender: value of the 'Gender' column to plot (e.g. 'f' or 'm').
    """
    sports = data.Sport.unique()
    positions = np.arange(len(sports))
    smallest = np.asarray(extremes_per_sport(data, gender)['Min'])

    # one green bar per sport, labelled with the sport name
    plt.barh(positions, smallest, align='center', alpha=0.5, color='green')
    plt.yticks(positions, sports)
    plt.xlabel('Wordcount')
    plt.title('Min Wordcount')

    plt.show()

In [None]:
# Minimum Wordcount per sport, women
plot_min(data=allpeople, gender='f')

In [None]:
# Minimum Wordcount per sport, men
plot_min(data=allpeople, gender='m')

In [None]:
def histogram(data):
    """Show an 8-bin, half-transparent blue histogram of `data`.

    Args:
        data: values to bin; the notebook passes a pandas Series (one column
            of a DataFrame).
    """
    plt.hist(data, bins=8, facecolor='blue', alpha=0.5)
    plt.show()

In [None]:
# Distribution of Wordcount, men
histogram(allpeople.loc[allpeople['Gender'] == 'm', 'Wordcount'])

In [None]:
# Distribution of Wordcount, women
histogram(allpeople.loc[allpeople['Gender'] == 'f', 'Wordcount'])

In [None]:
# Distribution of Editcount, women
histogram(allpeople.loc[allpeople['Gender'] == 'f', 'Editcount'])

In [None]:
# Distribution of Editcount, men
histogram(allpeople.loc[allpeople['Gender'] == 'm', 'Editcount'])

In [None]:
def gender_per_sport(data):
    """Count female and male athletes per sport.

    Args:
        data: DataFrame with 'Sport' and 'Gender' columns; gender codes 'f'
            and 'm' are counted, any other value is ignored.

    Returns:
        DataFrame indexed by sport (order of first appearance) with columns
        'f' and 'm' holding the head counts and 'equal' holding the string
        'true' when both counts match and 'false' otherwise.
    """
    sports = data.Sport.unique()
    women = data.loc[data['Gender'] == 'f']
    men = data.loc[data['Gender'] == 'm']

    # (women, men) head count for every sport, in the order of `sports`
    head_counts = [
        (len(women.loc[women['Sport'] == sport]), len(men.loc[men['Sport'] == sport]))
        for sport in sports
    ]
    female = [n_women for n_women, _ in head_counts]
    male = [n_men for _, n_men in head_counts]
    # kept as strings: callers compare against the literal 'true'
    equal_genders = ['true' if n_women == n_men else 'false'
                     for n_women, n_men in head_counts]

    return pd.DataFrame({'f': np.asarray(female),
                         'm': np.asarray(male),
                         'equal': np.asarray(equal_genders)}, index=sports)

In [None]:
per_sport = gender_per_sport(allpeople)
# Sports in which the number of male and female athletes is the same
per_sport[per_sport['equal'] == 'true']