In [1]:
import pandas as pd
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import os

<hr>
<hr>
<h1># 02 - Ergebnisdaten: Harmonisierung der geparsten Daten:</h1>
<h2>1. Berlin</h2>
<h2>2. Chicago</h2>
<h2>3. London</h2>
<h2>4. New York</h2>
<h2>5. Tokyo</h2>
<hr>
<hr>
<p><b>Hinweis:</b> Dateipfade sind absolut angegeben und müssen entsprechend der eigenen Verzeichnisstruktur angepasst werden! <br>
Beim Angeben des Hauptverzeichnisses bitte sicherstellen, dass folgende Unterverzeichnisse angelegt sind:
<ul>
<li>/Berlin/</li>
<li>/Chicago/</li>
<li>/London/</li>
<li>/NYC/</li>
<li>/Tokyo/</li>
<li>/wmm_data/</li>
</ul>
<br>
Da sonst eine Anpassung im Code notwendig wird!
</p>
<hr>
<p><b>ACHTUNG: Unbedingt vor dem Ausführen beachten:</b> <br>
Die Namen der Athletinnen und Athleten des Tokyo-Wettbewerbs müssen vorher in den geparsten Dateien aus dem Japanischen (händisch) übersetzt werden! <br>
Ansonsten bricht das Skript ab!</p>
<hr>
<hr>

In [3]:
# Root data directory.
# Defaults to the original author's location. Set the WMM_DATA_PATH environment
# variable to run the notebook against another directory without editing this cell.
# The root directory must contain the following sub-directories (otherwise the
# paths in the cells below have to be adjusted):
# /Berlin/
# /Chicago/
# /London/
# /NYC/
# /Tokyo/
# /wmm_data/
#
# os.path.join(..., '') guarantees a trailing path separator, because the cells
# below build file paths by plain string concatenation.
main_data_path = os.path.join(
    os.environ.get('WMM_DATA_PATH', '/home/paul/python_projects/masterthesis/data/'),
    '',
)

# Do not truncate wide frames when they are displayed
pd.set_option('display.max_columns', 1000)

<h1>1. Berlin-Marathon:</h1>
<hr>

In [3]:
# Load the Berlin results. header=0 discards the file's own header row; the
# explicit names below are used as column labels instead.
berlin_csv_columns = [
    'Jahr', 'Platz', 'Vorname', 'Nachname', 'NAT', 'SN', 'Verein', 'Geschlecht', 'Netto', 'Brutto',
    '5KM', '10KM', '15KM', '20KM', 'HM', '25KM', '30KM', '35KM', '40KM',
]
df_berlin_data_csv = pd.read_csv(
    main_data_path + 'Berlin/daten_wmm_berlin_all.csv',
    sep=';',
    decimal='.',
    header=0,
    names=berlin_csv_columns,
)

df_berlin_data_csv.head()

Unnamed: 0,Jahr,Platz,Vorname,Nachname,NAT,SN,Verein,Geschlecht,Netto,Brutto,5KM,10KM,15KM,20KM,HM,25KM,30KM,35KM,40KM
0,2019,1,Kenenisa,Bekele,ETH,2,Äthiopien,M,02:01:41,02:01:41,00:14:24,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:55,01:41:15,01:55:30
1,2019,2,Birhanu,Legese,ETH,5,Äthiopien,M,02:02:48,02:02:48,00:14:25,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:53,01:41:02,01:56:00
2,2019,3,Sisay,Lemma,ETH,4,Äthiopien,M,02:03:36,02:03:36,00:14:25,00:28:52,00:43:29,00:57:58,01:01:06,01:12:31,01:26:53,01:41:16,01:56:54
3,2019,4,Jonathan,Korir,KEN,7,Kenia,M,02:06:45,02:06:45,00:14:25,00:28:53,00:43:30,00:57:58,01:01:06,01:12:31,01:27:42,01:43:39,01:59:49
4,2019,5,Felix,Kandie,KEN,6,Kenia,M,02:08:07,02:08:07,00:14:43,00:29:30,00:44:19,00:59:06,01:02:20,01:14:04,01:28:58,01:44:10,02:00:27


In [4]:
# Harmonise the Berlin frame in one chain:
#   1. drop the columns that are not part of the shared schema,
#   2. add the placeholder columns (Datum, Startzeit) and the constant location,
#   3. map the split-time columns to the shared T_KM_* names.
berlin_harmonised = (
    df_berlin_data_csv
    .drop(columns=['NAT', 'SN', 'Verein', 'Brutto'])
    .assign(Datum=None, Startzeit=None, Ort='Berlin')
    .rename(columns={
        'Netto': 'T_KM_FN',
        '5KM': 'T_KM_5',
        '10KM': 'T_KM_10',
        '15KM': 'T_KM_15',
        '20KM': 'T_KM_20',
        'HM': 'T_KM_HM',
        '25KM': 'T_KM_25',
        '30KM': 'T_KM_30',
        '35KM': 'T_KM_35',
        '40KM': 'T_KM_40',
    })
)

# Bring the columns into the order shared by all races
df_berlin_data = berlin_harmonised[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]
df_berlin_data.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2019,Berlin,M,Kenenisa,Bekele,1,,,02:01:41,00:14:24,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:55,01:41:15,01:55:30
1,2019,Berlin,M,Birhanu,Legese,2,,,02:02:48,00:14:25,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:53,01:41:02,01:56:00
2,2019,Berlin,M,Sisay,Lemma,3,,,02:03:36,00:14:25,00:28:52,00:43:29,00:57:58,01:01:06,01:12:31,01:26:53,01:41:16,01:56:54
3,2019,Berlin,M,Jonathan,Korir,4,,,02:06:45,00:14:25,00:28:53,00:43:30,00:57:58,01:01:06,01:12:31,01:27:42,01:43:39,01:59:49
4,2019,Berlin,M,Felix,Kandie,5,,,02:08:07,00:14:43,00:29:30,00:44:19,00:59:06,01:02:20,01:14:04,01:28:58,01:44:10,02:00:27


<hr>
<h1>2. Chicago-Marathon:</h1>
<hr>

In [10]:

# Column labels applied to every yearly Chicago file; they replace each file's
# own header row (header=0).
chicago_csv_columns = [
    'Jahr', 'Ges', 'Name', 'Age_Group', 'Runner_Number', 'Place_Gender', 'Place_Age_Group',
    'Place_Overall', 'Start_Time_of_day', 'Finish_Time',
    '5K_Time', '5K_Diff', '5K_min_km', '5K_kmh',
    '10K_Time', '10K_Diff', '10K_min_km', '10K_kmh',
    '15K_Time', '15K_Diff', '15K_min_km', '15K_kmh',
    '20K_Time', '20K_Diff', '20K_min_km', '20K_kmh',
    'HK_Time', 'HK_Diff', 'HK_min_km', 'HK_kmh',
    '25K_Time', '25K_Diff', '25K_min_km', '25K_kmh',
    '30K_Time', '30K_Diff', '30K_min_km', '30K_kmh',
    '35K_Time', '35K_Diff', '35K_min_km', '35K_kmh',
    '40K_Time', '40K_Diff', '40K_min_km', '40K_kmh',
    'FK_Time', 'FK_Diff', 'FK_min_km', 'FK_kmh',
]

# Read one frame per year (2007 to 2019) and combine them with a single
# pd.concat call. DataFrame.append, which was used here before, was removed in
# pandas 2.0. Column dtypes are the ones read_csv infers from the files.
chicago_yearly_frames = [
    pd.read_csv(
        main_data_path + 'Chicago/daten_wmm_chicago_' + str(year) + '.csv',
        header=0,
        sep=';',
        decimal='.',
        names=chicago_csv_columns,
    )
    for year in range(2007, 2020)
]
df_chicago_data_csv = pd.concat(chicago_yearly_frames, ignore_index=True)

df_chicago_data_csv.head(50)

Unnamed: 0,Jahr,Ges,Name,Age_Group,Runner_Number,Place_Gender,Place_Age_Group,Place_Overall,Start_Time_of_day,Finish_Time,5K_Time,5K_Diff,5K_min_km,5K_kmh,10K_Time,10K_Diff,10K_min_km,10K_kmh,15K_Time,15K_Diff,15K_min_km,15K_kmh,20K_Time,20K_Diff,20K_min_km,20K_kmh,HK_Time,HK_Diff,HK_min_km,HK_kmh,25K_Time,25K_Diff,25K_min_km,25K_kmh,30K_Time,30K_Diff,30K_min_km,30K_kmh,35K_Time,35K_Diff,35K_min_km,35K_kmh,40K_Time,40K_Diff,40K_min_km,40K_kmh,FK_Time,FK_Diff,FK_min_km,FK_kmh
0,2007,M,"Ivuti, Patrick (KEN)",25-29,8,1,1,1,–,02:11:11,00:15:44,15:44,03:09,19.07,00:31:32,15:48,03:10,18.99,00:47:17,15:45,03:09,19.05,01:02:39,15:22,03:05,19.52,01:05:52,03:13,02:56,20.46,01:17:48,11:56,03:04,19.62,01:33:05,15:17,03:04,19.63,01:49:06,16:01,03:13,18.73,02:04:36,15:30,03:06,19.35,02:11:11,06:35,03:00,20.01
1,2007,M,"Gharib, Jaouad (MAR)",35-39,7,2,1,2,–,02:11:11,00:15:44,15:44,03:09,19.07,00:31:31,15:47,03:10,19.01,00:47:17,15:46,03:10,19.03,01:02:39,15:22,03:05,19.52,01:05:53,03:14,02:57,20.36,01:17:48,11:55,03:04,19.65,01:33:05,15:17,03:04,19.63,01:49:06,16:01,03:13,18.73,02:04:37,15:31,03:07,19.33,02:11:11,06:34,03:00,20.06
2,2007,M,"Njenga, Daniel (KEN)",30-34,3,3,1,3,–,02:12:45,00:15:44,15:44,03:09,19.07,00:31:32,15:48,03:10,18.99,00:47:18,15:46,03:10,19.03,01:02:40,15:22,03:05,19.52,01:05:53,03:13,02:56,20.46,01:17:48,11:55,03:04,19.65,01:33:05,15:17,03:04,19.63,01:49:06,16:01,03:13,18.73,02:05:18,16:12,03:15,18.52,02:12:45,07:27,03:24,17.68
3,2007,M,"Cheruiyot, Robert K. (KEN)",25-29,1,4,2,4,–,02:16:13,00:15:44,15:44,03:09,19.07,00:31:31,15:47,03:10,19.01,00:47:16,15:45,03:09,19.05,01:02:39,15:23,03:05,19.5,01:05:52,03:13,02:56,20.46,01:17:47,11:55,03:04,19.65,01:33:05,15:18,03:04,19.61,01:49:06,16:01,03:13,18.73,02:07:50,18:44,03:45,16.01,02:16:13,08:23,03:50,15.71
4,2007,M,"Maiyo, Benjamin (KEN)",25-29,4,5,3,5,–,02:16:59,00:15:44,15:44,03:09,19.07,00:31:31,15:47,03:10,19.01,00:47:16,15:45,03:09,19.05,01:02:39,15:23,03:05,19.5,01:05:53,03:14,02:57,20.36,01:17:48,11:55,03:04,19.65,01:33:27,15:39,03:08,19.17,01:50:36,17:09,03:26,17.49,02:08:58,18:22,03:41,16.33,02:16:59,08:01,03:40,16.43
5,2007,M,"Cheboiboch, Christopher (KEN)",30-34,12,6,2,6,–,02:17:17,00:15:44,15:44,03:09,19.07,00:31:34,15:50,03:10,18.95,00:47:18,15:44,03:09,19.07,01:02:55,15:37,03:08,19.21,01:06:24,03:29,03:11,18.9,01:18:59,12:35,03:14,18.61,01:35:40,16:41,03:21,17.98,01:52:48,17:08,03:26,17.51,02:09:47,16:59,03:24,17.66,02:17:17,07:30,03:26,17.56
6,2007,M,"Lee, Bong ju (KOR)",35-39,10,7,2,7,–,02:17:29,00:15:44,15:44,03:09,19.07,00:31:31,15:47,03:10,19.01,00:47:17,15:46,03:10,19.03,01:02:59,15:42,03:09,19.11,01:06:31,03:32,03:14,18.63,01:19:14,12:43,03:16,18.42,01:35:47,16:33,03:19,18.13,01:52:50,17:03,03:25,17.6,02:09:57,17:07,03:26,17.53,02:17:29,07:32,03:26,17.48
7,2007,M,"Cox, Michael (USA)",30-34,330,8,3,8,–,02:21:42,00:16:33,16:33,03:19,18.13,00:32:57,16:24,03:17,18.29,00:49:32,16:35,03:19,18.09,01:06:09,16:37,03:20,18.05,01:09:48,03:39,03:20,18.03,01:22:42,12:54,03:19,18.15,01:39:20,16:38,03:20,18.04,01:56:35,17:15,03:27,17.39,02:14:08,17:33,03:31,17.09,02:21:42,07:34,03:27,17.41
8,2007,M,"Flogel, Jason (USA)",20-24,343,9,1,9,–,02:26:34,00:16:51,16:51,03:23,17.8,00:33:26,16:35,03:19,18.09,00:50:17,16:51,03:23,17.8,01:07:16,16:59,03:24,17.66,01:11:03,03:47,03:27,17.4,01:24:24,13:21,03:26,17.54,01:41:53,17:29,03:30,17.16,02:00:05,18:12,03:39,16.48,02:18:28,18:23,03:41,16.32,02:26:34,08:06,03:42,16.26
9,2007,M,"Blake, Eric (USA)",25-29,91,10,4,10,–,02:26:55,00:17:04,17:04,03:25,17.58,00:33:39,16:35,03:19,18.09,00:50:31,16:52,03:23,17.79,01:07:31,17:00,03:24,17.65,01:11:17,03:46,03:27,17.47,01:24:50,13:33,03:29,17.28,01:42:37,17:47,03:34,16.87,02:00:57,18:20,03:40,16.36,02:19:08,18:11,03:39,16.5,02:26:55,07:47,03:33,16.92


In [6]:
# Drop the columns that are not part of the harmonised schema
df_chicago_data = df_chicago_data_csv.drop(
    columns=[
        'Age_Group', 'Runner_Number', 'Place_Age_Group', 'Finish_Time', 'Place_Overall',
        '5K_Diff', '5K_min_km', '5K_kmh',
        '10K_Diff', '10K_min_km', '10K_kmh',
        '15K_Diff', '15K_min_km', '15K_kmh',
        '20K_Diff', '20K_min_km', '20K_kmh',
        'HK_Diff', 'HK_min_km', 'HK_kmh',
        '25K_Diff', '25K_min_km', '25K_kmh',
        '30K_Diff', '30K_min_km', '30K_kmh',
        '35K_Diff', '35K_min_km', '35K_kmh',
        '40K_Diff', '40K_min_km', '40K_kmh',
        'FK_Diff', 'FK_min_km', 'FK_kmh',
    ])

# Add the columns the source does not provide
df_chicago_data['Datum'] = None
df_chicago_data['Ort'] = 'Chicago'

# Map the source column names to the harmonised names ('Name' is split below)
df_chicago_data = df_chicago_data.rename(
    columns={
        'Ges': 'Geschlecht', 'Place_Gender': 'Platz', 'FK_Time': 'T_KM_FN', 'Start_Time_of_day': 'Startzeit',
        '5K_Time': 'T_KM_5', '10K_Time': 'T_KM_10', '15K_Time': 'T_KM_15', '20K_Time': 'T_KM_20',
        'HK_Time': 'T_KM_HM', '25K_Time': 'T_KM_25', '30K_Time': 'T_KM_30', '35K_Time': 'T_KM_35',
        '40K_Time': 'T_KM_40',
    })

# Split 'Name' (e.g. "Ivuti, Patrick (KEN)") into surname and given name(s).
# The text before the first comma is the surname. The part after it is split on
# spaces, and its first token (the empty string left by the space after the
# comma) and last token (the nationality in parentheses) are discarded.
# The columns are assigned as a whole: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to modify the frame.
chicago_name_parts = [full_name.split(',') for full_name in df_chicago_data['Name']]
df_chicago_data['Nachname'] = [parts[0] for parts in chicago_name_parts]
df_chicago_data['Vorname'] = [' '.join(parts[1].split(' ')[1:-1]) for parts in chicago_name_parts]

# Bring the columns into the order shared by all races (this also drops 'Name')
df_chicago_data = df_chicago_data[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]

df_chicago_data.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2007,Chicago,M,Patrick,Ivuti,1,,–,02:11:11,00:15:44,00:31:32,00:47:17,01:02:39,01:05:52,01:17:48,01:33:05,01:49:06,02:04:36
1,2007,Chicago,M,Jaouad,Gharib,2,,–,02:11:11,00:15:44,00:31:31,00:47:17,01:02:39,01:05:53,01:17:48,01:33:05,01:49:06,02:04:37
2,2007,Chicago,M,Daniel,Njenga,3,,–,02:12:45,00:15:44,00:31:32,00:47:18,01:02:40,01:05:53,01:17:48,01:33:05,01:49:06,02:05:18
3,2007,Chicago,M,Robert K.,Cheruiyot,4,,–,02:16:13,00:15:44,00:31:31,00:47:16,01:02:39,01:05:52,01:17:47,01:33:05,01:49:06,02:07:50
4,2007,Chicago,M,Benjamin,Maiyo,5,,–,02:16:59,00:15:44,00:31:31,00:47:16,01:02:39,01:05:53,01:17:48,01:33:27,01:50:36,02:08:58


<hr>
<h1>3. London-Marathon:</h1>
<hr>

In [7]:
# Column labels applied to every yearly London file; they replace each file's
# own header row (header=0).
london_csv_columns = [
    'Jahr', 'Ges', 'Name', 'Club', 'Category', 'Runner_Number', 'Place_Gender', 'Place_Category',
    'Place_Overall', 'Start_Time_of_day', 'Finish_Time', 'Race_Status', 'Last_Split',
    '5K_Time_of_day', '5K_Time', '5K_Diff', '5K_min_km', '5K_kmh',
    '10K_Time_of_day', '10K_Time', '10K_Diff', '10K_min_km', '10K_kmh',
    '15K_Time_of_day', '15K_Time', '15K_Diff', '15K_min_km', '15K_kmh',
    '20K_Time_of_day', '20K_Time', '20K_Diff', '20K_min_km', '20K_kmh',
    'HK_Time_of_day', 'HK_Time', 'HK_Diff', 'HK_min_km', 'HK_kmh',
    '25K_Time_of_day', '25K_Time', '25K_Diff', '25K_min_km', '25K_kmh',
    '30K_Time_of_day', '30K_Time', '30K_Diff', '30K_min_km', '30K_kmh',
    '35K_Time_of_day', '35K_Time', '35K_Diff', '35K_min_km', '35K_kmh',
    '40K_Time_of_day', '40K_Time', '40K_Diff', '40K_min_km', '40K_kmh',
    'FK_Time_of_day', 'FK_Time', 'FK_Diff', 'FK_min_km', 'FK_kmh',
]

# Read one frame per year (2010 to 2019) and combine them with a single
# pd.concat call. DataFrame.append, which was used here before, was removed in
# pandas 2.0. Column dtypes are the ones read_csv infers from the files.
london_yearly_frames = [
    pd.read_csv(
        main_data_path + 'London/daten_wmm_london_' + str(year) + '.csv',
        header=0,
        sep=';',
        decimal='.',
        names=london_csv_columns,
    )
    for year in range(2010, 2020)
]
df_london_data_csv = pd.concat(london_yearly_frames, ignore_index=True)

df_london_data_csv.head()

Unnamed: 0,Jahr,Ges,Name,Club,Category,Runner_Number,Place_Gender,Place_Category,Place_Overall,Start_Time_of_day,...,40K_Time_of_day,40K_Time,40K_Diff,40K_min_km,40K_kmh,FK_Time_of_day,FK_Time,FK_Diff,FK_min_km,FK_kmh
0,2010,M,"Kebede, Tsegaye (ETH)",,18-39,5,1,1,1,09:45:00,...,,01:58:41,,,,,,,,
1,2010,M,"Mutai, Emmanuel (KEN)",,18-39,7,2,2,2,09:45:00,...,,01:59:39,,,,,,,,
2,2010,M,"Gharib, Jaouad (MAR)",,18-39,6,3,3,3,09:45:00,...,,02:00:15,,,,,,,,
3,2010,M,"Bouramdane, Abderrahime (MAR)",,18-39,9,4,4,4,09:45:00,...,,02:00:36,,,,,,,,
4,2010,M,"Kirui, Abel (KEN)",,18-39,3,5,5,5,09:45:00,...,,01:59:46,,,,,,,,


In [8]:
# Drop the columns that are not part of the harmonised schema
df_london_data = df_london_data_csv.drop(
    columns=[
        'Club', 'Category', 'Runner_Number', 'Place_Category', 'Place_Overall', 'Race_Status', 'Last_Split',
        '5K_Time_of_day', '5K_Diff', '5K_min_km', '5K_kmh',
        '10K_Time_of_day', '10K_Diff', '10K_min_km', '10K_kmh',
        '15K_Time_of_day', '15K_Diff', '15K_min_km', '15K_kmh',
        '20K_Time_of_day', '20K_Diff', '20K_min_km', '20K_kmh',
        'HK_Time_of_day', 'HK_Diff', 'HK_min_km', 'HK_kmh',
        '25K_Time_of_day', '25K_Diff', '25K_min_km', '25K_kmh',
        '30K_Time_of_day', '30K_Diff', '30K_min_km', '30K_kmh',
        '35K_Time_of_day', '35K_Diff', '35K_min_km', '35K_kmh',
        '40K_Time_of_day', '40K_Diff', '40K_min_km', '40K_kmh',
        'FK_Time_of_day', 'FK_Time', 'FK_Diff', 'FK_min_km', 'FK_kmh',
    ])

# Add the columns the source does not provide
df_london_data['Datum'] = None
df_london_data['Ort'] = 'London'

# Map the source column names to the harmonised names ('Name' is split below)
df_london_data = df_london_data.rename(
    columns={
        'Ges': 'Geschlecht', 'Place_Gender': 'Platz', 'Finish_Time': 'T_KM_FN', 'Start_Time_of_day': 'Startzeit',
        '5K_Time': 'T_KM_5', '10K_Time': 'T_KM_10', '15K_Time': 'T_KM_15', '20K_Time': 'T_KM_20',
        'HK_Time': 'T_KM_HM', '25K_Time': 'T_KM_25', '30K_Time': 'T_KM_30', '35K_Time': 'T_KM_35',
        '40K_Time': 'T_KM_40',
    })

# Split 'Name' (e.g. "Kebede, Tsegaye (ETH)") into surname and given name(s).
# The text before the first comma is the surname. The part after it is split on
# spaces, and its first token (the empty string left by the space after the
# comma) and last token (the nationality in parentheses) are discarded.
# The columns are assigned as a whole: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to modify the frame.
london_name_parts = [full_name.split(',') for full_name in df_london_data['Name']]
df_london_data['Nachname'] = [parts[0] for parts in london_name_parts]
df_london_data['Vorname'] = [' '.join(parts[1].split(' ')[1:-1]) for parts in london_name_parts]

# Bring the columns into the order shared by all races (this also drops 'Name')
df_london_data = df_london_data[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]

df_london_data.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2010,London,M,Tsegaye,Kebede,1,,09:45:00,02:05:19,00:14:40,00:29:42,00:44:51,00:59:54,01:03:07,01:14:19,01:28:46,01:43:30,01:58:41
1,2010,London,M,Emmanuel,Mutai,2,,09:45:00,02:06:23,00:14:40,00:29:42,00:44:51,00:59:54,01:03:06,01:14:20,01:28:52,01:44:05,01:59:39
2,2010,London,M,Jaouad,Gharib,3,,09:45:00,02:06:55,00:14:40,00:29:42,00:44:52,00:59:54,01:03:07,01:14:21,01:29:01,01:44:25,02:00:15
3,2010,London,M,Abderrahime,Bouramdane,4,,09:45:00,02:07:33,00:14:40,00:29:42,00:44:52,00:59:54,01:03:07,01:14:20,01:29:00,01:44:26,02:00:36
4,2010,London,M,Abel,Kirui,5,,09:45:00,02:08:04,00:14:40,00:29:42,00:44:51,00:59:53,01:03:06,01:14:19,01:28:46,01:43:37,01:59:46


<hr>
<h1>4. New York-Marathon:</h1>
<hr>

In [9]:
# Column labels applied to every yearly New York file; they replace each file's
# own header row (header=0).
newyork_csv_columns = [
    'Jahr', 'Ges', 'Start_Time_of_day', 'Name', 'Runner_Number', 'FK_Time', 'Pace_per_Mile',
    'Place_Overall', 'Place_Gender', 'Age_Group', 'Place_Age_Group', 'Place_Age_Graded',
    'Time_Age_Graded', 'Gun_Time',
    '5K_Time', '10K_Time', '15K_Time', '20K_Time', 'HK_Time',
    '25K_Time', '30K_Time', '35K_Time', '40K_Time',
]

# Years with a result file (no marathon took place in 2012)
newyork_years = [2007, 2008, 2009, 2010, 2011, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

# Read one frame per year and combine them with a single pd.concat call.
# DataFrame.append, which was used here before, was removed in pandas 2.0.
# Column dtypes are the ones read_csv infers from the files.
newyork_yearly_frames = [
    pd.read_csv(
        main_data_path + 'NYC/daten_wmm_nyc_' + str(year) + '.csv',
        header=0,
        sep=';',
        decimal='.',
        names=newyork_csv_columns,
    )
    for year in newyork_years
]
df_newyork_data_csv = pd.concat(newyork_yearly_frames, ignore_index=True)

df_newyork_data_csv.head()

Unnamed: 0,Jahr,Ges,Start_Time_of_day,Name,Runner_Number,FK_Time,Pace_per_Mile,Place_Overall,Place_Gender,Age_Group,...,Gun_Time,5K_Time,10K_Time,15K_Time,20K_Time,HK_Time,25K_Time,30K_Time,35K_Time,40K_Time
0,2007,M,9:00AM,Martin Lel,3,2:09:04,04:56,1,1,,...,2:09:04,0:15:48,0:31:26,0:46:13,1:02:16,1:05:45,1:17:46,1:32:20,1:47:34,2:02:44
1,2007,M,9:00AM,Abderrahim Goumri,4,2:09:16,04:56,2,2,,...,2:09:16,0:15:48,0:31:27,0:46:14,1:02:17,1:05:46,1:17:46,1:32:21,1:47:34,2:02:44
2,2007,M,9:00AM,Hendrick Ramaala,9,2:11:25,05:01,3,3,,...,2:11:25,0:15:48,0:31:23,0:46:14,1:02:19,1:05:46,1:17:47,1:32:21,1:47:35,2:03:50
3,2007,M,9:00AM,Stefano Baldini,6,2:11:58,05:02,4,4,,...,2:11:58,0:15:48,0:31:30,0:46:46,1:02:19,1:05:47,1:17:47,1:33:00,1:49:01,2:05:05
4,2007,M,9:00AM,James Kwambai,11,2:12:25,05:03,5,5,,...,2:12:25,0:15:49,0:31:26,0:46:14,1:02:17,1:05:46,1:17:47,1:32:20,1:47:34,2:04:22


In [10]:
# Drop the columns that are not part of the harmonised schema
df_newyork_data = df_newyork_data_csv.drop(
    columns=[
        'Runner_Number', 'Pace_per_Mile', 'Place_Overall', 'Age_Group', 'Place_Age_Group',
        'Place_Age_Graded', 'Time_Age_Graded', 'Gun_Time',
    ])

# Add the columns the source does not provide
df_newyork_data['Datum'] = None
df_newyork_data['Ort'] = 'NewYork'

# Map the source column names to the harmonised names ('Name' is split below)
df_newyork_data = df_newyork_data.rename(
    columns={
        'Ges': 'Geschlecht', 'Place_Gender': 'Platz', 'FK_Time': 'T_KM_FN', 'Start_Time_of_day': 'Startzeit',
        '5K_Time': 'T_KM_5', '10K_Time': 'T_KM_10', '15K_Time': 'T_KM_15', '20K_Time': 'T_KM_20',
        'HK_Time': 'T_KM_HM', '25K_Time': 'T_KM_25', '30K_Time': 'T_KM_30', '35K_Time': 'T_KM_35',
        '40K_Time': 'T_KM_40',
    })

# Correct the start time of the 2018 race: 2:00PM -> 12:00PM (the error was
# introduced when the data was scraped).
# .loc assignment is used because an in-place mask() on a column selection does
# not reliably write through to the frame.
df_newyork_data.loc[df_newyork_data['Jahr'] == 2018, 'Startzeit'] = '12:00PM'

# Split 'Name' (e.g. "Martin Lel") at the first space: the text before it is the
# given name, everything after it is the surname.
# The columns are assigned as a whole: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to modify the frame.
newyork_name_parts = [full_name.partition(' ') for full_name in df_newyork_data['Name']]
df_newyork_data['Vorname'] = [given for given, _, _ in newyork_name_parts]
df_newyork_data['Nachname'] = [surname for _, _, surname in newyork_name_parts]

# Bring the columns into the order shared by all races (this also drops 'Name')
df_newyork_data = df_newyork_data[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]

df_newyork_data.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2007,NewYork,M,Martin,Lel,1,,9:00AM,2:09:04,0:15:48,0:31:26,0:46:13,1:02:16,1:05:45,1:17:46,1:32:20,1:47:34,2:02:44
1,2007,NewYork,M,Abderrahim,Goumri,2,,9:00AM,2:09:16,0:15:48,0:31:27,0:46:14,1:02:17,1:05:46,1:17:46,1:32:21,1:47:34,2:02:44
2,2007,NewYork,M,Hendrick,Ramaala,3,,9:00AM,2:11:25,0:15:48,0:31:23,0:46:14,1:02:19,1:05:46,1:17:47,1:32:21,1:47:35,2:03:50
3,2007,NewYork,M,Stefano,Baldini,4,,9:00AM,2:11:58,0:15:48,0:31:30,0:46:46,1:02:19,1:05:47,1:17:47,1:33:00,1:49:01,2:05:05
4,2007,NewYork,M,James,Kwambai,5,,9:00AM,2:12:25,0:15:49,0:31:26,0:46:14,1:02:17,1:05:46,1:17:47,1:32:20,1:47:34,2:04:22


<hr>
<h1>5. Tokyo-Marathon:</h1>
<hr>

In [11]:
# The Tokyo result files come in two layouts, so they are read into two frames.
# In both cases the explicit labels replace each file's own header row (header=0).

# Layout of the 2007 to 2014 files.
# NOTE: 'Time_Chip ' carries a trailing space; the cell that drops this column
# uses the same spelling, so it must not be "fixed" here alone.
tokyo_csv_columns_1 = [
    'Jahr', 'Ges', 'Name', 'Number', 'Time', 'Time_Chip ', 'Place',
    '5K_Time', '10K_Time', '15K_Time', '20K_Time', 'HK_Time',
    '25K_Time', '30K_Time', '35K_Time', '40K_Time', 'FK_Time',
]

# Layout of the 2015 to 2019 files
tokyo_csv_columns_2 = [
    'Jahr', 'Ges', 'Place_Overall', 'Number_Card', 'Name', 'Age_Group', 'Place_Age_Group',
    'Place_Gender', 'Time_net', 'Time_gross',
    '5K_Time', '10K_Time', '15K_Time', '20K_Time', 'HK_Time',
    '25K_Time', '30K_Time', '35K_Time', '40K_Time', 'FK_Time',
]

# Read one frame per year and combine each group with a single pd.concat call.
# DataFrame.append, which was used here before, was removed in pandas 2.0.
# Column dtypes are the ones read_csv infers from the files.
tokyo_yearly_frames_1 = [
    pd.read_csv(
        main_data_path + 'Tokyo/daten_wmm_tokyo_' + str(year) + '.csv',
        header=0,
        sep=';',
        decimal='.',
        names=tokyo_csv_columns_1,
    )
    for year in range(2007, 2015)
]
df_tokyo_data_csv_1 = pd.concat(tokyo_yearly_frames_1, ignore_index=True)

tokyo_yearly_frames_2 = [
    pd.read_csv(
        main_data_path + 'Tokyo/daten_wmm_tokyo_' + str(year) + '.csv',
        header=0,
        sep=';',
        decimal='.',
        names=tokyo_csv_columns_2,
    )
    for year in range(2015, 2020)
]
df_tokyo_data_csv_2 = pd.concat(tokyo_yearly_frames_2, ignore_index=True)


In [12]:
# Preview the first five rows of the raw 2007 to 2014 Tokyo data
df_tokyo_data_csv_1.head(n=5)

Unnamed: 0,Jahr,Ges,Name,Number,Time,Time_Chip,Place,5K_Time,10K_Time,15K_Time,20K_Time,HK_Time,25K_Time,30K_Time,35K_Time,40K_Time,FK_Time
0,2007,M,Daniel Njenga,2,2:09:45,2:09:45,1,0:15:05,0:30:18,0:45:49,1:01:21,1:04:50,1:17:13,1:32:04,1:47:11,2:02:43,2:09:45
1,2007,M,Tomoyuki Sato,34,2:11:22,2:11:22,2,0:15:06,0:30:19,0:45:51,1:01:22,1:04:51,1:17:15,1:32:39,1:48:32,2:04:22,2:11:22
2,2007,M,Satoshi Irifune,32,2:12:44,2:12:43,3,0:15:05,0:30:18,0:45:50,1:01:21,1:04:50,1:17:15,1:32:39,1:48:32,2:05:05,2:12:44
3,2007,M,Masashi Hayashi,101,2:15:28,2:15:28,4,0:15:06,0:30:20,0:45:51,1:01:22,1:04:50,1:17:15,1:32:50,1:49:16,2:06:55,2:15:28
4,2007,M,Kazuyoshi Tokumoto,178,2:15:55,2:15:54,5,0:15:05,0:30:18,0:45:51,1:01:21,1:04:50,1:17:14,1:32:39,1:48:45,2:07:04,2:15:55


In [13]:
# Harmonise the 2007 to 2014 Tokyo frame in one chain:
#   1. drop the columns that are not part of the shared schema,
#   2. add the placeholder columns (Datum, Startzeit) and the constant location,
#   3. map the source column names to the shared names.
# 'Name' still holds the full name after this step and 'Number' merely occupies
# the 'Nachname' slot.
tokyo_1_harmonised = (
    df_tokyo_data_csv_1
    .drop(columns=['Time', 'Time_Chip '])
    .assign(Datum=None, Startzeit=None, Ort='Tokyo')
    .rename(columns={
        'Ges': 'Geschlecht',
        'Place': 'Platz',
        'Name': 'Vorname',
        'Number': 'Nachname',
        'FK_Time': 'T_KM_FN',
        '5K_Time': 'T_KM_5',
        '10K_Time': 'T_KM_10',
        '15K_Time': 'T_KM_15',
        '20K_Time': 'T_KM_20',
        'HK_Time': 'T_KM_HM',
        '25K_Time': 'T_KM_25',
        '30K_Time': 'T_KM_30',
        '35K_Time': 'T_KM_35',
        '40K_Time': 'T_KM_40',
    })
)

# Bring the columns into the order shared by all races
df_tokyo_data_1 = tokyo_1_harmonised[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]

df_tokyo_data_1.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2007,Tokyo,M,Daniel Njenga,2,1,,,2:09:45,0:15:05,0:30:18,0:45:49,1:01:21,1:04:50,1:17:13,1:32:04,1:47:11,2:02:43
1,2007,Tokyo,M,Tomoyuki Sato,34,2,,,2:11:22,0:15:06,0:30:19,0:45:51,1:01:22,1:04:51,1:17:15,1:32:39,1:48:32,2:04:22
2,2007,Tokyo,M,Satoshi Irifune,32,3,,,2:12:44,0:15:05,0:30:18,0:45:50,1:01:21,1:04:50,1:17:15,1:32:39,1:48:32,2:05:05
3,2007,Tokyo,M,Masashi Hayashi,101,4,,,2:15:28,0:15:06,0:30:20,0:45:51,1:01:22,1:04:50,1:17:15,1:32:50,1:49:16,2:06:55
4,2007,Tokyo,M,Kazuyoshi Tokumoto,178,5,,,2:15:55,0:15:05,0:30:18,0:45:51,1:01:21,1:04:50,1:17:14,1:32:39,1:48:45,2:07:04


In [14]:
# Preview the first five rows of the raw 2015 to 2019 Tokyo data
df_tokyo_data_csv_2.head(n=5)

Unnamed: 0,Jahr,Ges,Place_Overall,Number_Card,Name,Age_Group,Place_Age_Group,Place_Gender,Time_net,Time_gross,5K_Time,10K_Time,15K_Time,20K_Time,HK_Time,25K_Time,30K_Time,35K_Time,40K_Time,FK_Time
0,2015,M,1,5,ENDESHAW NEGESSE,Age Place(25-29),1,1,2:05:59,2:06:00,0:14:56,0:29:54,0:44:50,0:59:52,1:03:08,1:14:45,1:29:50,1:44:45,1:59:21,2:06:00
1,2015,M,2,3,STEPHEN KIPROTICH,Age Place(25-29),2,2,2:06:32,2:06:33,0:14:56,0:29:54,0:44:51,0:59:52,1:03:09,1:14:45,1:29:50,1:44:45,1:59:55,2:06:33
2,2015,M,3,4,DICKSON CHUMBA,Age Place(25-29),3,3,2:06:33,2:06:34,0:14:57,0:29:54,0:44:50,0:59:51,1:03:09,1:14:45,1:29:51,1:44:45,1:59:28,2:06:34
3,2015,M,4,8,SHUMI DECHASA,Age Place(25-29),4,4,2:07:19,2:07:20,0:14:57,0:29:54,0:44:49,0:59:51,1:03:09,1:14:45,1:29:50,1:44:44,2:00:21,2:07:20
4,2015,M,5,7,PETER SOME,Age Place(20-24),1,5,2:07:22,2:07:22,0:14:57,0:29:54,0:44:50,0:59:52,1:03:09,1:14:45,1:29:51,1:44:45,2:00:04,2:07:22


In [15]:
# Harmonise the 2015 to 2019 Tokyo frame in one chain:
#   1. drop the columns that are not part of the shared schema,
#   2. add the placeholder columns (Datum, Startzeit) and the constant location,
#   3. map the source column names to the shared names.
# 'Name' still holds the full name after this step and 'Number_Card' merely
# occupies the 'Nachname' slot.
tokyo_2_harmonised = (
    df_tokyo_data_csv_2
    .drop(columns=['Place_Overall', 'Age_Group', 'Place_Age_Group', 'Time_net', 'Time_gross'])
    .assign(Datum=None, Startzeit=None, Ort='Tokyo')
    .rename(columns={
        'Ges': 'Geschlecht',
        'Place_Gender': 'Platz',
        'Name': 'Vorname',
        'Number_Card': 'Nachname',
        'FK_Time': 'T_KM_FN',
        '5K_Time': 'T_KM_5',
        '10K_Time': 'T_KM_10',
        '15K_Time': 'T_KM_15',
        '20K_Time': 'T_KM_20',
        'HK_Time': 'T_KM_HM',
        '25K_Time': 'T_KM_25',
        '30K_Time': 'T_KM_30',
        '35K_Time': 'T_KM_35',
        '40K_Time': 'T_KM_40',
    })
)

# Bring the columns into the order shared by all races
df_tokyo_data_2 = tokyo_2_harmonised[[
    'Jahr', 'Ort', 'Geschlecht', 'Vorname', 'Nachname', 'Platz', 'Datum', 'Startzeit', 'T_KM_FN',
    'T_KM_5', 'T_KM_10', 'T_KM_15', 'T_KM_20', 'T_KM_HM', 'T_KM_25', 'T_KM_30', 'T_KM_35', 'T_KM_40',
]]

df_tokyo_data_2.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2015,Tokyo,M,ENDESHAW NEGESSE,5,1,,,2:06:00,0:14:56,0:29:54,0:44:50,0:59:52,1:03:08,1:14:45,1:29:50,1:44:45,1:59:21
1,2015,Tokyo,M,STEPHEN KIPROTICH,3,2,,,2:06:33,0:14:56,0:29:54,0:44:51,0:59:52,1:03:09,1:14:45,1:29:50,1:44:45,1:59:55
2,2015,Tokyo,M,DICKSON CHUMBA,4,3,,,2:06:34,0:14:57,0:29:54,0:44:50,0:59:51,1:03:09,1:14:45,1:29:51,1:44:45,1:59:28
3,2015,Tokyo,M,SHUMI DECHASA,8,4,,,2:07:20,0:14:57,0:29:54,0:44:49,0:59:51,1:03:09,1:14:45,1:29:50,1:44:44,2:00:21
4,2015,Tokyo,M,PETER SOME,7,5,,,2:07:22,0:14:57,0:29:54,0:44:50,0:59:52,1:03:09,1:14:45,1:29:51,1:44:45,2:00:04


In [16]:
# Merge the 2007 to 2014 records with the 2015 to 2019 records
df_tokyo_data = pd.concat([df_tokyo_data_1, df_tokyo_data_2], ignore_index=True)

# At this point 'Vorname' still holds the full name (e.g. "Daniel Njenga").
# Split it on spaces: the first token is the given name, the second token (plus
# the third, if there is one) forms the surname. Tokens beyond the third are
# not carried over.
tokyo_name_parts = [full_name.split(' ') for full_name in df_tokyo_data['Vorname']]

# A name without a space cannot be split into given name and surname. Fail with
# a message that lists the offending entries instead of a bare IndexError.
tokyo_unsplittable = [parts[0] for parts in tokyo_name_parts if len(parts) < 2]
if tokyo_unsplittable:
    raise ValueError(
        'Tokyo athlete names without a space cannot be split into given name and surname: '
        + repr(tokyo_unsplittable[:10])
    )

# The columns are assigned as a whole: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to modify the frame.
df_tokyo_data['Nachname'] = [' '.join(parts[1:3]) for parts in tokyo_name_parts]
df_tokyo_data['Vorname'] = [parts[0] for parts in tokyo_name_parts]

df_tokyo_data.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2007,Tokyo,M,Daniel,Njenga,1,,,2:09:45,0:15:05,0:30:18,0:45:49,1:01:21,1:04:50,1:17:13,1:32:04,1:47:11,2:02:43
1,2007,Tokyo,M,Tomoyuki,Sato,2,,,2:11:22,0:15:06,0:30:19,0:45:51,1:01:22,1:04:51,1:17:15,1:32:39,1:48:32,2:04:22
2,2007,Tokyo,M,Satoshi,Irifune,3,,,2:12:44,0:15:05,0:30:18,0:45:50,1:01:21,1:04:50,1:17:15,1:32:39,1:48:32,2:05:05
3,2007,Tokyo,M,Masashi,Hayashi,4,,,2:15:28,0:15:06,0:30:20,0:45:51,1:01:22,1:04:50,1:17:15,1:32:50,1:49:16,2:06:55
4,2007,Tokyo,M,Kazuyoshi,Tokumoto,5,,,2:15:55,0:15:05,0:30:18,0:45:51,1:01:21,1:04:50,1:17:14,1:32:39,1:48:45,2:07:04


<hr>
<h1>Alle eingelesenen und harmonisierten Datensätze zusammenführen</h1>
<hr>

In [17]:
# Stack the harmonised frames of all five races into one frame with a fresh index
df_wmm_data_all = pd.concat(
    objs=[
        df_berlin_data,
        df_chicago_data,
        df_london_data,
        df_newyork_data,
        df_tokyo_data,
    ],
    ignore_index=True,
)
df_wmm_data_all.head()

Unnamed: 0,Jahr,Ort,Geschlecht,Vorname,Nachname,Platz,Datum,Startzeit,T_KM_FN,T_KM_5,T_KM_10,T_KM_15,T_KM_20,T_KM_HM,T_KM_25,T_KM_30,T_KM_35,T_KM_40
0,2019,Berlin,M,Kenenisa,Bekele,1,,,02:01:41,00:14:24,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:55,01:41:15,01:55:30
1,2019,Berlin,M,Birhanu,Legese,2,,,02:02:48,00:14:25,00:28:53,00:43:29,00:57:58,01:01:05,01:12:30,01:26:53,01:41:02,01:56:00
2,2019,Berlin,M,Sisay,Lemma,3,,,02:03:36,00:14:25,00:28:52,00:43:29,00:57:58,01:01:06,01:12:31,01:26:53,01:41:16,01:56:54
3,2019,Berlin,M,Jonathan,Korir,4,,,02:06:45,00:14:25,00:28:53,00:43:30,00:57:58,01:01:06,01:12:31,01:27:42,01:43:39,01:59:49
4,2019,Berlin,M,Felix,Kandie,5,,,02:08:07,00:14:43,00:29:30,00:44:19,00:59:06,01:02:20,01:14:04,01:28:58,01:44:10,02:00:27


In [18]:
# Write the merged data set to one semicolon-separated CSV file (without the index)
file_name = main_data_path + "wmm_data/daten_wmm_all_harm.csv"
df_wmm_data_all.to_csv(
    path_or_buf=file_name,
    index=False,
    sep=';',
)