In [287]:
# https://preppindata.blogspot.com/2021/07/2021-week-29-pd-x-wow-tokyo-2020.html

import os
from datetime import date

import numpy as np
import pandas as pd

### Input the Data

In [288]:
# Both sheets live in the same workbook, so read it once and pull the two
# sheets out of the returned {sheet name: DataFrame} dict.
# Forward slashes keep the relative path valid on every OS; the backslash form
# only resolves on Windows.
input_sheets = pd.read_excel(
    'data/PD 2021 Wk 29 Input.xlsx',
    sheet_name=['Olympics Events', 'Venues'],
)
df_events = input_sheets['Olympics Events']
df_venues = input_sheets['Venues']
df_events

Unnamed: 0,Date,Time,Sport,Venue,Events
0,21st_July_2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"Australia vs Japan, Italy vs United States, Me..."
1,21st_July_2021,8:30,Football,Sapporo Dome,"Women's Group E: Great Britain vs Chile, Women..."
2,21st_July_2021,9:00,Football,Miyagi Stadium,"Women's Group F: China vs Brazil, Women's Grou..."
3,21st_July_2021,9:30,Football,Tokyo Stadium,"Women's Group G: Sweden vs United States, Wome..."
4,22nd_July_2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"United States vs Canada, Mexico vs Japan, Ital..."
...,...,...,...,...,...
704,8th_August_2021,5:40,Water Polo,Tatsumi Water Polo Centre,Men's Bronze Medal Match
705,8th_August_2021,6:00,Boxing.,Kokugikan Arena,"Women's Light (57-60kg) Final, Men's Light (57..."
706,8th_August_2021,7:00,Handball,Yoyogi National Stadium,Women's Gold Medal Match
707,8th_August_2021,8:30,Water Polo,Tatsumi Water Polo Centre,Men's Gold Medal Match


### Create a correctly formatted DateTime field

In [289]:
# List the distinct raw 'Date' strings to see which formats need cleaning
# before they can be parsed.
df_events['Date'].unique()

array(['21st_July_2021', '22nd_July_2021', '22_July_2021',
       '23rd_July_2021', '24th_July_2021', '25th_July_2021',
       '26th_July_2021', '27th_July_2021', '28th_July_2021',
       '29th_July_2021', '30th_July_2021', '31st_July_2021',
       '1st_August_2021', '2nd_August_2021', '3rd_August_2021',
       '4th_August_2021', '5th_August_2021', '6th_August_2021',
       '7th_August_2021', '8th_August_2021'], dtype=object)

In [290]:
# Normalise the date text in one chained pass:
#   1. underscores become spaces
#   2. an ordinal suffix (st/nd/rd/th) is removed, but only when it directly
#      follows a digit, so the "st" inside "August" is left alone
df_events['Date'] = (
    df_events['Date']
    .str.replace('_', ' ')
    .str.replace(r'(?<=\d)(st|nd|rd|th)', '', regex=True)
)
df_events['Date'].unique()

array(['21 July 2021', '22 July 2021', '23 July 2021', '24 July 2021',
       '25 July 2021', '26 July 2021', '27 July 2021', '28 July 2021',
       '29 July 2021', '30 July 2021', '31 July 2021', '1 August 2021',
       '2 August 2021', '3 August 2021', '4 August 2021', '5 August 2021',
       '6 August 2021', '7 August 2021', '8 August 2021'], dtype=object)

In [291]:
# List the distinct raw 'Time' strings to spot values that are not in H:MM form.
df_events['Time'].unique()

array(['1:00', '8:30', '9:00', '9:30', '0:30', '5:00', '12:00', '1:30',
       '1:50', '2:00', '2:15', '3:00', '5:50', '6:00', '6:15', '6:20',
       '6:30', '7:00', '10:00', '10:20', '10:30', '11:00', '11:30',
       '11:40', '13:00', '23:00', '2:30', '3:50', '4:00', '5:40', '5:45',
       '7:10', '7:50', '9:20', '11:50', '12:20', '22:30', '5:30', '8:00',
       '11:45', '12:30', '0:00', '3:30', '11:15', '12:45', 'xx', '23:30',
       '0:45', '6:45', '7:30', '2:10', '23:45', '1:10', '10:15', '5:15',
       '13:30', '21:30', '2:20', '6:50', '7:20', '10:45', '13:15'],
      dtype=object)

In [292]:
# 'xx' is the only non-time value seen in the column; substitute midnight so
# the combined date/time string can be parsed.
raw_time = df_events['Time']
df_events['Time'] = raw_time.str.replace('xx', '0:00')

In [293]:
# Join the cleaned date and time text with a single space, then parse it with
# an explicit format (day, full month name, year, 24-hour time).
date_text = df_events['Date'].astype(str)
time_text = df_events['Time'].astype(str)
df_events['Datetime'] = pd.to_datetime(
    date_text + ' ' + time_text,
    format='%d %B %Y %H:%M',
)
df_events

Unnamed: 0,Date,Time,Sport,Venue,Events,Datetime
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"Australia vs Japan, Italy vs United States, Me...",2021-07-21 01:00:00
1,21 July 2021,8:30,Football,Sapporo Dome,"Women's Group E: Great Britain vs Chile, Women...",2021-07-21 08:30:00
2,21 July 2021,9:00,Football,Miyagi Stadium,"Women's Group F: China vs Brazil, Women's Grou...",2021-07-21 09:00:00
3,21 July 2021,9:30,Football,Tokyo Stadium,"Women's Group G: Sweden vs United States, Wome...",2021-07-21 09:30:00
4,22 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"United States vs Canada, Mexico vs Japan, Ital...",2021-07-22 01:00:00
...,...,...,...,...,...,...
704,8 August 2021,5:40,Water Polo,Tatsumi Water Polo Centre,Men's Bronze Medal Match,2021-08-08 05:40:00
705,8 August 2021,6:00,Boxing.,Kokugikan Arena,"Women's Light (57-60kg) Final, Men's Light (57...",2021-08-08 06:00:00
706,8 August 2021,7:00,Handball,Yoyogi National Stadium,Women's Gold Medal Match,2021-08-08 07:00:00
707,8 August 2021,8:30,Water Polo,Tatsumi Water Polo Centre,Men's Gold Medal Match,2021-08-08 08:30:00


### Parse the event list so each event is on a separate row

In [294]:
# Turn the comma-separated 'Events' text into a list per row, then explode so
# every event gets its own row (the original row index is repeated).
df_events = (
    df_events
    .assign(Events=df_events['Events'].str.split(', '))
    .explode('Events')
)
df_events

Unnamed: 0,Date,Time,Sport,Venue,Events,Datetime
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Australia vs Japan,2021-07-21 01:00:00
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Italy vs United States,2021-07-21 01:00:00
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Mexico vs Canada,2021-07-21 01:00:00
1,21 July 2021,8:30,Football,Sapporo Dome,Women's Group E: Great Britain vs Chile,2021-07-21 08:30:00
1,21 July 2021,8:30,Football,Sapporo Dome,Women's Group E: Japan vs Canada,2021-07-21 08:30:00
...,...,...,...,...,...,...
705,8 August 2021,6:00,Boxing.,Kokugikan Arena,Men's Light (57-63kg) Victory Ceremony,2021-08-08 06:00:00
705,8 August 2021,6:00,Boxing.,Kokugikan Arena,Men's Super Heavy (+91kg) Final,2021-08-08 06:00:00
706,8 August 2021,7:00,Handball,Yoyogi National Stadium,Women's Gold Medal Match,2021-08-08 07:00:00
707,8 August 2021,8:30,Water Polo,Tatsumi Water Polo Centre,Men's Gold Medal Match,2021-08-08 08:30:00


### Group similar sports into a Sport Type field

In [295]:
# Start 'Sport Group' as the lower-cased sport label, then list the distinct
# values in sorted order to see which spellings need to be merged.
sport_lower = df_events['Sport'].str.lower()
df_events['Sport Group'] = sport_lower
ls_sport = sport_lower.sort_values().unique().tolist()
ls_sport

['3x3 basketball',
 'archery',
 'artistic gymnastic',
 'artistic gymnastics',
 'artistic swimming',
 'athletics',
 'badminton',
 'baseball',
 'baseball/softball',
 'basketball',
 'beach volley',
 'beach volleybal',
 'beach volleyball',
 'boxing',
 'boxing.',
 'canoe slalom',
 'canoe sprint',
 'closing ceremony',
 'cycling bmx freestyle',
 'cycling bmx racing',
 'cycling mountain bike',
 'cycling road',
 'cycling track',
 'diving',
 'equestrian',
 'fencing',
 'football',
 'golf',
 'handball',
 'hockey',
 'judo',
 'karate',
 'marathon swimming',
 'modern pentathlon',
 'opening ceremony',
 'rhythmic gymnastics',
 'rowing',
 'rugby',
 'rugby.',
 'sailing',
 'shooting',
 'skateboarding',
 'skateboarding.',
 'softball',
 'softball/baseball',
 'sport climbing',
 'surfing',
 'swimming',
 'table tennis',
 'taekwondo',
 'tennis',
 'trampoline gymnastics',
 'triathlon',
 'volleyball',
 'water polo',
 'weightlifting',
 'wrestling',
 'wrestling.']

In [296]:
# Remove every literal '.' (e.g. 'boxing.' -> 'boxing'). regex=False makes the
# literal match explicit: interpreted as a regular expression, '.' would match
# every character, and the default of `regex` differs between pandas versions.
sport_group = df_events['Sport Group'].str.replace('.', '', regex=False)

# Ordered (regex pattern, replacement) rules, applied one after another.
# Patterns are lower-case and case-sensitive, so a value that has already been
# replaced by a capitalised group name is not matched again by a later rule.
sport_group_rules = [
    ('.*basketball.*', 'Basketball'),                   # anything containing "basketball"
    ('.*gymnastic.*', 'Gymnastics'),                    # anything containing "gymnastic"
    ('.*baseball.*', 'Baseball'),                       # anything containing "baseball"
    ('.*volley.*', 'Volleyball'),                       # anything containing "volley"
    ('.*swimming.*', 'Swimming'),                       # anything containing "swimming"
    ('.*tennis.*', 'Tennis'),                           # anything containing "tennis"
    ('^cycling.*', 'Cycling'),                          # anything starting with "cycling"
    ('.*ceremony$', 'Ceremony'),                        # anything ending with "ceremony"
    ('canoe.*', 'Canoeing'),                            # "canoe" and everything after it
    ('(taekwondo|judo|karate)', 'Martial Arts'),        # the three martial arts
]
# NOTE(review): 'baseball/softball' and 'softball/baseball' collapse to
# 'Baseball', but a plain 'softball' label stays its own group ('Softball').
# Confirm that this split is intended.
for pattern, group_name in sport_group_rules:
    sport_group = sport_group.str.replace(pattern, group_name, regex=True)

# Title-case whatever is left (labels no rule touched are still lower-case).
df_events['Sport Group'] = sport_group.str.title()

# Re-list the distinct groups to check the result of the transformation.
ls_sport = list(df_events['Sport Group'].sort_values().unique())
ls_sport

['Archery',
 'Athletics',
 'Badminton',
 'Baseball',
 'Basketball',
 'Boxing',
 'Canoeing',
 'Ceremony',
 'Cycling',
 'Diving',
 'Equestrian',
 'Fencing',
 'Football',
 'Golf',
 'Gymnastics',
 'Handball',
 'Hockey',
 'Martial Arts',
 'Modern Pentathlon',
 'Rowing',
 'Rugby',
 'Sailing',
 'Shooting',
 'Skateboarding',
 'Softball',
 'Sport Climbing',
 'Surfing',
 'Swimming',
 'Tennis',
 'Triathlon',
 'Volleyball',
 'Water Polo',
 'Weightlifting',
 'Wrestling']

In [297]:
# Display the events frame, which now includes the 'Sport Group' column.
df_events

Unnamed: 0,Date,Time,Sport,Venue,Events,Datetime,Sport Group
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Australia vs Japan,2021-07-21 01:00:00,Baseball
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Italy vs United States,2021-07-21 01:00:00,Baseball
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Mexico vs Canada,2021-07-21 01:00:00,Baseball
1,21 July 2021,8:30,Football,Sapporo Dome,Women's Group E: Great Britain vs Chile,2021-07-21 08:30:00,Football
1,21 July 2021,8:30,Football,Sapporo Dome,Women's Group E: Japan vs Canada,2021-07-21 08:30:00,Football
...,...,...,...,...,...,...,...
705,8 August 2021,6:00,Boxing.,Kokugikan Arena,Men's Light (57-63kg) Victory Ceremony,2021-08-08 06:00:00,Boxing
705,8 August 2021,6:00,Boxing.,Kokugikan Arena,Men's Super Heavy (+91kg) Final,2021-08-08 06:00:00,Boxing
706,8 August 2021,7:00,Handball,Yoyogi National Stadium,Women's Gold Medal Match,2021-08-08 07:00:00,Handball
707,8 August 2021,8:30,Water Polo,Tatsumi Water Polo Centre,Men's Gold Medal Match,2021-08-08 08:30:00,Water Polo


### Combine the Venue table

In [298]:
# Preview the first three rows of the venue lookup table.
df_venues.head(3)

Unnamed: 0,Venue,Sport,Location
0,Olympic Stadium,Opening Ceremony,"35.67786383266573, 139.71366292613558"
1,Olympic Stadium,Closing Ceremony,"35.67786383266573, 139.71366292613558"
2,Olympic Stadium,Athletics,"35.67786383266573, 139.71366292613558"


In [299]:
# Split the "lat, lon" text in 'Location' on ', ' into two new columns.
# The pieces are produced by a string split, so both columns hold strings.
lat_lon = df_venues['Location'].str.split(', ', expand=True)
df_venues[['Latitude', 'Longitude']] = lat_lon
df_venues.head()

Unnamed: 0,Venue,Sport,Location,Latitude,Longitude
0,Olympic Stadium,Opening Ceremony,"35.67786383266573, 139.71366292613558",35.67786383266573,139.71366292613558
1,Olympic Stadium,Closing Ceremony,"35.67786383266573, 139.71366292613558",35.67786383266573,139.71366292613558
2,Olympic Stadium,Athletics,"35.67786383266573, 139.71366292613558",35.67786383266573,139.71366292613558
3,Olympic Stadium,Football,"35.67786383266573, 139.71366292613558",35.67786383266573,139.71366292613558
4,Tokyo Metropolitan Gymnasium,Table Tennis,"35.679538129089025, 139.71224149090568",35.679538129089025,139.71224149090568


In [300]:
# Attach the venue columns (Location / Latitude / Longitude) to every event by
# matching on the venue name and the raw 'Sport' label.
merge_keys = ['Venue', 'Sport']

# The merge is an inner join, so an event whose (Venue, Sport) pair is missing
# from the venue table disappears silently. The key is the uncleaned 'Sport'
# text (not 'Sport Group'), so spelling variants may fail to match.
# Count such rows first so any data loss is visible.
event_keys = pd.MultiIndex.from_frame(df_events[merge_keys])
venue_keys = pd.MultiIndex.from_frame(df_venues[merge_keys])
n_unmatched = int((~event_keys.isin(venue_keys)).sum())
if n_unmatched:
    print(
        f"Warning: {n_unmatched} of {len(df_events)} event rows have no matching "
        "(Venue, Sport) pair in the venue table and are dropped by the merge"
    )

df_events_w_venue = df_events.merge(df_venues, on=merge_keys, how='inner')
df_events_w_venue.head(3)

Unnamed: 0,Date,Time,Sport,Venue,Events,Datetime,Sport Group,Location,Latitude,Longitude
0,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Australia vs Japan,2021-07-21 01:00:00,Baseball,"37.72216480340486, 140.3640114979229",37.72216480340486,140.3640114979229
1,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Italy vs United States,2021-07-21 01:00:00,Baseball,"37.72216480340486, 140.3640114979229",37.72216480340486,140.3640114979229
2,21 July 2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Mexico vs Canada,2021-07-21 01:00:00,Baseball,"37.72216480340486, 140.3640114979229",37.72216480340486,140.3640114979229


### Calculate whether the event is a 'Victory Ceremony' or 'Gold Medal' event. (Note, this might not pick up all of the medal events.)

In [301]:
# Flag events whose name mentions a victory ceremony or a gold medal
# (case-insensitive). na=False makes a missing event name count as "not a
# medal event" instead of leaving NaN in the flag column.
df_events_w_venue['Medal Ceremony?'] = df_events_w_venue['Events'].str.contains(
    'victory ceremony|gold medal', case=False, regex=True, na=False
)

# Keep only the output fields, in the required order.
df_events_w_venue = df_events_w_venue[
    ['Latitude', 'Longitude', 'Medal Ceremony?', 'Sport Group',
     'Events', 'Datetime', 'Sport', 'Venue']
]



### Output the Data

In [302]:
# Make sure the target folder exists so the write does not fail on a fresh checkout.
os.makedirs('output', exist_ok=True)

# index=False: the index is just the row counter left over from the merge and
# would otherwise be written as an extra unnamed first column.
df_events_w_venue.to_csv('output/2021-week29-output.csv', index=False)