### Prepping Data Challenge (Week 29)

### Input
The data comes from the Olympics website. (Note: it was taken on Wednesday 14th July, so the schedule for some events may have changed since.)

### Requirements
 - Input the Data 
 - Create a correctly formatted DateTime field 
 - Parse the event list so each event is on a separate row 
 - Group similar sports into a Sport Type field 
 - Combine the Venue table 
 - Calculate whether the event is a 'Victory Ceremony' or 'Gold Medal' event. (Note, this might not pick up all of the medal events.)
 - Output the Data

In [1]:
import pandas as pd
import re

In [2]:
# Maps a clean sport-group name to the regex that identifies the raw
# (title-cased) sport names belonging to that group.
sports_act = {
    # Pattern previously read 'Gymanstic', which can never match the real name.
    'Artistic Gymnastic': '^Artistic Gymnastic.*',
    'Baseball/Softball': '^(Baseball/Softball).*',
    'Beach Volleyball': '^Beach Volley.*',
    'Boxing': '^Boxing.*',
    'Rugby': 'Rugby.*',
    'Skateboarding': 'Skateboarding.*',
    'Wrestling': 'Wrestling.*',
}

In [3]:
# Input the data: read both worksheets from the challenge workbook,
# keeping the file open only for the duration of the two reads.
with pd.ExcelFile('WK29-Olympic Events.xlsx') as workbook:
    events = pd.read_excel(workbook, sheet_name='Olympics Events')
    venues = pd.read_excel(workbook, sheet_name='Venues')

  warn(msg)


In [4]:
# Preview the first rows of the raw events sheet (Date, Time, Sport, Venue, Events).
events.head()

Unnamed: 0,Date,Time,Sport,Venue,Events
0,21st_July_2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"Australia vs Japan, Italy vs United States, Me..."
1,21st_July_2021,8:30,Football,Sapporo Dome,"Women's Group E: Great Britain vs Chile, Women..."
2,21st_July_2021,9:00,Football,Miyagi Stadium,"Women's Group F: China vs Brazil, Women's Grou..."
3,21st_July_2021,9:30,Football,Tokyo Stadium,"Women's Group G: Sweden vs United States, Wome..."
4,22nd_July_2021,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"United States vs Canada, Mexico vs Japan, Ital..."


In [5]:
# Preview the first rows of the raw venues sheet (Venue, Sport, Location).
venues.head()

Unnamed: 0,Venue,Sport,Location
0,Olympic Stadium,Opening Ceremony,"35.67786383266573, 139.71366292613558"
1,Olympic Stadium,Closing Ceremony,"35.67786383266573, 139.71366292613558"
2,Olympic Stadium,Athletics,"35.67786383266573, 139.71366292613558"
3,Olympic Stadium,Football,"35.67786383266573, 139.71366292613558"
4,Tokyo Metropolitan Gymnasium,Table Tennis,"35.679538129089025, 139.71224149090568"


In [6]:
# Create a correctly formatted DateTime field from the text Date and Time columns.
#
# Dates look like '21st_July_2021': drop the lowercase ordinal suffix that follows
# the day number so the value parses with '%d_%B_%Y'. regex=True is passed
# explicitly because Series.str.replace treats the pattern as a literal string by
# default in pandas 2.x, which would leave the suffix in place and make
# to_datetime fail. The raw string avoids the invalid '\d' escape warning.
date_text = events['Date'].str.replace(r'(?<=\d)[a-z]+', '', regex=True)

# 'xx' placeholders in Time are substituted with '0:00' (plain literal replacement).
time_text = events['Time'].str.replace('xx', '0:00', regex=False)

events['UK Date Time'] = pd.to_datetime(date_text + ' ' + time_text,
                                        format='%d_%B_%Y %H:%M')

# NOTE: this overwrites the text Date column with date objects, so the cell
# cannot be re-run without reloading the data first.
events['Date'] = events['UK Date Time'].dt.date

  


In [7]:
# Clean the sport name: group similar sports into a 'Sport Group' field.
#
# Start from the title-cased sport name and apply every pattern in turn to the
# running result. The previous loop recomputed from events['Sport'] on each
# iteration and overwrote the column, so only the last pattern took effect.
sport_group = events['Sport'].str.title()
for correct, pattern in sports_act.items():
    sport_group = sport_group.replace(to_replace=pattern, value=correct, regex=True)
events['Sport Group'] = sport_group

In [8]:
# Check the new 'UK Date Time' and 'Sport Group' columns.
events.head()

Unnamed: 0,Date,Time,Sport,Venue,Events,UK Date Time,Sport Group
0,2021-07-21,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"Australia vs Japan, Italy vs United States, Me...",2021-07-21 01:00:00,Baseball/Softball
1,2021-07-21,8:30,Football,Sapporo Dome,"Women's Group E: Great Britain vs Chile, Women...",2021-07-21 08:30:00,Football
2,2021-07-21,9:00,Football,Miyagi Stadium,"Women's Group F: China vs Brazil, Women's Grou...",2021-07-21 09:00:00,Football
3,2021-07-21,9:30,Football,Tokyo Stadium,"Women's Group G: Sweden vs United States, Wome...",2021-07-21 09:30:00,Football
4,2021-07-22,1:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,"United States vs Canada, Mexico vs Japan, Ital...",2021-07-22 01:00:00,Baseball/Softball


In [9]:
# Parse the event list: split the comma-separated 'Events' text into one
# whitespace-trimmed column per event, placed alongside the identifying columns.
# Rows with fewer events than the widest row are left empty in the extra columns.
trimmed_events = [
    [name.strip() for name in names]
    for names in events['Events'].str.split(',')
]
events = pd.concat(
    [events[['Date', 'UK Date Time', 'Sport Group', 'Venue']],
     pd.DataFrame(trimmed_events)],
    axis=1,
    sort=False,
)

In [10]:
# Unpivot the per-event columns so each event sits on its own row.
#
# Columns whose label starts with a digit hold the split-out events; every
# other column is kept as an identifier. The raw string avoids the invalid
# '\d' escape, which is a SyntaxWarning on recent Python versions.
id_columns = [col for col in events.columns if not re.match(r'\d', str(col))]
events = (
    events.melt(id_vars=id_columns, value_name='Event', var_name='ToDrop')
    .drop('ToDrop', axis=1)       # the original column position is not needed
    .dropna(subset=['Event'])     # discard the empty padding slots
    .reset_index(drop=True)
)

In [11]:
# Check the reshaped frame: one row per event.
events.head()

Unnamed: 0,Date,UK Date Time,Sport Group,Venue,Event
0,2021-07-21,2021-07-21 01:00:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,Australia vs Japan
1,2021-07-21,2021-07-21 08:30:00,Football,Sapporo Dome,Women's Group E: Great Britain vs Chile
2,2021-07-21,2021-07-21 09:00:00,Football,Miyagi Stadium,Women's Group F: China vs Brazil
3,2021-07-21,2021-07-21 09:30:00,Football,Tokyo Stadium,Women's Group G: Sweden vs United States
4,2021-07-22,2021-07-22 01:00:00,Baseball/Softball,Fukushima Azuma Baseball Stadium,United States vs Canada


In [12]:
# Flag medal events: True when the event name mentions 'Victory Ceremony' or
# 'Gold Medal'. (Note, this might not pick up all of the medal events.)
event_names = events['Event']
mentions_ceremony = event_names.str.contains('Victory Ceremony')
mentions_gold = event_names.str.contains('Gold Medal')
events['Medal Ceremony'] = mentions_ceremony | mentions_gold

# Title-case the venue name, the join key used against the venues table.
events['Venue'] = events['Venue'].str.title()

In [13]:
# Reduce the venues sheet to one row per venue with numeric coordinates.
#
# Venue names are title-cased BEFORE de-duplicating, so spellings that differ
# only by case collapse into one row instead of multiplying event rows in the
# later merge on 'Venue'.
venues = (
    venues.assign(Venue=venues['Venue'].str.title())[['Venue', 'Location']]
    .drop_duplicates()
)

# 'Location' is a "latitude, longitude" string; split on the first comma only.
coordinates = venues['Location'].str.split(',', n=1, expand=True)

# assign() builds a new frame rather than setting columns on a filtered one.
venues = (
    venues.assign(Latitude=coordinates[0].astype('float'),
                  Longitude=coordinates[1].astype('float'))
    .drop('Location', axis=1)
)

In [14]:
# Combine the Venue table: attach coordinates to each event by venue name.
# This is an inner join, so events whose venue has no match in `venues` are dropped.
output = events.merge(venues, how='inner', on='Venue')

In [16]:
# Keep only the required output fields, in their final order.
output_columns = [
    'Latitude',
    'Longitude',
    'Medal Ceremony',
    'Sport Group',
    'Event',
    'UK Date Time',
    'Date',
    'Venue',
]
output = output[output_columns]

In [17]:
# Final check of the output frame before it is written to disk.
output.head()

Unnamed: 0,Latitude,Longitude,Medal Ceremony,Sport Group,Event,UK Date Time,Date,Venue
0,37.722165,140.364011,False,Baseball/Softball,Australia vs Japan,2021-07-21 01:00:00,2021-07-21,Fukushima Azuma Baseball Stadium
1,37.722165,140.364011,False,Baseball/Softball,United States vs Canada,2021-07-22 01:00:00,2021-07-22,Fukushima Azuma Baseball Stadium
2,37.722165,140.364011,False,Baseball,Baseball Opening Round,2021-07-28 04:00:00,2021-07-28,Fukushima Azuma Baseball Stadium
3,37.722165,140.364011,False,Baseball/Softball,Italy vs United States,2021-07-21 01:00:00,2021-07-21,Fukushima Azuma Baseball Stadium
4,37.722165,140.364011,False,Baseball/Softball,Mexico vs Japan,2021-07-22 01:00:00,2021-07-22,Fukushima Azuma Baseball Stadium


In [17]:
# Output the data: write the schedule to a single worksheet, without the index.
with pd.ExcelWriter('wk29-Output.xlsx') as writer:
    output.to_excel(writer, sheet_name='Event Schedule', index=False)