## 📅 GWU Events Scraper Challenge 📅


In [None]:
from requests.models import Response
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
from IPython.display import display, Image
# Landing page of the GWU events calendar; the event cards are scraped from it.
EVENTS_PAGE_URL = "https://calendar.gwu.edu/calendar"

# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(EVENTS_PAGE_URL, timeout=30)

# Stop here on failure: continuing would leave `html_content` undefined and
# surface later as an unrelated NameError when building the soup.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to retrieve the webpage. HTTP Status Code: {response.status_code}"
    )

html_content = response.text

# Parsed document tree used by the scraping cell.
soup = BeautifulSoup(html_content, 'html.parser')


In [None]:
# Event cards are <div> elements whose class attribute starts with "em-card ".
eventsraw = soup.find_all('div', class_=lambda x: x and x.startswith('em-card '))

# One dict per card, holding the raw (uncleaned) text pulled from the markup.
raw_events_list = []

for event in eventsraw:
    # Image: both the container and the <img> inside it may be absent, and the
    # <img> may lack a src attribute, so every step is guarded.
    img_container = event.find('div', class_='em-card_image')
    img_tag = img_container.find('img', class_='img_card') if img_container else None
    img_src = img_tag.get('src') if img_tag is not None else None

    # Text section: title plus up to two "event-text" paragraphs, read
    # positionally as date info first and location second.
    text_container = event.find('div', class_='em-card_text')
    title_tag = text_container.find('h3', class_='em-card_title') if text_container else None
    title = title_tag.text if title_tag is not None else None

    event_texts = text_container.find_all('p', class_='em-card_event-text') if text_container else []
    date_info = event_texts[0].text if len(event_texts) > 0 else None
    location = event_texts[1].text if len(event_texts) > 1 else None

    # Tags: the first tag with non-empty text is treated as the primary type.
    tags_container = text_container.find('div', class_='em-list_tags') if text_container else None
    tags = tags_container.find_all('a', class_='em-card_tag') if tags_container else []

    # next() with a default avoids an IndexError when every tag is empty.
    primary_event_type = next((tag.text for tag in tags if tag.text), None)

    raw_events_list.append({
        "Title": title,
        "Date Info": date_info,
        "Location": location,
        "Image": img_src,
        "Primary Event Type": primary_event_type,
    })


In [None]:
# Normalise every string field: newlines become spaces and surrounding
# whitespace is trimmed. Non-string values (e.g. None) pass through unchanged.
events = [
    {
        field: text.replace('\n', ' ').strip() if isinstance(text, str) else text
        for field, text in raw_event.items()
    }
    for raw_event in raw_events_list
]

# Print a short human-readable summary of each event, with a thumbnail of the
# image when one was found.
for record in events:
    print(f"Title: {record['Title']}")
    print(f"Date: {record['Date Info']}")
    print(f"Location: {record['Location']}")
    image_url = record["Image"]
    if image_url:
        display(Image(url=image_url, height=90, width=90))
    print(f"Event Type: {record['Primary Event Type']}")
    print("************************************************************")


Title: The Art of Collecting: Gifts from the Luther W. Brady Estate
Date: Wed, Oct 11, 2023
Location: Luther W. Brady Art Gallery


Event Type: Exhibition
************************************************************
Title: next NEXT_
Date: Wed, Oct 11, 2023
Location: Flagg Building, Student Lounge & Gallery 7


Event Type: Arts & Culture
************************************************************
Title: Alumni in Finance & Real Estate: Industry Networking Breakfast (NYC)
Date: Wed, Oct 11, 2023 8:30am to 9:30am
Location: None


Event Type: Alumni
************************************************************
Title: GWSB MS in Information Systems Technology  Information Session & Webinar
Date: Wed, Oct 11, 2023 9am to 9:30am
Location: Virtual Event


Event Type: Admissions
************************************************************
Title: Anne Lindberg: what color is divine light?
Date: Wed, Oct 11, 2023 10am to 5pm
Location: The George Washington Museum and the Textile Museum


Event Type: Arts & Culture
************************************************************
Title: Classical Washington
Date: Wed, Oct 11, 2023 10am to 5pm
Location: The George Washington Museum and the Textile Museum


Event Type: Arts & Culture
************************************************************
Title: Handstitched Worlds: The Cartography of Quilts
Date: Wed, Oct 11, 2023 10am to 5pm
Location: The George Washington Museum and the Textile Museum


Event Type: Arts & Culture
************************************************************
Title: The New Naval and Military Map of the United States
Date: Wed, Oct 11, 2023 10am to 5pm
Location: The George Washington Museum and the Textile Museum


Event Type: Arts & Culture
************************************************************
Title: From Spark to Impact: Promoting and Enabling Research with the Libraries and Academic Innovation Team
Date: Wed, Oct 11, 2023 11am to 12pm
Location: Gelman Library, Room 608


Event Type: Research
************************************************************
Title: GWSB MS in Project Management Information Session & Webinar
Date: Wed, Oct 11, 2023 12pm to 12:30pm
Location: Virtual Event


Event Type: Admissions
************************************************************
Title: International Education (Master's) - Virtual Information Session
Date: Wed, Oct 11, 2023 12pm to 1pm
Location: Virtual Event


Event Type: Admissions
************************************************************
Title: Bloomberg Industry Group Careers Panel
Date: Wed, Oct 11, 2023 12:30pm to 2pm
Location: Law School, Tasher Great Room


Event Type: Career & Professional Development
************************************************************
Title: From Spark to Impact: Software Systems for Researchers with Proposals and Awards
Date: Wed, Oct 11, 2023 1pm to 2pm
Location: Rome Hall (Academic Center), Room 206


Event Type: Research
************************************************************
Title: GW Collection / Corcoran Faculty Selection
Date: Wed, Oct 11, 2023 1pm to 5pm
Location: Flagg Building, Gallery 6


Event Type: Exhibition
************************************************************
Title: Global Reflections: A Virtual Group Processing Space
Date: Wed, Oct 11, 2023 1pm to 2:30pm
Location: Online


Event Type: Student Life
************************************************************
Title: Too Good to Be True? Beware of Job Offer Scams!
Date: Wed, Oct 11, 2023 2pm
Location: Virtual Event


Event Type: Info Session
************************************************************
Title: Dr. Sharon Murphy Book Talk
Date: Wed, Oct 11, 2023 2:30pm to 4pm
Location: None


Event Type: None
************************************************************
Title: Macro-International Seminar: Tony Zhang, Federal Reserve
Date: Wed, Oct 11, 2023 2:30pm to 4pm
Location: Hall of Government, 321


Event Type: Academic
************************************************************
Title: Accessibility On Campus Discussion
Date: Wed, Oct 11, 2023 3pm to 4pm
Location: Online


Event Type: Student Life
************************************************************
Title: Collaborative Reform Conference -- Effective Law Enforcement for All
Date: Wed, Oct 11, 2023 3pm to 7:30am
Location: Elliott School of International Affairs, GW University, City View Room


Event Type: Lectures & Speakers
************************************************************
Title: Inequality and the Crisis of Liberal Democracy
Date: Wed, Oct 11, 2023 4pm to 5pm
Location: 1957 E Street NW, 412Q


Event Type: Lectures & Speakers
************************************************************


In [None]:
# Make sure the first 4-digit run (the year) in each 'Date Info' string is
# followed by a comma, e.g. "Wed, Oct 11, 2023 9am" -> "Wed, Oct 11, 2023, 9am".
# Date-only values end up with a trailing comma ("Wed, Oct 11, 2023,").
for event in events:
    # The key can be present with a None value (cards without date text), in
    # which case .get()'s default is not used; fall back to '' explicitly.
    date_info = event.get('Date Info') or ''

    # Index of the first window of four consecutive digits, if any.
    year_start = next(
        (i for i in range(len(date_info) - 3) if date_info[i:i + 4].isdigit()),
        None,
    )
    if year_start is None:
        continue

    year_end = year_start + 4
    # Only insert when the comma is missing, so re-running the cell is harmless.
    if date_info[year_end:year_end + 1] != ',':
        event['Date Info'] = date_info[:year_end] + ',' + date_info[year_end:]


In [None]:
import re

# "<Day>, <Month> <date>, <year>" optionally followed by ", " and a time part.
# Compiled once instead of on every loop iteration.
DATE_INFO_PATTERN = re.compile(
    r'(?P<day>\w+),\s(?P<month>\w+)\s(?P<date>\d+),\s(?P<year>\d+)(?:,\s)?(?P<time>.+)?'
)

# Three-letter month abbreviation -> zero-padded month number.
month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

# Split each event's 'Date Info' into 'Date' (YYYY-MM-DD), 'Day of Week',
# 'Start Time' and 'End Time'. All four keys are always written, set to None
# when a part is missing, so every event dict has the same shape.
for event in events:
    # The value may be None for cards without date text.
    date_info = event.get('Date Info') or ''
    match = DATE_INFO_PATTERN.match(date_info)

    formatted_date = day_of_week = start_time = end_time = None

    if match:
        day_of_week = match.group('day')

        numeric_month = month_dict.get(match.group('month')[:3])
        # An unrecognised month leaves the date as None rather than producing
        # a string such as "2023-None-11".
        if numeric_month is not None:
            formatted_date = f"{match.group('year')}-{numeric_month}-{match.group('date').zfill(2)}"

        # Date-only values can leave a lone "," (or stray spaces) in the time
        # group; strip those so they count as "no time".
        time_info = (match.group('time') or '').strip(' ,')
        if time_info:
            # Split on the standalone word "to" at most once, so "to" inside
            # another word cannot break the start/end unpacking.
            parts = re.split(r'\s+to\s+', time_info, maxsplit=1)
            start_time = parts[0].strip() or None
            end_time = (parts[1].strip() or None) if len(parts) > 1 else None

    event['Date'] = formatted_date
    event['Day of Week'] = day_of_week
    event['Start Time'] = start_time
    event['End Time'] = end_time




In [None]:

# Widen the console output so long rows are not wrapped when printed.
pd.options.display.width = 1000

# One row per event; the dict keys become the column names.
events_df = pd.DataFrame(events)

print(events_df)

                                                Title                            Date Info                                           Location                                              Image                 Primary Event Type        Date Day of Week Start Time End Time
0   The Art of Collecting: Gifts from the Luther W...                   Wed, Oct 11, 2023,                        Luther W. Brady Art Gallery  https://localist-images.azureedge.net/photos/4...                         Exhibition  2023-10-11         Wed       None     None
1                                          next NEXT_                   Wed, Oct 11, 2023,         Flagg Building, Student Lounge & Gallery 7  https://localist-images.azureedge.net/photos/4...                     Arts & Culture  2023-10-11         Wed       None     None
2   Alumni in Finance & Real Estate: Industry Netw...  Wed, Oct 11, 2023, 8:30am to 9:30am                                               None  https://localist-images.azureedge.net/pho

In [None]:
# Show the column index of the events table as a quick schema check.
column_labels = events_df.columns
print("Column Names:", column_labels)


Column Names: Index(['Title', 'Date Info', 'Location', 'Image', 'Primary Event Type', 'Date', 'Day of Week', 'Start Time', 'End Time'], dtype='object')


In [None]:

# Report how many events are in the table (first axis of the frame's shape).
row_count = events_df.shape[0]
print("\nNumber of Rows:", row_count)



Number of Rows: 21


In [None]:
# Preview the top of the table (head() returns the first five rows by default).
preview = events_df.head(5)
print("\nFirst Few Rows:")
print(preview)



First Few Rows:
                                               Title                            Date Info                                           Location                                              Image Primary Event Type        Date Day of Week Start Time End Time
0  The Art of Collecting: Gifts from the Luther W...                   Wed, Oct 11, 2023,                        Luther W. Brady Art Gallery  https://localist-images.azureedge.net/photos/4...         Exhibition  2023-10-11         Wed       None     None
1                                         next NEXT_                   Wed, Oct 11, 2023,         Flagg Building, Student Lounge & Gallery 7  https://localist-images.azureedge.net/photos/4...     Arts & Culture  2023-10-11         Wed       None     None
2  Alumni in Finance & Real Estate: Industry Netw...  Wed, Oct 11, 2023, 8:30am to 9:30am                                               None  https://localist-images.azureedge.net/photos/4...             Alumni  2023-1

In [None]:
from datetime import datetime


def convert_time(t):
    """Convert a 12-hour clock string such as "8:30am" or "5pm" to a time.

    Args:
        t: Time text in ``%I:%M%p`` or ``%I%p`` form; surrounding whitespace
            is ignored. ``None``, an empty string, any other falsy value, or
            a float NaN (the missing-value marker pandas uses) counts as
            "no time".

    Returns:
        A ``datetime.time``, or ``None`` when no time was supplied.

    Raises:
        TypeError: If ``t`` is a non-string value that is not a recognised
            missing-value marker.
        ValueError: If ``t`` is a string matching neither supported format.
    """
    # NaN is truthy, so it needs its own check (NaN is the only float for
    # which t != t holds).
    if not t or (isinstance(t, float) and t != t):
        return None
    if not isinstance(t, str):
        raise TypeError(f"convert_time() expects a string, got {type(t).__name__}")

    text = t.strip()
    if not text:
        return None

    # Try the more specific "hour:minute" format first, then hour-only.
    for fmt in ('%I:%M%p', '%I%p'):
        try:
            return datetime.strptime(text, fmt).time()
        except ValueError:
            continue
    raise ValueError(f"Unrecognized time format: {t!r}")


# Work on a copy so the original string columns in events_df stay untouched.
events_df_copy = events_df.copy()

# Replace the time strings with datetime.time objects (None where absent).
for time_column in ('Start Time', 'End Time'):
    events_df_copy[time_column] = events_df_copy[time_column].apply(convert_time)

# dropna() removes both None and NaN entries; default=None keeps min()/max()
# from raising ValueError when no event has a time at all.
earliest_start = min(events_df_copy['Start Time'].dropna(), default=None)
latest_end = max(events_df_copy['End Time'].dropna(), default=None)

print("Earliest Start Time:", earliest_start)
print("Latest End Time:", latest_end)



Earliest Start Time: 08:30:00
Latest End Time: 17:00:00


In [None]:
import pandas as pd

# Write the CSV first so the export succeeds even when the browser download
# helper below is unavailable.
events_df.to_csv('events.csv', index=False)

# google.colab only exists inside Google Colab; elsewhere the file is simply
# left in the working directory.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Triggers a browser download of the file just written.
    files.download('events.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>