# Exploration of the data scraped from the meetup past event page

This notebook explores the data scraped from the past-events page of the Continuous Testing Meetup.
For now, it only compares the number of attendees at each event with the maximum temperature on the day of the event.
The hypothesis is that the temperature on the day of an event affects the number of attendees.

I have limited the analysis to events from 2019 onward, because earlier data is not representative of the current meetup.


### Helper functions:

In [5]:
# Define helper functions here
import os
import time
import json
import requests
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objects as go
from datetime import datetime, timedelta

# Constants
# Relative directory holding the event data and the cached weather data JSON files
DATA_PATH = '../data/'
# Relative directory where the generated HTML plot is written
PLOTS_PATH = '../plots/'
DELAY = 1  # Seconds to sleep before each weather API request, to respect rate limits
# Weather records are loaded from and saved to this file, so they persist between runs
WEATHER_DATA_FILE = os.path.join(DATA_PATH, 'weather_data.json')

def load_json_from_file(file_path):
    """
    Load JSON data from a file.
    If the file doesn't exist, create it (together with any missing parent
    directories) containing an empty JSON list, and return an empty list.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        # The parent directory may be missing too; without creating it the
        # fallback write below would raise FileNotFoundError again.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump([], f, indent=4)
        return []

def save_json_to_file(data, file_path):
    """Write `data` to `file_path` as JSON indented by four spaces, replacing any previous content."""
    with open(file_path, mode='w') as out_file:
        json.dump(data, out_file, indent=4)

def fetch_weather_data(event_id, date, is_forecast, timeout=10):
    """
    Fetch the daily maximum temperature for one event from the open-meteo API.

    Parameters:
        event_id: Value stored under 'id' in the returned dict.
        date: Day to query; sent as both 'start_date' and 'end_date'.
        is_forecast: If truthy, query the forecast endpoint, otherwise the archive endpoint.
        timeout: Seconds to wait for the HTTP request before requests raises a
            timeout error, so a stalled connection cannot hang forever.

    Returns:
        None if the API response contains an 'error' key. Otherwise a dict with
        'id' and, when the API returned a temperature for the day, 'max_temp'
        rounded to two decimals.
    """
    # Respect the rate limit
    time.sleep(DELAY)

    # API endpoint
    if is_forecast:
        endpoint = 'api.open-meteo.com/v1/forecast'
    else:
        endpoint = 'archive-api.open-meteo.com/v1/archive'
    url = f"https://{endpoint}"

    # API parameters
    # NOTE(review): the location is hard-coded; presumably Berlin given the
    # 'Europe/Berlin' timezone - confirm.
    params = {
        'latitude': 52.52,
        'longitude': 13.41,
        'start_date': date,
        'end_date': date,
        'daily': 'temperature_2m_max',
        'timezone': 'Europe/Berlin'
    }

    # API request
    response = requests.get(url, params=params, timeout=timeout)
    data = response.json()

    # If there's an error in the response, return None
    if 'error' in data:
        print(f'No data available for {date}')
        return None

    # Construct weather data for the event
    weather_data = {'id': event_id}

    # If temperature data is available, add it to the weather data.
    # A missing, null or empty list is treated as "no temperature" instead of
    # raising on the [0] access.
    daily_max_temps = (data.get('daily') or {}).get('temperature_2m_max') or []
    if daily_max_temps and daily_max_temps[0] is not None:
        weather_data['max_temp'] = round(daily_max_temps[0], 2)

    return weather_data

def collect_weather_data(event_df, existing_weather_data, max_requests=None):
    """
    Collect weather data for the events in the event DataFrame.

    Events whose id already has a 'max_temp' in 'existing_weather_data' are
    skipped. For every other event the weather is fetched; a successful result
    replaces any earlier record for the same id, and the list is written to
    WEATHER_DATA_FILE right away so progress survives an interruption.
    'existing_weather_data' is modified in place.

    If 'max_requests' is provided, limit the number of API requests to that number.
    """
    # Initialize request count
    request_count = 0

    # Ids that already have a temperature. Compare against None instead of
    # relying on truthiness, so a valid 0.0 degree reading counts as complete
    # and is not fetched again on every run.
    complete_ids = {
        entry.get('id')
        for entry in existing_weather_data
        if entry.get('max_temp') is not None
    }

    # Iterate over events
    for _, event in event_df.iterrows():
        event_id = int(event['id'])
        if event_id in complete_ids:
            continue

        event_date = event['date'].to_pydatetime().replace(tzinfo=None)  # Make timezone naive
        # Events newer than five days ago are requested from the forecast
        # endpoint, older ones from the archive endpoint.
        is_forecast = event_date > datetime.now() - timedelta(days=5)

        weather_data = fetch_weather_data(event_id, str(event_date.date()), is_forecast)

        if weather_data is not None:
            # Drop stale (incomplete) records for this id before appending, so
            # the cache never holds two records with the same id. Slice
            # assignment keeps the caller's list object.
            existing_weather_data[:] = [
                entry for entry in existing_weather_data if entry.get('id') != event_id
            ]
            existing_weather_data.append(weather_data)
            save_json_to_file(existing_weather_data, WEATHER_DATA_FILE)
            if weather_data.get('max_temp') is not None:
                complete_ids.add(event_id)

        # Increment the request count (failed fetches count as well)
        request_count += 1

        # If we've reached the maximum number of requests, break the loop
        if max_requests is not None and request_count >= max_requests:
            break

### Load data, perform manipulation, and create visualization


In [6]:
# Load and process event data
event_df = pd.read_json(os.path.join(DATA_PATH, 'event_data.json'))

# Convert 'date' column to datetime format
event_df['date'] = pd.to_datetime(event_df['date'], format='%Y-%m-%d')

# Filter out data from before 2019. Take a copy so the column assignments
# below work on an independent frame rather than a slice of the original
# (avoids pandas' SettingWithCopyWarning).
event_df = event_df[event_df['date'].dt.year >= 2019].copy()

# Sort data chronologically
event_df = event_df.sort_values('date')

# Calculate average number of attendees and their difference from the average
average_attendees = event_df['attendees'].mean().round(0)
event_df['attendee_difference'] = event_df['attendees'] - average_attendees

# Load existing weather data
existing_weather_data = load_json_from_file(WEATHER_DATA_FILE)

# Collect weather data
collect_weather_data(event_df, existing_weather_data)

# Load updated weather data
weather_df = pd.DataFrame(load_json_from_file(WEATHER_DATA_FILE))

# Merge event data with weather data
if not weather_df.empty and 'id' in weather_df.columns:
    # Keep only the most recently stored record per event; duplicate ids would
    # otherwise duplicate event rows in the left merge.
    weather_df = weather_df.drop_duplicates(subset='id', keep='last')
    event_df = pd.merge(event_df, weather_df, on='id', how='left')
else:
    # No weather data collected at all: there is nothing to merge on.
    event_df = event_df.reset_index(drop=True)

# Make sure the temperature column exists even if no record carries one
if 'max_temp' not in event_df.columns:
    event_df['max_temp'] = float('nan')

# Normalize temperature values to a 0-1 scale
min_temp = event_df['max_temp'].min()
max_temp = event_df['max_temp'].max()
temp_range = max_temp - min_temp
if temp_range > 0:
    event_df['normalized_temp'] = (event_df['max_temp'] - min_temp) / temp_range
else:
    # All known temperatures are equal (or none is known): avoid dividing by
    # zero and put every known temperature in the middle of the color scale.
    event_df['normalized_temp'] = event_df['max_temp'].where(event_df['max_temp'].isna(), 0.5)

# Create custom hover text
event_df['hover_text'] = 'CTM #' + event_df['id'].astype(str) + '<br>' + event_df['title'] + '<br>' + 'Attendees: ' + event_df['attendees'].astype(str)

# If max temperature data is available, add it to the hover text
event_df.loc[event_df['max_temp'].notnull(), 'hover_text'] += '<br>' + 'Max Temp: ' + event_df['max_temp'].astype(str) + '°C'

# Create the plot
fig = go.Figure()

# Bars: attendee difference from the average, colored by normalized temperature
fig.add_trace(go.Bar(x=event_df['id'], y=event_df['attendee_difference'], hovertext=event_df['hover_text'], hoverinfo='text',
                     marker=dict(color=event_df['normalized_temp'], colorscale='RdBu_r', opacity=0.6),  # Using reversed 'RdBu' color scale and some transparency
                     marker_line_width=0,  # No borders around bars
                     yaxis='y1'))

# Second trace: total attendees on the right-hand axis
fig.add_trace(go.Scatter(x=event_df['id'], y=event_df['attendees'], hovertext=event_df['hover_text'], hoverinfo='text',
                         mode='text',
                         line=dict(color='rgb(255,255,255)'),
                         showlegend=False,
                         yaxis='y2'))

# Configure plot layout
fig.update_layout(xaxis=dict(showticklabels=False),
                  yaxis1=dict(title='Attendees Compared to Average', showgrid=False, zeroline=False),
                  yaxis2=dict(title='Total Attendees', showgrid=False, zeroline=False, overlaying='y', side='right'),
                  plot_bgcolor='rgb(40,40,40)', 
                  paper_bgcolor='rgb(40,40,40)', 
                  font=dict(color='rgb(255,255,255)'),
                  title_text='Attendees Compared to Average Over Time',
                  showlegend=False)

# Display the figure and save it as HTML
fig.show()
# Make sure the output directory exists before writing the HTML file
os.makedirs(PLOTS_PATH, exist_ok=True)
pyo.plot(fig, filename=os.path.join(PLOTS_PATH, 'event_analysis_plot.html'))

'../plots/event_analysis_plot.html'