# Requirements

In [None]:
!pip install -r requirements.txt

# Imports

In [1]:
from scrapers.iloveqatar import ILoveQatarScraper
from scrapers.visitqatar import VisitQatarScraper
from scrapers.qatarmuseums import QatarMuseumsScraper
from models import Event
from typing import List
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import os

# Authenticate with Google Sheets

In [2]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials

scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json", scope)
client = gspread.authorize(creds)

# Open Sheets

In [3]:
# Open the spreadsheet by name
spreadsheet = client.open("EventScrapes")

# Access or create worksheets for each source
worksheet_names = ["ILoveQatar", "VisitQatar", "QatarMuseums"]
worksheets = {}

for name in worksheet_names:
    try:
        worksheets[name] = spreadsheet.worksheet(name)
    except gspread.exceptions.WorksheetNotFound:
        worksheets[name] = spreadsheet.add_worksheet(title=name, rows="1000", cols="20")


# Config

In [4]:
# Choose which scrapers to run
# You can remove a scraper by prefixing it with #, eg #ILoveQatarScraper(),
scrapers = [
    ILoveQatarScraper(2),
    VisitQatarScraper(),
    QatarMuseumsScraper(2),
]

# Allows you to save results for each source as well
save_individual_results = False
save_to_google_sheets = True

# Initialize functions

In [5]:
def run_scrapers(scrapers: list) -> List[Event]:
    all_events = []
    for scraper in scrapers:
        try:
            print(f"\n{'='*50}")
            print(f"Running {scraper.source_name} scraper...")
            events = scraper.scrape_events()
            if save_to_google_sheets:
                events_df = pd.DataFrame([event.to_dict() for event in events])
                worksheet = worksheets[scraper.source_name]
                append_new_events_to_sheet(events_df, worksheet)
            all_events.extend(events)
            print(f"Found {len(events)} events from {scraper.source_name}")
            
            # Save individual scraper results
            if save_individual_results:
                scraper.save_to_csv(events)
        except Exception as e:
            print(f"Error with {scraper.source_name} scraper: {e}")
    
    return all_events

def append_new_events_to_sheet(events_df, worksheet):
    # Sanitize any list-type values (Google Sheets does not support lists)
    def sanitize(value):
        if isinstance(value, list):
            return ", ".join(str(v) for v in value)
        return value

    events_df = events_df.copy()
    events_df["unique_key"] = events_df["title"] + events_df["start_date"] + events_df["location"]
    sanitized_df = events_df.applymap(sanitize)

    # Get existing records to detect duplicates
    existing_records = worksheet.get_all_records()
    if existing_records:
        existing_df = pd.DataFrame(existing_records)
        existing_df["unique_key"] = existing_df["title"] + existing_df["start_date"] + existing_df["location"]
        new_events_df = sanitized_df[~sanitized_df["unique_key"].isin(existing_df["unique_key"])]
    else:
        new_events_df = sanitized_df
        # Also write headers if sheet is empty
        worksheet.update([list(events_df.columns[:-1])], range_name="A1")

    if new_events_df.empty:
        print(f"No new events to add to {worksheet.title}.")
        return

    # Insert new rows at the top (after headers)
    insert_rows = new_events_df.drop(columns=["unique_key"]).values.tolist()
    ##### Batch the updates #####
    existing = worksheet.get_all_values()
    num_new = len(insert_rows)
    
    # Build the new values (headers + new + existing)
    new_values = [existing[0]] if existing else [list(events_df.columns[:-1])]
    new_values += insert_rows
    if existing:
        new_values += existing[1:]
    
    # Overwrite the entire sheet in one write
    worksheet.update(new_values)
    print(f"Inserted {num_new} new events into {worksheet.title}.")


    print(f"Inserted {len(insert_rows)} new events into {worksheet.title}.")

# Run 

In [6]:
all_events = run_scrapers(scrapers)


Running ILoveQatar scraper...
Scraping page 1...
Scraping page 2...


  sanitized_df = events_df.applymap(sanitize)


Inserted 10 new events into ILoveQatar.
Inserted 10 new events into ILoveQatar.
Found 20 events from ILoveQatar

Running VisitQatar scraper...


  sanitized_df = events_df.applymap(sanitize)


No new events to add to VisitQatar.
Found 74 events from VisitQatar

Running QatarMuseums scraper...
Scraping page 1...
Scraping page 2...


  sanitized_df = events_df.applymap(sanitize)


Inserted 11 new events into QatarMuseums.
Inserted 11 new events into QatarMuseums.
Found 26 events from QatarMuseums


In [None]:
events_df = pd.DataFrame([event.to_dict() for event in all_events])
events_df.head()

# Statistics

In [None]:
print("\nEvent Statistics:")
print(f"Total events: {len(all_events)}")
# By source
source_counts = events_df['source'].value_counts()
print("\nBy source:")
print(source_counts)
# By category (if available)
if 'category' in events_df.columns:
    category_counts = events_df['category'].value_counts()
    print("\nBy category:")
    print(category_counts)

In [None]:
plt.figure(figsize=(10, 5))
source_counts.plot(kind='bar', color=['skyblue', 'lightgreen'])
plt.title('Number of Events by Source')
plt.xlabel('Source')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

In [None]:
if 'category' in events_df.columns:
    plt.figure(figsize=(12, 6))
    category_counts.plot(kind='bar', color='lightcoral')
    plt.title('Number of Events by Category')
    plt.xlabel('Category')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Save results

In [None]:
combined_filename = "combined_events.csv"

# Convert events to DataFrame
new_combined_df = pd.DataFrame([e.to_dict() for e in all_events])

# Append if file exists, avoid duplicates
if os.path.exists(combined_filename):
    existing_df = pd.read_csv(combined_filename)
    combined_df = pd.concat([existing_df, new_combined_df], ignore_index=True)
    combined_df.drop_duplicates(subset=["title", "start_date", "location"], inplace=True)
else:
    combined_df = new_combined_df

combined_df.to_csv(combined_filename, index=False)
print(f"\nSaved {len(combined_df)} total unique events to {combined_filename}")

# Raw inspection

In [None]:
print("\nSample Event Details:")
for idx, event in enumerate(all_events[:3], 1):  # Show first 3 events
    print(f"\nEvent {idx}:")
    print(f"Title: {event.title}")
    print(f"Date: {event.start_date} to {event.end_date}")
    print(f"Time: {event.time}")
    print(f"Location: {event.location}")
    print(f"Source: {event.source}")