In [24]:
import requests
from bs4 import BeautifulSoup
import openai
import json
import tiktoken

# Set your OpenAI API key
# NOTE(review): OPENAI_KEY is not defined in this cell -- presumably it is
# set in an earlier notebook cell; confirm before running this cell alone.
openai.api_key = OPENAI_KEY

# Set your Google Cloud API key for Geocoding
# NOTE(review): GOOGLE_GEOCODING_API_KEY is likewise not defined in this
# cell -- presumably set earlier; confirm. GOOGLE_API_KEY is what the main
# part of the script passes to geocode_location().
GOOGLE_API_KEY = GOOGLE_GEOCODING_API_KEY

# Function to count tokens (same as before)
def count_tokens(text, model="gpt-4o-mini"):
    """Return how many tokens `text` occupies under `model`'s tiktoken encoding."""
    # Look up the encoding registered for the model, then tokenise the text.
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

# Function to chunk text (same as before, maybe adjust max_tokens for gpt-4o-mini)
def chunk_text(text, max_tokens=120000, model="gpt-4o-mini", token_counter=None):
    """Split text into paragraph-aligned chunks that fit a token budget.

    Paragraphs (separated by a blank line, i.e. ``"\\n\\n"``) are packed
    greedily into chunks. A chunk is closed as soon as adding the next
    paragraph would reach ``max_tokens - 500``; the 500 tokens are held back
    as a safety margin (presumably for the prompt wrapped around each chunk).

    Each paragraph is tokenised exactly once and the counts are summed, so
    the running size is an estimate of the joined text's token count rather
    than an exact figure. This avoids re-tokenising the whole growing chunk
    for every paragraph, which is quadratic on long articles.

    Args:
        text: The text to split. Empty or ``None`` yields an empty list.
        max_tokens: Upper bound on tokens per chunk before the margin.
        model: Model name forwarded to the token counter.
        token_counter: Optional callable ``(text, model) -> int``. Defaults
            to this module's ``count_tokens``.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings.

    Note:
        A single paragraph larger than the budget is not split; it is
        emitted as a chunk of its own.
    """
    if not text:
        return []
    if token_counter is None:
        # Resolved lazily so the default tracks the module-level helper.
        token_counter = count_tokens

    separator = "\n\n"
    budget = max_tokens - 500
    separator_cost = token_counter(separator, model)

    chunks = []
    pending = []          # paragraphs collected for the chunk being built
    pending_tokens = 0    # estimated token count of `pending` once joined

    for paragraph in text.split(separator):
        cost = token_counter(paragraph, model)
        if pending:
            cost_with_separator = cost + separator_cost
            if pending_tokens + cost_with_separator < budget:
                pending.append(paragraph)
                pending_tokens += cost_with_separator
                continue
            # The paragraph does not fit: close the current chunk. Chunks
            # that are empty after stripping are never emitted.
            finished = separator.join(pending).strip()
            if finished:
                chunks.append(finished)
        # Start a new chunk with this paragraph, even if it alone exceeds
        # the budget (it cannot be made smaller at paragraph granularity).
        pending = [paragraph]
        pending_tokens = cost

    finished = separator.join(pending).strip()
    if finished:
        chunks.append(finished)

    return chunks

# Function to scrape Wikipedia (same as before)
def scrape_wikipedia_protests(url, timeout=30):
    """
    Scrapes the content of a Wikipedia page and returns the text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on a stalled connection.

    Returns:
        The text of the element with id "mw-content-text", or None when the
        request fails (the error is printed) or the element is not present.
    """
    # Only the network call can raise RequestException, so keep the try
    # block limited to it; timeouts are a RequestException subclass.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error during scraping: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    content_div = soup.find(id="mw-content-text")
    if content_div is None:
        return None
    return content_div.get_text()

# Function to extract protest info with OpenAI (same as before)
def extract_protest_info_with_openai(text):
    """
    Uses the OpenAI API to extract protest locations, dates, and the source text
    from text using the new API structure (>= 1.0.0).

    The request uses the "json_object" response format, so the prompt asks
    for a JSON object with one top-level "protests" key instead of a bare
    list. Callers can then rely on that key rather than on whatever name the
    model happens to pick.

    Args:
        text: The text to analyse. Empty or None short-circuits to None.

    Returns:
        The parsed JSON response (expected shape:
        {"protests": [{"location", "date", "source_text"}, ...]}), or None
        if there was no input, the API call failed, or the reply was empty
        or not valid JSON. Failures are printed, not raised.
    """
    if not text:
        return None

    prompt = f"""
Extract the locations and dates of the student protests mentioned in the following text.
For each protest, also include the exact paragraph of text from the source document where you found the date and location information.
Return the information as a JSON object with a single top-level key "protests" whose value is a list of dictionaries. Each dictionary should have the following keys:
"location": The location of the protest.
"date": The date of the protest.
"source_text": The paragraph from the original text where this information was found.
If no protests are mentioned, return {{"protests": []}}.

Text:
{text}
"""

    try:
        client = openai.OpenAI(api_key=openai.api_key)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts information from text."},
                {"role": "user", "content": prompt}
            ],
            response_format={ "type": "json_object" }
        )

        json_string = response.choices[0].message.content
        if not json_string:
            # The message content can be absent (e.g. a refusal); json.loads
            # would otherwise fail with an unhelpful TypeError.
            print("Error during OpenAI API call: empty response content")
            return None

        protest_data = json.loads(json_string)
        print("Extracted protest data:")
        print(json.dumps(protest_data, indent=4))
        return protest_data

    except json.JSONDecodeError as e:
        # Typically a reply that was cut off before the JSON was complete.
        print(f"Error decoding OpenAI response as JSON: {e}")
        return None
    except Exception as e:
        # Deliberately broad: extraction is best-effort per chunk, and one
        # failed chunk must not abort processing of the remaining chunks.
        print(f"Error during OpenAI API call: {e}")
        return None


# Function to geocode a location using a direct API call
def geocode_location(location_name, api_key, timeout=10):
    """
    Uses Google's Geocoding API (direct HTTP request) to get the latitude and longitude of a location.

    Args:
        location_name: Free-text address or place name to look up.
        api_key: Google API key sent as the "key" query parameter.
        timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the whole run.

    Returns:
        A dictionary with 'latitude' and 'longitude' taken from the first
        result, or None if geocoding fails. Failures are printed, not raised.
    """
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json"

    params = {
        "address": location_name,
        "key": api_key
    }

    try:
        response = requests.get(geocode_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        geocode_result = response.json()

        status = geocode_result["status"]
        if status != "OK":
            print(f"Could not geocode location: {location_name}. Status: {status}")
            if "error_message" in geocode_result:
                print(f"Error message: {geocode_result['error_message']}")
            return None

        # Only the first (best-ranked) result is used.
        location = geocode_result["results"][0]["geometry"]["location"]
        return {"latitude": location["lat"], "longitude": location["lng"]}

    except requests.exceptions.RequestException as e:
        print(f"Error during geocoding API call for '{location_name}': {e}")
        return None
    except ValueError as e:
        # Body was not valid JSON (raised as plain ValueError by some
        # versions of requests).
        print(f"Error parsing geocoding response for '{location_name}': invalid JSON ({e})")
        return None
    except KeyError as e:
        print(f"Error parsing geocoding response for '{location_name}': Missing key {e}")
        return None
    except IndexError:
        # Status was "OK" but the results list was empty.
        print(f"Error parsing geocoding response for '{location_name}': empty results list")
        return None

# Main part of the script: scrape the article, extract protest events chunk
# by chunk with the LLM, then attach coordinates to each event.
wikipedia_url = "https://en.wikipedia.org/wiki/2020%E2%80%932021_Thai_protests"
article_text = scrape_wikipedia_protests(wikipedia_url)

if article_text:
    text_chunks = chunk_text(article_text)
    all_protest_data = []

    print(f"Processing {len(text_chunks)} chunks...")

    for i, chunk in enumerate(text_chunks, start=1):
        print(f"Processing chunk {i}/{len(text_chunks)}...")
        protest_data_chunk = extract_protest_info_with_openai(chunk)

        # The model may answer with {"protests": [...]} or with a bare list;
        # accept both shapes and reject everything else.
        if isinstance(protest_data_chunk, dict):
            chunk_events = protest_data_chunk.get("protests")
        else:
            chunk_events = protest_data_chunk

        if isinstance(chunk_events, list):
            all_protest_data.extend(chunk_events)
        else:
            print(f"Could not extract valid protest data from chunk {i}")

    # Now, geocode the locations in the extracted data
    protest_data_with_coords = []
    # Many events share a location, so each distinct name is looked up only
    # once. Failed lookups are cached as None and are not retried.
    geocode_cache = {}
    print("\nGeocoding locations...")

    for protest_event in all_protest_data:
        location_name = (
            protest_event.get("location")
            if isinstance(protest_event, dict)
            else None
        )
        if isinstance(location_name, str) and location_name.strip():
            if location_name not in geocode_cache:
                # Pass the GOOGLE_API_KEY to the geocode_location function
                geocode_cache[location_name] = geocode_location(
                    location_name, GOOGLE_API_KEY
                )
            coordinates = geocode_cache[location_name]
            if coordinates:
                protest_event["latitude"] = coordinates["latitude"]
                protest_event["longitude"] = coordinates["longitude"]
        # Keep every event, whether or not it has a location or could be
        # geocoded.
        protest_data_with_coords.append(protest_event)

    if protest_data_with_coords:
        print("\nCombined Protest Data with Coordinates:")
        print(json.dumps({"protests": protest_data_with_coords}, indent=4))
    else:
        print("No protest data with coordinates extracted.")

else:
    print("Could not scrape the Wikipedia article.")


Processing 1 chunks...
Processing chunk 1/1...
Extracted protest data:
{
    "protests": [
        {
            "location": "Democracy Monument, Bangkok",
            "date": "18 July 2020",
            "source_text": "On 18 July 2020, Thailand saw the largest street demonstration since the 2014 Thai coup d'\u00e9tat at the Democracy Monument in Bangkok with around 2,500 protesters."
        },
        {
            "location": "Democracy Monument, Bangkok",
            "date": "16 August 2020",
            "source_text": "On 16 August, a large gathering which around 20,000\u201325,000 people joined was held at the Democracy Monument and reiterated calls for a revised constitution and reforms to the monarchy."
        },
        {
            "location": "Thammasat University, Bangkok",
            "date": "19 September 2020",
            "source_text": "In a rally described as one of the largest protests in years, on 19 September, protesters gathered at Thammasat University, then mov