In [2]:
pip install openai -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install python-dotenv -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import json
import pandas as pd
import openai
import time
from dotenv import load_dotenv
from pathlib import Path

In [5]:
# Load environment variables via python-dotenv, then hand the API key to the
# openai module. The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [6]:
# INPUT_PATH: JSON file read with pd.read_json(..., lines=True) further down.
# OUTPUT_DIR: folder that receives one topic_<id>.json file per cluster.
# NOTE(review): both are absolute paths on one specific machine; adjust them
# before running the notebook anywhere else.
INPUT_PATH = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/BERTopicResult/BERTopic_final_result.json"
OUTPUT_DIR = Path("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/llm_subtopic")

In [7]:
# Few-shot classification prompt. It lists example sentences for each category
# and ends with the text to classify; the {text} placeholder is filled in with
# str.format for every sampled row.
PROMPT_TEMPLATE = """
You are a helpful assistant that classifies text into one or more climate-related categories.

Use the examples below to guide your classification:

- Activism: "Thousands marched in the climate strike demanding government action.", "Youth-led campaigns are pressuring lawmakers to enact stronger climate policies.", "Environmental activists chained themselves to construction equipment to protest deforestation."
- Agriculture: "Crops are failing due to prolonged droughts intensified by climate change.", "Pollinators like bees are disappearing, threatening global food security.", "Rising temperatures are affecting livestock health and reducing dairy production on farms."
- Disaster: "Wildfires have devastated thousands of acres in California.", "Flooding from intense storms has displaced hundreds of families across the region.", "A powerful hurricane made landfall, causing widespread destruction and power outages."
- Fossil: "Oil prices continue to rise amid geopolitical tensions and supply constraints.", "New coal-fired power plants are being built despite international climate agreements.", "Natural gas usage has surged as countries transition away from coal and nuclear energy."
- Lifestyle: "People are embracing minimalist living to reduce their carbon footprint.", "Plant-based diets are gaining popularity for their environmental benefits.", "Air pollution is linked to rising asthma rates in urban areas."
- Politics: "The new administration reversed several environmental protections.", "Communities of color are disproportionately affected by environmental hazards.", "Lawmakers are debating a new bill aimed at cutting national carbon emissions by 2030."
- Renewable: "Government subsidies are making rooftop solar panels more accessible.", "Community wind projects are helping rural areas become energy independent.", "Drought conditions are affecting the electricity output of hydropower plants in Brazil."
- Waste: "Cities are expanding composting and recycling programs to reduce landfill use.", "Single-use plastics are being banned in several countries to combat environmental pollution.", "Innovative startups are turning food waste into sustainable packaging materials."
- Weather: "Global temperatures hit a new record high this year.", "An unprecedented heatwave swept across Europe, breaking temperature records.", "Heavy rainfall and flash floods have disrupted transportation in the region."
- Nature: "Biodiversity is very important for a healthy ecosystem and we should be looking after wildlife.", "Mass extinction of plants and animals is a real danger we have to consider", "Trees are magnificent creatures and I believe are a key element in combating climate change."
- Nuclear: "Nuclear energy is key for our future as we transition to low-carbon power sources.", "Debates continue over the safety and waste management of nuclear power plants.", "Several countries are investing in next-generation nuclear reactors to meet climate goals."
- Electricity: "Electricity demand is expected to surge with the rise of electric vehicles and heat pumps.", "Power outages are becoming more frequent due to aging electrical grids and extreme weather.", "Renewables now supply a growing share of global electricity production."
- Construction: "Green construction practices are reducing the carbon footprint of new buildings.", "Urban expansion is driving increased demand for sustainable construction materials.", "The construction industry faces pressure to cut emissions and improve energy efficiency."
- Transportation: "Public transportation systems are expanding to reduce urban congestion and pollution.", "Electric vehicles are transforming the future of transportation infrastructure.", "Transportation remains a major source of greenhouse gas emissions globally."


If the text belongs to more than one category, list all relevant categories separated by commas.
If the text does not fit any category above, assign a new category name.

Classify the following text:
"{text}"

Response:
"""

In [8]:
def calculate_n(df, max_n=20, z_score=1.96, proportion=0.5, margin_of_error=0.05):
    """Return how many rows to sample from ``df``, capped at ``max_n``.

    The size comes from the finite-population sample-size formula for
    estimating a proportion:

        n = N * z^2 * p * (1 - p) / (e^2 * (N - 1) + z^2 * p * (1 - p))

    where ``N`` is ``len(df)``. The result is rounded with the built-in
    ``round`` and then limited to ``max_n``.

    Args:
        df: Any sized collection (typically a DataFrame); only ``len(df)`` is used.
        max_n: Upper bound on the returned sample size.
        z_score: z value for the desired confidence level (1.96 by default).
        proportion: Assumed population proportion ``p`` (0.5 by default).
        margin_of_error: Tolerated error ``e`` (0.05 by default).

    Returns:
        int: The sample size, never larger than ``max_n``; 0 for an empty input
        with the default parameters.
    """
    N = len(df)
    n = round(
        (N * z_score**2 * proportion * (1 - proportion))
        / (
            (margin_of_error**2 * (N - 1))
            + (z_score**2 * proportion * (1 - proportion))
        )
    )
    return min(n, max_n)

In [9]:
# Read the line-delimited JSON file at INPUT_PATH and expose its "topic" column
# under the name "cluster", which the rest of the notebook uses.
df = (
    pd.read_json(INPUT_PATH, lines=True)
    .rename(columns={"topic": "cluster"})
)

In [10]:
def run_classification(df):
    """Classify a sample of texts from every cluster with GPT and save the results.

    For each distinct value in ``df.cluster`` a sample of ``calculate_n`` rows
    is drawn with a fixed seed, each sampled text is inserted into
    ``PROMPT_TEMPLATE`` and sent to the chat model, and the collected labels
    are written to ``OUTPUT_DIR / "topic_<cluster id zero-padded to 4>.json"``.

    Args:
        df: DataFrame with ``cluster``, ``seq`` and ``text`` columns. A ``cid``
            column is used when present; otherwise ``None`` is stored.

    Returns:
        None. One JSON file per cluster is written as a side effect.

    Notes:
        A failed API call does not abort the run: the exception text is stored
        as that sample's ``gpt_label``, prefixed with ``"ERROR: "``.
    """
    # Loop-invariant: make sure the output directory exists once, up front.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    for cluster_id in df.cluster.unique():
        df_cl = df[df.cluster == cluster_id]
        n = calculate_n(df_cl)
        # Fixed seed so the same rows are drawn on every run.
        df_samp = df_cl.sample(n=n, random_state=42)

        topic_data = {
            "topic": int(cluster_id),
            "samples": []
        }

        # Count positions within the sample explicitly: after .sample() the
        # DataFrame index still holds the rows' original labels, which say
        # nothing about how many samples have been processed.
        for position, (_, row) in enumerate(df_samp.iterrows(), start=1):
            prompt = PROMPT_TEMPLATE.format(text=row["text"])

            try:
                # NOTE(review): `openai.ChatCompletion` is the legacy (pre-1.0)
                # interface of the openai package; with a newer release this
                # call raises and every label becomes an "ERROR: ..." string.
                # Confirm the installed version.
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.0,
                    max_tokens=50
                )
                label = response["choices"][0]["message"]["content"].strip()
            except Exception as e:
                # Best effort: record the failure in place of a label and keep going.
                label = f"ERROR: {str(e)}"

            topic_data["samples"].append({
                "seq": row["seq"],
                "cid": row.get("cid"),
                "text": row["text"],
                "gpt_label": label
            })

            # Report every fifth sample and the final one.
            if position % 5 == 0 or position == n:
                print(f"[Cluster {cluster_id}] Processed {position}/{n} samples")

        topic_file = OUTPUT_DIR / f"topic_{str(cluster_id).zfill(4)}.json"
        with open(topic_file, "w", encoding="utf-8") as f:
            json.dump(topic_data, f, indent=2)

In [12]:
# Run the classification for every cluster in df. This sends one API request per
# sampled row and writes one JSON file per cluster into OUTPUT_DIR.

run_classification(df)



Total unique topics: 98
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
