<a href="https://colab.research.google.com/github/Shirley31415926/API_heatwave/blob/main/08%20Topic%20Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the datasets loaded later in this
# notebook are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install BERTopic with its "all" extras plus sentence-transformers
# (-q keeps pip output quiet).
!pip install bertopic[all] -q
!pip install sentence-transformers -q

In [None]:
# BERTopic and the plotting / dimensionality-reduction packages it is used with here.
# NOTE(review): bertopic is also installed by the previous cell and again at the
# top of each year's section, so this cell looks redundant; confirm before removing.
!pip install bertopic
!pip install plotly
!pip install umap-learn

# 2023 Topic modelling

In [None]:
# Install the topic-modelling packages used in this section (-q keeps pip output quiet).
!pip install bertopic umap-learn plotly nltk sentence-transformers -q


import re
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import plotly.io as pio

# Use the Colab renderer for Plotly figures.
pio.renderers.default = "colab"

# Download the NLTK stop-word corpus and keep the English list; it is passed
# to the CountVectorizer when the topic model is built.
nltk.download("stopwords")
STOP_WORDS = stopwords.words("english")

# Load the 2023 posts from the mounted Drive; lines=True reads the file as
# JSON Lines (one JSON record per line).
DATA_PATH = "/content/drive/MyDrive/heat_posts2023.jsonl"
posts_df = pd.read_json(DATA_PATH, lines=True)

def clean_text(text: str) -> str:
    """Normalise a post for topic modelling.

    The text is lowercased, URLs (``http(s)://...`` and ``www....``) are
    dropped, and every character that is not ``a-z`` or whitespace is turned
    into a space. Runs of whitespace are then collapsed to a single space and
    the result is stripped.

    Args:
        text: Raw post text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text, or ``""`` when there is nothing usable.
    """
    # Missing cells reach this function as float NaN / None via Series.apply;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # Substitute a space rather than nothing so that words separated only by
    # punctuation stay separate ("hot,humid" -> "hot humid", not "hothumid").
    # Note: after lowercasing, non-ASCII letters are removed by this pattern too.
    text = re.sub(r"[^a-z\s]", " ", text)
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(text.split())

# Add a cleaned-text column and a calendar-date column derived from created_at
raw_posts = posts_df["text"]
posts_df["clean_text"] = raw_posts.apply(clean_text)
created_at = pd.to_datetime(posts_df["created_at"])
posts_df["date"] = created_at.dt.date

# Plain Python lists of the two columns, used as BERTopic inputs further down
texts = posts_df["clean_text"].to_list()
dates = posts_df["date"].to_list()

# Fit the BERTopic model
# min_topic_size is kept very small so that even tiny clusters survive.
sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Unigrams and bigrams, with the NLTK English stop words removed
ngram_vectorizer = CountVectorizer(stop_words=STOP_WORDS, ngram_range=(1, 2))

topic_model = BERTopic(
    language="english",
    min_topic_size=2,
    embedding_model=sentence_encoder,
    vectorizer_model=ngram_vectorizer,
    calculate_probabilities=True,
    verbose=True,
)

# fit_transform returns one topic id per document plus the probabilities
fit_result = topic_model.fit_transform(texts)
topics, probs = fit_result
posts_df["topic"] = topics

# Auto-generate a short label for each topic by joining its first two keywords
# with "_". Topic -1 (BERTopic's outlier bucket) is skipped, so posts assigned
# to it fall back to "Other" in the column below.
summary_map = {
    topic_id: "_".join(word for word, _ in topic_model.get_topic(topic_id)[:2])
    for topic_id in topic_model.get_topic_info().Topic
    if topic_id != -1
}

posts_df["topic_summary"] = posts_df["topic"].map(
    lambda topic_id: summary_map.get(topic_id, "Other")
)

print("Topic summaries:")
for topic_id, summary in summary_map.items():
    print(f"  {topic_id:2d} → {summary}")

# "Health / Energy / Policy / Climate" mapping: label -> keywords that indicate it.
# assign_label compares these against a topic's keywords by exact string match.
# "air conditioning" is a two-word entry, so it can only match a bigram keyword
# (the CountVectorizer above is configured with ngram_range=(1,2)).
label_dict = {
    "Health":  ["heatstroke","hospital","dehydration","emergency","risk","dizzy","overheat"],
    "Energy":  ["power","outage","electricity","grid","air conditioning","cooling","blackout"],
    "Policy":  ["government","response","support","aid","fund","relief","governor"],
    "Climate": ["climate","crisis","warming","carbon","emission","anxiety"]
}

def assign_label(keywords, mapping):
    """Pick the label whose keyword list overlaps most with a topic's keywords.

    Args:
        keywords: Iterable of ``(word, score)`` pairs describing one topic
            (callers pass ``topic_model.get_topic(t)``). A falsy value, such
            as an empty list, ``None`` or ``False``, means "no keywords".
        mapping: Dict of label -> list of keyword strings.

    Returns:
        The label with the most exact keyword matches. Ties go to the label
        that comes first in ``mapping``. ``"Other"`` is returned when nothing
        matches, when ``keywords`` is falsy, or when ``mapping`` is empty.
    """
    # Guard the degenerate inputs: iterating a falsy non-iterable (False/None)
    # raises TypeError, and max() over an empty mapping raises ValueError.
    if not keywords or not mapping:
        return "Other"

    keys = {word for word, _ in keywords}

    best_label, best_score = "Other", 0
    for lbl, words in mapping.items():
        score = len(keys.intersection(words))
        # Strictly greater keeps the first label on ties, like max() would.
        if score > best_score:
            best_label, best_score = lbl, score
    return best_label

# Resolve a coarse semantic label for every summarised topic...
topic_label_map = {}
for topic_id in summary_map:
    topic_keywords = topic_model.get_topic(topic_id)
    topic_label_map[topic_id] = assign_label(topic_keywords, label_dict)

# ...and tag each post with it; topics without an entry become "Other".
posts_df["semantic_label"] = posts_df["topic"].map(
    lambda topic_id: topic_label_map.get(topic_id, "Other")
)

# Visualize topic-summary trends over time
# Pivot to one row per date and one column per topic summary; cells are post counts.
time_df = posts_df.groupby(["date", "topic_summary"]).size().unstack(fill_value=0)

# Keep only the summaries with the most posts overall. The cut-off is shared
# with the plot title so the two cannot disagree. Note that the "Other" bucket
# (posts whose topic has no summary) competes in this ranking like any other column.
TOP_N_SUMMARIES = 10
top10 = time_df.sum().sort_values(ascending=False).head(TOP_N_SUMMARIES).index

plt.figure(figsize=(14, 6))
for lbl in top10:
    plt.plot(time_df.index, time_df[lbl], marker=".", label=lbl)

plt.title(f"Top-{TOP_N_SUMMARIES} BERTopic Summaries Over Time")
plt.xlabel("Date")
plt.ylabel("Post Count")
plt.xticks(rotation=45)
# Legend is anchored outside the axes (to the right) so it does not cover the lines.
plt.legend(title="Topic Summary", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

# Save results: the annotated posts and the per-date topic table
# NOTE(review): the 2024 section further down writes to these same two file
# names, so running the whole notebook overwrites these outputs; confirm intended.
posts_df.to_csv("heatwave_posts_with_topics.csv", index=False)
topic_timeline = topic_model.topics_over_time(texts, dates)
pd.DataFrame(topic_timeline).to_csv("topics_over_time.csv", index=False)


In [None]:
# Show BERTopic's bar-chart visualisation, limited to 35 topics (top_n_topics=35).
topic_model.visualize_barchart(top_n_topics=35).show()

# 2024 Topic modelling

In [None]:
# Install the topic-modelling packages used in this section (-q keeps pip output quiet).
!pip install bertopic umap-learn plotly nltk sentence-transformers -q

import re
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import plotly.io as pio


# Use the Colab renderer for Plotly figures.
pio.renderers.default = "colab"

# Download the NLTK stop-word corpus and keep the English list; it is passed
# to the CountVectorizer when the topic model is built.
nltk.download("stopwords")
STOP_WORDS = stopwords.words("english")

# Load the 2024 posts from the mounted Drive; lines=True reads the file as
# JSON Lines (one JSON record per line).
# NOTE: this rebinds posts_df, replacing the 2023 frame loaded earlier in the notebook.
DATA_PATH = "/content/drive/MyDrive/heat_posts2024_standardized.jsonl"
posts_df = pd.read_json(DATA_PATH, lines=True)

def clean_text(text: str) -> str:
    """Normalise a post for topic modelling.

    The text is lowercased, URLs (``http(s)://...`` and ``www....``) are
    dropped, and every character that is not ``a-z`` or whitespace is turned
    into a space. Runs of whitespace are then collapsed to a single space and
    the result is stripped.

    Args:
        text: Raw post text. Non-string values (for example ``None`` or the
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text, or ``""`` when there is nothing usable.
    """
    # Missing cells reach this function as float NaN / None via Series.apply;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"https?://\S+", " ", text)
    text = re.sub(r"www\.\S+", " ", text)
    # Substitute a space rather than nothing so that words separated only by
    # punctuation stay separate ("hot,humid" -> "hot humid", not "hothumid").
    # Note: after lowercasing, non-ASCII letters are removed by this pattern too.
    text = re.sub(r"[^a-z\s]", " ", text)
    # split()/join collapses any whitespace run and trims both ends.
    return " ".join(text.split())

# Add a cleaned-text column and a calendar-date column derived from created_at
raw_posts = posts_df["text"]
posts_df["clean_text"] = raw_posts.apply(clean_text)
created_at = pd.to_datetime(posts_df["created_at"])
posts_df["date"] = created_at.dt.date

# Plain Python lists of the two columns, used as BERTopic inputs further down
texts = posts_df["clean_text"].to_list()
dates = posts_df["date"].to_list()


# Fit the BERTopic model
# min_topic_size is kept very small so that even tiny clusters survive.
sentence_encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Unigrams and bigrams, with the NLTK English stop words removed
ngram_vectorizer = CountVectorizer(stop_words=STOP_WORDS, ngram_range=(1, 2))

topic_model = BERTopic(
    language="english",
    min_topic_size=2,
    embedding_model=sentence_encoder,
    vectorizer_model=ngram_vectorizer,
    calculate_probabilities=True,
    verbose=True,
)

# fit_transform returns one topic id per document plus the probabilities
fit_result = topic_model.fit_transform(texts)
topics, probs = fit_result
posts_df["topic"] = topics


# Auto-generate a short label for each topic by joining its first two keywords
# with "_". Topic -1 (BERTopic's outlier bucket) is skipped, so posts assigned
# to it fall back to "Other" in the column below.
summary_map = {
    topic_id: "_".join(word for word, _ in topic_model.get_topic(topic_id)[:2])
    for topic_id in topic_model.get_topic_info().Topic
    if topic_id != -1
}

posts_df["topic_summary"] = posts_df["topic"].map(
    lambda topic_id: summary_map.get(topic_id, "Other")
)

print("Topic summaries:")
for topic_id, summary in summary_map.items():
    print(f"  {topic_id:2d} → {summary}")

# "Health / Energy / Policy / Climate" mapping: label -> keywords that indicate it.
# assign_label compares these against a topic's keywords by exact string match.
# "air conditioning" is a two-word entry, so it can only match a bigram keyword
# (the CountVectorizer above is configured with ngram_range=(1,2)).
label_dict = {
    "Health":  ["heatstroke","hospital","dehydration","emergency","risk","dizzy","overheat"],
    "Energy":  ["power","outage","electricity","grid","air conditioning","cooling","blackout"],
    "Policy":  ["government","response","support","aid","fund","relief","governor"],
    "Climate": ["climate","crisis","warming","carbon","emission","anxiety"]
}

def assign_label(keywords, mapping):
    """Pick the label whose keyword list overlaps most with a topic's keywords.

    Args:
        keywords: Iterable of ``(word, score)`` pairs describing one topic
            (callers pass ``topic_model.get_topic(t)``). A falsy value, such
            as an empty list, ``None`` or ``False``, means "no keywords".
        mapping: Dict of label -> list of keyword strings.

    Returns:
        The label with the most exact keyword matches. Ties go to the label
        that comes first in ``mapping``. ``"Other"`` is returned when nothing
        matches, when ``keywords`` is falsy, or when ``mapping`` is empty.
    """
    # Guard the degenerate inputs: iterating a falsy non-iterable (False/None)
    # raises TypeError, and max() over an empty mapping raises ValueError.
    if not keywords or not mapping:
        return "Other"

    keys = {word for word, _ in keywords}

    best_label, best_score = "Other", 0
    for lbl, words in mapping.items():
        score = len(keys.intersection(words))
        # Strictly greater keeps the first label on ties, like max() would.
        if score > best_score:
            best_label, best_score = lbl, score
    return best_label

# Resolve a coarse semantic label for every summarised topic...
topic_label_map = {}
for topic_id in summary_map:
    topic_keywords = topic_model.get_topic(topic_id)
    topic_label_map[topic_id] = assign_label(topic_keywords, label_dict)

# ...and tag each post with it; topics without an entry become "Other".
posts_df["semantic_label"] = posts_df["topic"].map(
    lambda topic_id: topic_label_map.get(topic_id, "Other")
)

# Visualize topic-summary trends over time
# Pivot to one row per date and one column per topic summary; cells are post counts.
time_df = posts_df.groupby(["date", "topic_summary"]).size().unstack(fill_value=0)

# Keep only the summaries with the most posts overall. The cut-off is shared
# with the plot title so the two cannot disagree. Note that the "Other" bucket
# (posts whose topic has no summary) competes in this ranking like any other column.
TOP_N_SUMMARIES = 10
top10 = time_df.sum().sort_values(ascending=False).head(TOP_N_SUMMARIES).index

plt.figure(figsize=(14, 6))
for lbl in top10:
    plt.plot(time_df.index, time_df[lbl], marker=".", label=lbl)

plt.title(f"Top-{TOP_N_SUMMARIES} BERTopic Summaries Over Time")
plt.xlabel("Date")
plt.ylabel("Post Count")
plt.xticks(rotation=45)
# Legend is anchored outside the axes (to the right) so it does not cover the lines.
plt.legend(title="Topic Summary", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()

# Save results: the annotated posts and the per-date topic table
# NOTE(review): the 2023 section earlier in the notebook writes to these same
# two file names, so this cell overwrites its outputs; confirm this is intended.
posts_df.to_csv("heatwave_posts_with_topics.csv", index=False)
topic_timeline = topic_model.topics_over_time(texts, dates)
pd.DataFrame(topic_timeline).to_csv("topics_over_time.csv", index=False)


In [None]:
# Show BERTopic's bar-chart visualisation, limited to 35 topics (top_n_topics=35).
topic_model.visualize_barchart(top_n_topics=35).show()