In [None]:
!pip install openai langchain textblob spacy sentence-transformers umap-learn hdbscan pandas matplotlib scikit-learn python-dotenv
!python -m spacy download en_core_web_sm

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap
import hdbscan
from dotenv import load_dotenv
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
import spacy

# Load environment variables (python-dotenv: from a local .env file, if present)
# and read the OpenAI key; the value is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# spaCy English pipeline, used by clean_text() for stop-word removal and lemmas.
nlp = spacy.load("en_core_web_sm")

# Shared chat-model client used for the per-cluster briefs and the final report.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
# Hand-written sample of resident comments used as the demo corpus.
sample_comments = [
    "The MBTA delays are getting worse every day. No one even gives updates anymore.",
    "Housing prices are insane. The city needs to regulate rent increases.",
    "Healthcare access for immigrants is so complicated. We need multilingual support.",
    "Public parks are great but not safe at night. Lighting and patrols could help.",
    "Our schools need more special education staff, not just technology upgrades.",
]

# Single-column frame ("comment"); later cells add derived columns to it.
data = {"comment": sample_comments}
df = pd.DataFrame(data)
df.head()

In [None]:
def clean_text(text):
    """Normalise one comment into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``; stop words and non-alphabetic tokens are dropped and the remaining
    tokens are replaced by their lemma.

    Args:
        text: Raw comment. Non-string values (``None``, NaN from a missing
            DataFrame cell) are treated as an empty comment.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Missing cells reach us as None / float NaN; calling .lower() on them
    # would raise AttributeError and abort the whole .apply().
    if not isinstance(text, str):
        return ""

    doc = nlp(text.lower())
    return " ".join(tok.lemma_ for tok in doc if tok.is_alpha and not tok.is_stop)

# Add the lemmatised version of every comment as a new column, then preview.
df["clean_text"] = df["comment"].map(clean_text)
df.head()

In [None]:
# Sentence embeddings for the cleaned comments.
# encode() is documented for a string or a list of strings; handing it a pandas
# Series makes it index the Series by integer *label*, which only works while
# the frame still has its default 0..n-1 index. Pass a plain list instead.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df["clean_text"].tolist(), show_progress_bar=True)

# 2-D projection, used only for the scatter plot further down.
reducer = umap.UMAP(random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# Clustering runs on the full-dimensional embeddings, not on the 2-D projection.
# NOTE(review): hdbscan marks points it cannot assign with the label -1 -
# confirm downstream cells treat that label as "noise" rather than a topic.
clusterer = hdbscan.HDBSCAN(min_cluster_size=2, gen_min_span_tree=True)
clusters = clusterer.fit_predict(embeddings)
df["cluster"] = clusters
df.head()

In [None]:
def get_sentiment(text):
    """Return the TextBlob polarity score of a comment.

    Args:
        text: Raw comment. Non-string values (``None``, NaN from a missing
            DataFrame cell) are scored as neutral.

    Returns:
        float: ``TextBlob(text).sentiment.polarity``, or ``0.0`` when ``text``
        is not a string.
    """
    # TextBlob rejects non-string input with a TypeError; score missing
    # comments as neutral so one empty cell does not abort the whole column.
    if not isinstance(text, str):
        return 0.0

    return TextBlob(text).sentiment.polarity

def get_emotion_label(score, threshold=0.2):
    """Map a polarity score to a coarse emotion label.

    Args:
        score: Sentiment polarity value.
        threshold: Half-width of the neutral band. Scores strictly above
            ``threshold`` are "Hopeful", scores strictly below ``-threshold``
            are "Frustrated". Defaults to 0.2, the value previously hard-coded.

    Returns:
        str: "Hopeful", "Frustrated" or "Neutral". Scores exactly on a
        boundary, and NaN, fall through to "Neutral".
    """
    if score > threshold:
        return "Hopeful"
    if score < -threshold:
        return "Frustrated"
    return "Neutral"

# Score the raw (uncleaned) comments, then bucket each score into a label.
sentiment_scores = df["comment"].map(get_sentiment)
df["sentiment_score"] = sentiment_scores
df["dominant_emotion"] = sentiment_scores.map(get_emotion_label)
df.head()

In [None]:
def generate_insight(topic_texts, cluster_id):
    """Ask the chat model for a JSON-style policy brief on one cluster of comments.

    Args:
        topic_texts: Iterable of comment strings belonging to the cluster; they
            are joined with single spaces before being placed in the prompt.
        cluster_id: Cluster label, interpolated into the JSON template shown to
            the model.

    Returns:
        The text of the model's reply (``resp.content``). The prompt requests
        JSON, but the reply is returned as-is: it is neither parsed nor
        validated here.
    """
    combined_text = " ".join(topic_texts)
    prompt = f"""
    You are a civic policy analyst AI. Analyze the following public comments and create a brief
    summarizing community sentiment, main issues, and an actionable recommendation.

    Comments:
    {combined_text}

    Return JSON format:
    {{
      "cluster_id": "{cluster_id}",
      "summary": "...",
      "dominant_emotion": "...",
      "top_issues": ["..."],
      "recommended_action": "...",
      "equity_note": "..."
    }}
    """
    # Use the explicit `invoke` entry point: calling the model object directly
    # (`llm([...])`) is deprecated in LangChain.
    resp = llm.invoke([HumanMessage(content=prompt)])
    return resp.content

# One LLM brief per distinct cluster label, visited in the order the labels
# are returned by df["cluster"].unique().
insights = []
for cluster_label in df["cluster"].unique():
    in_cluster = df["cluster"] == cluster_label
    cluster_comments = df.loc[in_cluster, "comment"].tolist()
    insights.append(generate_insight(cluster_comments, cluster_label))

In [None]:
# Print every brief under the cluster label it was generated for.
# `insights` was filled by iterating df["cluster"].unique(), so zipping against
# that same sequence restores the (label, brief) pairing. The position in the
# list is not the label: labels follow order of appearance and may include -1.
for cluster_id, text in zip(df["cluster"].unique(), insights):
    print(f"\n=== Civic Cluster {cluster_id} ===")
    print(text)

In [None]:
# Scatter of the 2-D UMAP projection, one point per comment, coloured by polarity.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=df["sentiment_score"],
    cmap="coolwarm",
    s=80,
)
fig.colorbar(points, ax=ax, label="Sentiment Polarity")
ax.set_title("Civic Sentiment Map – PulsePolis")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
plt.show()

In [None]:
# Write the enriched frame (comments plus derived columns) to disk, without the index.
results_file = "pulsepolis_results.csv"
df.to_csv(results_file, index=False)
print(f"Results saved as {results_file}")

In [None]:
# City-wide report: every row of `df` (all columns present at this point) is
# serialised as a list of dicts and embedded in the prompt.
prompt = f"""
You are an AI policy summarizer.
Based on these civic discussions, produce a city-wide report summarizing key concerns,
community emotions, and 3 actionable policy recommendations.

Data:
{df.to_dict(orient='records')}
"""
# Use the explicit `invoke` entry point: calling the model object directly
# (`llm([...])`) is deprecated in LangChain.
report = llm.invoke([HumanMessage(content=prompt)])
print(report.content)