In [64]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# nltk.download('stopwords')
# nltk.download('wordnet')

# 1. Load your dataset (assumes a CSV with a 'text' column)
df = pd.read_csv('../data/merged_data.csv')
texts = df['Tweet'].dropna().tolist()

In [65]:
# For demonstration, replace the above with your actual path.

def preprocess(text, lemmatizer, stop_words):
    # Lowercase
    text = text.lower()
    # Remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+|#", "", text)
    # Remove non-alphabetic characters
    text = re.sub(r"[^a-z\s]", "", text)
    # Tokenize
    tokens = text.split()
    # Remove stopwords and short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

base_stops = stopwords.words('english')
custom_stops = ['climate', 'change', 'amp', 'rt', 'tweet']
base_stops.extend(custom_stops)
lemmatizer = WordNetLemmatizer()

# Preprocess all tweets
processed_texts = [preprocess(t, lemmatizer, base_stops) for t in texts]

# 2. Vectorize
vectorizer = CountVectorizer(max_df=0.95, min_df=10, max_features=1500, ngram_range=(1,2))
dt_matrix = vectorizer.fit_transform(processed_texts)

# 3. Train LDA
n_topics = 3  # at least 3 topics
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=32,
                                learning_method='online',
                                random_state=42,
                                doc_topic_prior=0.1,   # lower alpha for more distinct topics
                                topic_word_prior=0.01  # lower beta for sharper word distributions
                            )
lda.fit(dt_matrix)

# 4. Display topics
def display_topics(model, feature_names, no_top_words=3):
    for idx, topic in enumerate(model.components_):
        # get feature indices sorted by weight (highest first)
        sorted_indices = topic.argsort()[:-no_top_words*2 - 1:-1]

        unique_terms = []
        for i in sorted_indices:
            term = feature_names[i]
            if term not in unique_terms:
                unique_terms.append(term)
            if len(unique_terms) == no_top_words:
                break

        print(f"Topic {idx + 1}: {' '.join(unique_terms)}")

tf_feature_names = vectorizer.get_feature_names_out()
display_topics(lda, tf_feature_names, 3)

# Replace comments and demo placeholders with your actual data loading and processed_texts list

# Tips:
# - Tune max_df/min_df to filter out very common or rare words.
# - Experiment with n_components >= 3.
# - Increase max_iter for better convergence.
# - Consider using gensim for distributed or larger corpora.

Topic 1: level sea rising
Topic 2: crisis global environmental
Topic 3: carbon footprint carbon footprint
