# In [None]:
import os
import re
import pandas as pd
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from collections import Counter

# Tokens dropped during preprocessing: common function words, plus the
# character names that appear in the two scripts.
STOPWORDS = {
    "the", "to", "and", "of", "in", "his", "it", "is", "you",
    "her", "she", "he", "i", "me", "my", "we", "us",
}
CHARACTER_NAMES = {
    "tony", "steve", "natasha", "loki", "thor", "fury", "banner", "hulk",
    "stark", "barton", "wanda", "clint", "peter", "ganush", "ellen", "jack",
    "rham", "jas", "shaun", "milo", "trudy", "phil",
}

# Hand-written topic labels, assigned to LDA topics by position:
# entries 0-4 are meant for Avengers, entries 5-9 for Drag Me to Hell.
TOPIC_LABELS = [
    # Avengers
    "Superpowers & Science",
    "Combat & Military",
    "Leadership & Strategy",
    "Technology & Weapons",
    "Tactical Planning",
    # Drag Me to Hell
    "Supernatural & Curses",
    "Possession & Haunting",
    "Fear & Suspense",
    "Psychological Horror",
    "Demonic Entities",
]

# Text normalisation and tokenisation
def preprocess_text(text):
    """Normalise *text* and return its tokens as a list of strings.

    The text is lower-cased, then URLs, punctuation and standalone numbers
    are deleted (replaced by the empty string, not by a space). The result
    is split on whitespace, and any token found in ``STOPWORDS`` or
    ``CHARACTER_NAMES`` is dropped.
    """
    cleaned = text.lower()
    # Order matters: URLs must go before punctuation stripping mangles them.
    for pattern in (
        r"https?://\S+|www\.\S+",  # URLs
        r"[^\w\s]",  # anything that is neither a word character nor whitespace
        r"\b\d+\b",  # digit-only runs delimited by word boundaries
    ):
        cleaned = re.sub(pattern, "", cleaned)
    return [
        token
        for token in cleaned.split()
        if not (token in STOPWORDS or token in CHARACTER_NAMES)
    ]

# Load dataset
def load_documents(folder):
    """Read every regular file directly inside *folder* as UTF-8 text.

    Entries are visited in sorted name order, so the returned list is
    deterministic. Anything that is not a regular file is skipped.

    Args:
        folder: Path of the directory holding the documents.

    Returns:
        A list with the full text of each file, one string per file.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # os.listdir also returns sub-directories (e.g. a notebook's
        # ".ipynb_checkpoints"); open() on those would raise, so skip them.
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding="utf-8") as f:
            documents.append(f.read())
    return documents

# Get the most frequent words for one category
def get_top_words(docs, top_n=10):
    """Return the most frequent preprocessed tokens across *docs*.

    Args:
        docs: Iterable of raw document strings; each one is passed through
            ``preprocess_text`` before counting.
        top_n: How many words to return. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        A list of at most *top_n* words, most frequent first.
    """
    word_counts = Counter()
    for doc in docs:
        word_counts.update(preprocess_text(doc))
    return [word for word, _ in word_counts.most_common(top_n)]

# Fit the LDA topic model
def run_lda(docs, num_topics=10):
    """Tokenise *docs*, build a bag-of-words corpus and train an LDA model.

    Args:
        docs: Sequence of raw document strings.
        num_topics: Number of topics requested from the model.

    Returns:
        A ``(lda_model, dictionary, corpus)`` tuple: the trained gensim
        model, the (filtered) gensim dictionary, and the list of
        bag-of-words vectors, one per input document.
    """
    token_lists = list(map(preprocess_text, docs))

    vocabulary = corpora.Dictionary(token_lists)
    # Prune the vocabulary before vectorising.
    # NOTE(review): no_below=5 needs a token to occur in several documents;
    # on a very small corpus this may prune most of the vocabulary - confirm.
    vocabulary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

    bow_corpus = list(map(vocabulary.doc2bow, token_lists))

    # Fixed random_state keeps the fitted topics reproducible between runs.
    model = gensim.models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=vocabulary,
        passes=15,
        random_state=42,
    )
    return model, vocabulary, bow_corpus

# Extract topic words per category
def get_topic_words(lda_model):
    """Split the model's topics into two ``{label: top words}`` dicts.

    The topics returned by ``show_topics`` are labelled by position using
    ``TOPIC_LABELS``: the first five go into the Avengers dict, every later
    one into the horror dict. A topic whose position has no entry in
    ``TOPIC_LABELS`` (a model with more topics than labels) gets a generic
    ``"Topic <id>"`` label instead of raising ``IndexError``.

    NOTE(review): labels are tied to topic position only; nothing here
    checks that a topic's content matches its label or its film.

    Args:
        lda_model: Object exposing gensim's ``show_topics`` method.

    Returns:
        ``(avengers_topics, horror_topics)``, each mapping a label to the
        list of that topic's top 10 words.
    """
    # num_topics=-1 requests every topic; formatted=False yields
    # (topic_id, [(word, weight), ...]) pairs instead of strings.
    topics = lda_model.show_topics(num_topics=-1, num_words=10, formatted=False)

    avengers_topics = {}
    horror_topics = {}
    for position, (topic_id, words) in enumerate(topics):
        if position < len(TOPIC_LABELS):
            label = TOPIC_LABELS[position]
        else:
            # More topics than hand-written labels: fall back to the id.
            label = "Topic {}".format(topic_id)
        target = avengers_topics if position < 5 else horror_topics
        target[label] = [word for word, _ in words]
    return avengers_topics, horror_topics

# Script entry point: frequency summary first, then LDA over both corpora
if __name__ == "__main__":
    # Read the two corpora from their output folders.
    avengers_docs, horror_docs = map(
        load_documents, ("output/avengers", "output/drag_me_to_hell")
    )

    # Most frequent words, computed separately for each film.
    avengers_top_words, horror_top_words = map(
        get_top_words, (avengers_docs, horror_docs)
    )

    print("\nTop 10 Words for Avengers:\n", avengers_top_words)
    print("\nTop 10 Words for Horror:\n", horror_top_words)

    # One model is trained on the concatenation, Avengers documents first.
    all_docs = avengers_docs + horror_docs
    lda_model, dictionary, corpus = run_lda(all_docs, num_topics=10)

    # Label the fitted topics and split them per film.
    avengers_topics, horror_topics = get_topic_words(lda_model)

    print("\nAvengers Topic Words:\n", avengers_topics)
    print("\nHorror Topic Words:\n", horror_topics)
