In [None]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

# Download NLTK resources (run once).
# 'punkt_tab' is what word_tokenize loads on NLTK >= 3.8.2, which no longer
# reads the pickled 'punkt' models; 'punkt' is still fetched so the script
# keeps working on older NLTK releases.
for _resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(_resource)

# Load your comments data. The rest of the script reads the "comment_text"
# and "sentiment" columns from this frame.
df = pd.read_csv("comments_data_updated.csv")

# English stop words, held in a set so the per-token membership test in
# preprocess() is a constant-time lookup.
stop_words = set(stopwords.words("english"))

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every preprocess() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Convert one raw comment into a list of lowercase word tokens.

    Processing order: lowercase the text, remove every whitespace-delimited
    run that starts with "http", delete ASCII punctuation, tokenize with
    ``word_tokenize``, then keep only purely alphabetic tokens that are not
    in the module-level ``stop_words`` set.

    Args:
        text: The comment text. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty comment.

    Returns:
        A list of token strings; empty when the input is not a string or
        nothing survives filtering.

    NOTE(review): punctuation is removed before the stop-word check, so a
    contraction such as "don't" is tokenized as "dont". Whether such tokens
    are filtered depends on the contents of ``stop_words`` -- confirm this
    is the intended behaviour.
    """
    # Missing or non-text cells would otherwise crash on .lower() and abort
    # the whole DataFrame.apply() pass.
    if not isinstance(text, str):
        return []

    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = text.translate(_PUNCT_TABLE)
    tokens = word_tokenize(text)
    return [w for w in tokens if w.isalpha() and w not in stop_words]

# Tokenize every comment; each cell of "tokens" holds a list of words.
df["tokens"] = df["comment_text"].apply(preprocess)

# Count most common words by sentiment.
# explode() emits NaN for rows whose token list is empty; dropna() removes
# those so NaN is never counted (and exported) as if it were a word.
positive_words = df.loc[df["sentiment"] == "positive", "tokens"].explode().dropna()
negative_words = df.loc[df["sentiment"] == "negative", "tokens"].explode().dropna()

pos_freq = Counter(positive_words)
neg_freq = Counter(negative_words)

# Create final DataFrame: up to 100 most frequent words per sentiment,
# positive rows first, then negative rows.
word_data = []
for sentiment, freq in (("positive", pos_freq), ("negative", neg_freq)):
    word_data.extend(
        {"word": word, "frequency": count, "sentiment": sentiment}
        for word, count in freq.most_common(100)
    )

# Explicit columns keep the CSV header present even when no words were found.
word_df = pd.DataFrame(word_data, columns=["word", "frequency", "sentiment"])
word_df.to_csv("sentiment_wordcloud.csv", index=False)
print("✅ Saved as sentiment_wordcloud.csv")