In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd
import string
import re
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB  # or use SVM if preferred
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import nltk
# Fetch the NLTK data this notebook relies on: the Punkt sentence/word
# tokenizer models for word_tokenize and the stop-word lists.
# NLTK >= 3.8.2 loads the "punkt_tab" resource instead of the pickled "punkt"
# models, so both are requested to keep word_tokenize working on old and new
# NLTK releases alike.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Load comments: read the CSV export, parse the "timestamp" column into
# datetimes and derive each comment's ISO week number from it.
df = pd.read_csv("comments_data.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"]),
    # Evaluated after the line above, so "timestamp" is already datetime here.
    week=lambda frame: frame["timestamp"].dt.isocalendar()["week"],
)

# --- Step 1: Preprocess text ---
stop_words = set(stopwords.words("english"))

# Matches URLs (http... / www...) and every character that is neither an
# ASCII letter nor whitespace. Compiled once because clean_text runs on
# every comment.
_NOISE_PATTERN = re.compile(r"http\S+|www\S+|[^a-zA-Z\s]")

def clean_text(text):
    """Normalize one comment for TF-IDF vectorization.

    The text is lower-cased, URLs and all non-letter characters are removed,
    the remainder is tokenized with ``word_tokenize`` and English stop words
    are dropped.

    Args:
        text: Raw comment text. Non-string values (e.g. the float NaN that
            pandas uses for an empty CSV cell) are treated as empty comments.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when nothing is
        left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on .lower() and abort the apply.
        return ""
    stripped = _NOISE_PATTERN.sub("", text.lower())
    return " ".join(
        token for token in word_tokenize(stripped) if token not in stop_words
    )

df["cleaned_text"] = df["comment_text"].apply(clean_text)

# --- Step 2: Use TextBlob for pseudo-labeling ---
def get_sentiment(text):
    """Return a binary pseudo-label for one raw comment.

    Args:
        text: Raw (uncleaned) comment text. Non-string values such as NaN
            from an empty CSV cell are accepted and labelled "negative".

    Returns:
        "positive" when TextBlob's polarity is strictly greater than 0,
        otherwise "negative". Note that neutral text (polarity exactly 0)
        therefore also lands in the "negative" class.
    """
    if not isinstance(text, str):
        # TextBlob rejects non-string input; a missing comment carries no
        # positive polarity, so it goes into the non-positive bucket.
        return "negative"
    return "positive" if TextBlob(text).sentiment.polarity > 0 else "negative"

df["sentiment"] = df["comment_text"].apply(get_sentiment)

# --- Step 3: Split, then vectorize using TF-IDF ---
X = df["cleaned_text"]
y = df["sentiment"]

# Split the raw texts BEFORE fitting the vectorizer. Fitting TF-IDF on the
# whole corpus would let test documents shape the vocabulary and IDF weights,
# which leaks test information and inflates the report below.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary on train only
X_test = vectorizer.transform(X_test_text)        # reuse it unchanged for test

# --- Step 4: Train Classifier ---
# Choose one:
model = MultinomialNB()
# Alternative: from sklearn.svm import SVC; model = SVC(kernel="linear")

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

# --- Step 5: Predict all comments ---
# Transform the full corpus once with the train-fitted vectorizer; X_vect is
# kept as a name so later cells can still use the full feature matrix.
X_vect = vectorizer.transform(X)
df["predicted_sentiment"] = model.predict(X_vect)

# --- Step 6: Weekly sentiment trend per channel ---
# ISO week numbers restart every year, so the ISO year is part of the key;
# grouping on the week alone would merge the same week number of different
# years into a single row.
iso_year = df["timestamp"].dt.isocalendar().year
trend = (
    df.assign(iso_year=iso_year)
    .groupby(["channel_id", "iso_year", "week", "predicted_sentiment"])
    .size()
    .unstack(fill_value=0)
)

# Guarantee both label columns exist even if the model never predicted one
# of them, so the CSV schema does not depend on the data.
for label in ("negative", "positive"):
    if label not in trend.columns:
        trend[label] = 0

# Comments per (channel, year, week); the added zero columns do not alter it.
trend["total"] = trend.sum(axis=1)
trend["positive_pct"] = (trend["positive"] / trend["total"]) * 100
trend["negative_pct"] = (trend["negative"] / trend["total"]) * 100

# Save to CSV
trend.reset_index().to_csv("weekly_sentiment_trend.csv", index=False)
print("✅ Saved to weekly_sentiment_trend.csv")
