In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud

In [None]:
# Restore the three persisted artifacts in a single pass, in the same order
# as the paths below. Judging by the file names these are the classifier,
# the TF-IDF vectorizer and the SVD transformer.
# NOTE(review): joblib.load unpickles arbitrary objects — only point this at
# artifact files you trust.
model, tfidf, svd = (
    joblib.load(artifact_path)
    for artifact_path in (
        "outputs/svm_model.joblib",
        "outputs/tfidf_vectorizer.joblib",
        "outputs/svd_transformer.joblib",
    )
)

In [None]:
# Read the unseen comments and discard rows without comment text, because
# every later step reads the 'short_comment' column.
df = (
    pd.read_csv("data/new_unseen_comments.csv")
    .dropna(subset=['short_comment'])
)

# Built once at definition time. Previously stopwords.words('english') was
# called for every single token and membership was tested against a list;
# a frozenset gives the same membership answers with O(1) lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess_text(text):
    """Normalise a raw comment into a lowercase, stopword-free string.

    The input is coerced to ``str``, every character that is not an ASCII
    letter or whitespace is removed, the result is lowercased, split into
    word tokens, filtered against the NLTK English stopword list, and
    re-joined with single spaces.

    Parameters
    ----------
    text : Any
        Raw comment; non-string values are converted with ``str()``.

    Returns
    -------
    str
        Space-separated tokens that survived filtering (may be empty).
    """
    # Punctuation and digits are stripped *before* stopword filtering, so a
    # contraction such as "don't" becomes "dont" at this point.
    # NOTE(review): this presumably has to mirror the preprocessing used when
    # the vectorizer was fitted — confirm before changing the pattern.
    letters_only = re.sub(r'[^A-Za-z\s]', '', str(text)).lower()
    tokens = _WORD_TOKENIZER.tokenize(letters_only)
    kept = [token for token in tokens if token not in _ENGLISH_STOPWORDS]
    return ' '.join(kept)

# Derive the text column used for vectorization by running every raw
# comment through preprocess_text.
df['cleaned_comment'] = df['short_comment'].map(preprocess_text)

In [None]:
# Vectorize the cleaned text with the loaded vectorizer, then pass the result
# through the loaded SVD transformer; the reduced matrix is what the
# classifier is called on later.
# NOTE(review): both objects are only applied here (transform, never fit), so
# they are assumed to have been fitted before being saved — confirm.
cleaned_comments = df['cleaned_comment']
X_tfidf = tfidf.transform(cleaned_comments)
X_reduced = svd.transform(X_tfidf)

In [None]:
# Hard class prediction for every comment.
predicted_labels = model.predict(X_reduced)
df['predicted_label'] = predicted_labels

# Score for every comment: use predict_proba when the estimator exposes it,
# otherwise fall back to the raw decision_function output.
# NOTE(review): the two branches are on different scales (probability vs.
# decision value), so 'confidence_score' is only comparable within one run.
if hasattr(model, 'predict_proba'):
    # Column index 1 — presumably the "support" class; verify against
    # model.classes_.
    confidence_scores = model.predict_proba(X_reduced)[:, 1]
else:
    confidence_scores = model.decision_function(X_reduced)
df['confidence_score'] = confidence_scores

In [None]:
# Persist the scored comments; the DataFrame index is not written.
df.to_csv(path_or_buf="outputs/predicted_unseen_data.csv", index=False)

In [None]:
# Bar chart of how many comments received each predicted label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='predicted_label', ax=ax)
ax.set_title("Predicted Sentiment Distribution")
ax.set_xlabel("Predicted Label (1 = Support, 0 = Oppose)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

In [None]:
# Gather the raw text of every comment predicted as supportive (label 1).
# .loc selects rows and column in one step instead of chained indexing.
supportive_comments = df.loc[df['predicted_label'] == 1, 'short_comment'].dropna().astype(str)
supportive_text = ' '.join(supportive_comments)

try:
    wordcloud_pos = WordCloud(width=800, height=400, background_color='white').generate(supportive_text)
except ValueError:
    # WordCloud raises ValueError when the text yields no words to draw,
    # e.g. when no comment was predicted as supportive. Skip the figure
    # rather than aborting the rest of the notebook.
    wordcloud_pos = None
    print("No words available for the supportive word cloud; skipping plot.")
else:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud_pos, interpolation='bilinear')
    plt.axis('off')
    plt.title("Word Cloud - Predicted Supportive Comments")
    plt.show()

In [None]:
# Gather the raw text of every comment predicted as non-supportive (label 0).
# .loc selects rows and column in one step instead of chained indexing.
nonsupportive_comments = df.loc[df['predicted_label'] == 0, 'short_comment'].dropna().astype(str)
nonsupportive_text = ' '.join(nonsupportive_comments)

try:
    wordcloud_neg = WordCloud(width=800, height=400, background_color='white').generate(nonsupportive_text)
except ValueError:
    # WordCloud raises ValueError when the text yields no words to draw,
    # e.g. when no comment was predicted as non-supportive. Skip the figure
    # rather than aborting the rest of the notebook.
    wordcloud_neg = None
    print("No words available for the non-supportive word cloud; skipping plot.")
else:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud_neg, interpolation='bilinear')
    plt.axis('off')
    plt.title("Word Cloud - Predicted Non-Supportive Comments")
    plt.show()

In [None]:
# Histogram (30 bins, with a KDE overlay) of the per-comment scores stored
# in 'confidence_score'.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['confidence_score'], bins=30, kde=True, ax=ax)
ax.set_title("Prediction Confidence Distribution")
ax.set_xlabel("Confidence Score")
fig.tight_layout()
plt.show()