In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
import nltk

# SentimentIntensityAnalyzer needs the VADER lexicon on disk. Fetch it only
# when it is missing so a fresh environment can run the notebook top to bottom.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")


In [2]:
# Read the Reddit post export into a DataFrame and preview the first rows
file_path = "reddit_historical_data.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Post_ID,Platform,Hashtag,Content_Type,Likes,Comments,Engagement_Level,Title,Text,Created_At,Year,Month,Quarter,Subreddit,Keyword
0,1jf8ypk,Reddit,,Post,34,2,Low,Musk team targets nearly two dozen environment...,,2025-03-19 21:54:58,2025,3,1,environment,climate change
1,1jf7qrr,Reddit,,Post,82,10,Low,Greenpeace must pay hundreds of millions over ...,,2025-03-19 21:02:52,2025,3,1,environment,climate change
2,1jf74ky,Reddit,,Post,1,1,Low,Jury Finds Greenpeace Liable for Hundreds of M...,,2025-03-19 20:37:25,2025,3,1,environment,climate change
3,1jf6w55,Reddit,,Post,491,43,Medium,Greenpeace must pay hundreds of millions over ...,,2025-03-19 20:27:22,2025,3,1,environment,climate change
4,1jf6m89,Reddit,,Post,138,17,Medium,Jury Finds Greenpeace Liable for Hundreds of M...,,2025-03-19 20:15:55,2025,3,1,environment,climate change


In [3]:
# Use the post Title wherever Text is missing. Rows that have neither fall back
# to an empty string so the sentiment and TF-IDF steps always receive a str.
df["Text"] = df["Text"].fillna(df["Title"]).fillna("")

In [4]:
# Sentiment Analysis using VADER
sia = SentimentIntensityAnalyzer()
df["Sentiment_Score"] = df["Text"].apply(lambda text: sia.polarity_scores(text)["compound"])

# Bucket the compound score with inclusive +/-0.05 cut-offs, the same rule
# classify_sentiment applies later in this notebook.
df["Sentiment"] = "Neutral"
df.loc[df["Sentiment_Score"] >= 0.05, "Sentiment"] = "Positive"
df.loc[df["Sentiment_Score"] <= -0.05, "Sentiment"] = "Negative"


In [5]:
# Replace each categorical column with integer codes. The fitted encoder for
# every column is kept in label_encoders, keyed by column name.
categorical_columns = ("Content_Type", "Subreddit", "Keyword", "Sentiment")
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])


In [6]:
# TF-IDF features over unigrams and bigrams of the post text, capped at the
# 500 highest-ranked terms with English stop words removed.
# Note: the vectorizer is fitted on every row, before the train/test split
# performed in a later cell.
tfidf = TfidfVectorizer(
    max_features=500,
    stop_words="english",
    ngram_range=(1, 2),
)
text_tfidf = tfidf.fit_transform(df["Text"])

In [7]:
    # Topic Modeling using LDA
# Five-topic LDA fitted on the TF-IDF matrix; each row of topic_features is a
# post's topic mixture. Statements start at column 0 so the cell is also valid
# as plain Python.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
topic_features = lda.fit_transform(text_tfidf)

# Encode the engagement target and keep the fitted encoder next to the others,
# so the integer classes in later reports can be mapped back to their labels.
engagement_encoder = LabelEncoder()
df["Engagement_Level"] = engagement_encoder.fit_transform(df["Engagement_Level"])
label_encoders["Engagement_Level"] = engagement_encoder


In [8]:
# Build the model inputs: numeric/encoded metadata columns, the dense TF-IDF
# matrix and the LDA topic mixtures, concatenated column-wise.
# NOTE(review): Likes and Comments are used as predictors. If Engagement_Level
# was derived from those counts (not visible in this notebook; confirm against
# the dataset's provenance), the engagement scores below reflect that rule
# rather than real predictive power.
meta_columns = ["Likes", "Comments", "Content_Type", "Subreddit", "Keyword"]
X = np.hstack([df[meta_columns], text_tfidf.toarray(), topic_features])

# The two prediction targets
y_engagement, y_sentiment = df["Engagement_Level"], df["Sentiment"]


In [9]:
# Hold out 20% of the rows. Features and both targets are split in one call so
# their rows stay aligned; stratification follows the engagement label only.
split_arrays = train_test_split(
    X,
    y_engagement,
    y_sentiment,
    test_size=0.2,
    random_state=42,
    stratify=y_engagement,
)
X_train, X_test = split_arrays[0], split_arrays[1]
y_train_engagement, y_test_engagement = split_arrays[2], split_arrays[3]
y_train_sentiment, y_test_sentiment = split_arrays[4], split_arrays[5]

In [10]:
# Standardize features: statistics are learned from the training rows only and
# then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
# PCA for dimensionality reduction: keep 100 components, fitted on the scaled
# training rows and applied unchanged to the test rows.
# random_state is pinned like every other stochastic step in this notebook.
# It only matters if scikit-learn's automatic solver choice selects a
# randomized SVD, which depends on data shape and library version.
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [12]:
# Engagement classifier: a random forest trained on the PCA-reduced features
engagement_forest_params = {"n_estimators": 200, "max_depth": 20, "random_state": 42}
rf_engagement = RandomForestClassifier(**engagement_forest_params)
rf_engagement.fit(X_train_pca, y_train_engagement)


In [13]:
# Score the engagement model on the held-out rows
y_pred_engagement = rf_engagement.predict(X_test_pca)
engagement_accuracy = accuracy_score(y_test_engagement, y_pred_engagement)
engagement_report = classification_report(y_test_engagement, y_pred_engagement)
print("Engagement Model Accuracy:", engagement_accuracy)
print(engagement_report)

Engagement Model Accuracy: 0.9897629310344828
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       616
           1       0.99      0.99      0.99      2052
           2       0.99      0.98      0.98      1044

    accuracy                           0.99      3712
   macro avg       0.99      0.99      0.99      3712
weighted avg       0.99      0.99      0.99      3712



In [14]:
# Sentiment classifier: same forest configuration, trained on the same inputs.
# Note: the target is the VADER label computed from Text earlier in the
# notebook, and the inputs include TF-IDF features of that same Text.
sentiment_forest_params = {"n_estimators": 200, "max_depth": 20, "random_state": 42}
rf_sentiment = RandomForestClassifier(**sentiment_forest_params)
rf_sentiment.fit(X_train_pca, y_train_sentiment)


In [15]:
# Score the sentiment model on the held-out rows
y_pred_sentiment = rf_sentiment.predict(X_test_pca)
sentiment_accuracy = accuracy_score(y_test_sentiment, y_pred_sentiment)
sentiment_report = classification_report(y_test_sentiment, y_pred_sentiment)
print("Sentiment Model Accuracy:", sentiment_accuracy)
print(sentiment_report)

Sentiment Model Accuracy: 0.9771012931034483
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1394
           1       0.95      0.97      0.96      1012
           2       0.98      0.97      0.98      1306

    accuracy                           0.98      3712
   macro avg       0.97      0.98      0.98      3712
weighted avg       0.98      0.98      0.98      3712



In [16]:
# Imports for the standalone title-sentiment summary below (all three names are
# also imported by the first cell of the notebook).
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

# Make sure the VADER lexicon is available locally
nltk.download("vader_lexicon")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [17]:
# Load dataset
# Re-reads the CSV and rebinds `df`: the frame modified by the earlier cells
# (filled Text, label-encoded columns) is replaced by a fresh copy here.
file_path = "reddit_historical_data.csv"  # Update this path if needed
df = pd.read_csv(file_path)


In [18]:
# Initialize Sentiment Intensity Analyzer
# (rebinds `sia`; the VADER cell near the top of the notebook creates one the same way)
sia = SentimentIntensityAnalyzer()

In [19]:
# Score every post title with VADER and keep the compound value. Titles are
# cast to str first so each entry reaches the analyzer as text.
titles_as_text = df["Title"].astype(str)
df["sentiment_score"] = titles_as_text.apply(lambda title: sia.polarity_scores(title)["compound"])


In [20]:
# Classify sentiment based on compound score
def classify_sentiment(score: float, threshold: float = 0.05) -> str:
    """Map a compound polarity score to a sentiment label.

    Args:
        score: Compound sentiment score to classify.
        threshold: Half-width of the neutral band. Scores at or above
            ``threshold`` are positive and scores at or below ``-threshold``
            are negative. Defaults to 0.05.

    Returns:
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``. The positive test is
        evaluated first, so with ``threshold=0`` a score of 0 is positive.
    """
    if score >= threshold:
        return "Positive"
    if score <= -threshold:
        return "Negative"
    # Everything strictly inside the band, and NaN, is neutral.
    return "Neutral"
# Label each row from its compound score
df["Sentiment"] = df["sentiment_score"].map(classify_sentiment)

In [21]:
# Tally rows per sentiment label and print the breakdown
sentiment_counts = df["Sentiment"].value_counts()
print("Sentiment Analysis Results:", sentiment_counts, sep="\n")


Sentiment Analysis Results:
Sentiment
Negative    6930
Positive    5865
Neutral     5765
Name: count, dtype: int64


In [22]:

# The overall outcome is the label with the highest count
final_sentiment = sentiment_counts.idxmax()
print(f"\nFinal Sentiment Outcome: {final_sentiment}")



Final Sentiment Outcome: Negative
