# CS 584 Final Project 
# DETECTING DISCUSSION TOPICS AND SENTIMENT IN REDDIT THREADS <br>


#### Name: Uros Nikolic and Sam Preston
#### Stevens ID: 20017063,  

In [None]:
# Library imports

%matplotlib inline
import os
import random
import sys
from typing import List, Tuple, Union, Dict

import evaluate
import numpy as np
import pandas as pa
import torch
import torch.nn as nn 
from datasets import load_dataset
from matplotlib import pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from tokenizers import Tokenizer
from torchinfo import summary

In [None]:
def print_line(*args):
    """Overwrite the current output line with the given values.

    Each argument is converted with ``str`` and the pieces are joined by
    single spaces. The text is written after a carriage return and without
    a trailing newline, so successive calls redraw the same line.
    """
    message = ' '.join(map(str, args))
    print(f'\r{message}', end='')

In [None]:
# Important pip commands for installing some of the necessary libraries.

#!pip install torch
#!pip uninstall torch torchvision torchaudio

#!pip install -U datasets

#!pip install tokenizers

#!pip install torchinfo

#!pip install evaluate
#!pip install sacrebleu

In [None]:
# Check whether a CUDA device is usable on this machine.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# torch.cuda.get_device_name(0) raises when no CUDA device is present, which
# would stop a full "Run All" on CPU-only machines. Only query the device
# name once availability has been confirmed.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

In [None]:
# Data extraction: load every Reddit listing (posts and their comments) and
# build a single (text, subreddit) frame from the posts.

controversialPosts = pa.read_csv('data/controversial_posts.csv')
controversialPostComments = pa.read_csv('data/controversial_posts_comments.csv')
hotPost = pa.read_csv('data/hot_posts.csv')
hotPostComments = pa.read_csv('data/hot_post_comments.csv')
newPost = pa.read_csv('data/new_posts.csv')
newPostComments = pa.read_csv('data/new_post_comments.csv')
topPost = pa.read_csv('data/top_posts.csv')
topPostComments = pa.read_csv('data/top_posts_comments.csv')

# Stack the four post listings into one frame. The frames must be referenced
# by the names they were loaded under above.
# NOTE(review): the same post may appear in more than one listing; consider
# de-duplicating before the train/test split - confirm against the data.
posts_df = pa.concat([controversialPosts, hotPost, newPost, topPost], ignore_index=True)

posts_df = posts_df[['title', 'body', 'subreddit']]

# Remove unlabeled posts BEFORE filling missing values: once NaN has been
# replaced by '', notna() can no longer detect a missing subreddit.
posts_df = posts_df[posts_df['subreddit'].notna()]

# Missing titles/bodies become empty strings so they can be concatenated.
posts_df = posts_df.fillna('')
posts_df['text'] = posts_df['title'] + ' ' + posts_df['body']
posts_df = posts_df[['text', 'subreddit']]

# Drop posts whose combined text is empty or whitespace only.
posts_df = posts_df[posts_df['text'].str.strip() != '']
print(posts_df['subreddit'].value_counts().head())


In [None]:
# Train/test split and TF-IDF features.
# Inputs are the combined post texts; targets are the subreddit names.

X, y = posts_df['text'], posts_df['subreddit']

# Hold out 20% of the posts, keeping the subreddit proportions the same in
# both partitions (stratified on y) and fixing the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
# Compare several classifiers on multi-class subreddit (topic) classification.

# The subreddit target is multi-class, so precision/recall/F1 need an explicit
# averaging mode; scikit-learn's default ('binary') raises ValueError here.
# 'macro' weights every subreddit equally regardless of its size.
AVERAGE = "macro"

models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "NaiveBayes":         MultinomialNB(),
    "LinearSVM":          LinearSVC(max_iter=2000, random_state=42),
    "RandomForest":       RandomForestClassifier(n_estimators=100, random_state=42),
    "MLP":                MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

results = []
predictions = {}  # model name -> test-set predictions, reused for the final report

for name, clf in models.items():
    print(f"\n>>> Training {name}...")
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
    predictions[name] = y_pred
    acc  = accuracy_score(y_test, y_pred)
    # zero_division=0 scores a class that was never predicted as 0 instead of
    # emitting an UndefinedMetricWarning for it.
    prec = precision_score(y_test, y_pred, average=AVERAGE, zero_division=0)
    rec  = recall_score(y_test, y_pred, average=AVERAGE, zero_division=0)
    f1   = f1_score(y_test, y_pred, average=AVERAGE, zero_division=0)
    print(f"{name} — Acc: {acc:.3f}, Prec: {prec:.3f}, Rec: {rec:.3f}, F1: {f1:.3f}")
    results.append((name, acc, prec, rec, f1))

# aggregate into df (pandas is imported under the alias `pa` in this notebook)
res_df = pa.DataFrame(results, columns=["Model","Accuracy","Precision","Recall","F1"])
print("\nSummary:\n", res_df)

# detailed per-class report for the model with the highest F1
best = res_df.sort_values("F1", ascending=False).iloc[0]["Model"]
print(f"\nDetailed classification report for {best}:\n")
print(classification_report(y_test, predictions[best], zero_division=0))

In [None]:
# Results



In [None]:
# Final conclusion