# CS 584 Final Project 
# DETECTING DISCUSSION TOPICS AND SENTIMENT IN REDDIT THREADS <br>


#### Names: Uros Nikolic and Sam Preston
#### Stevens ID: 20017063,  

In [2]:
# Library imports

%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pa
import torch
import torch.nn as nn 
import sys
import os
import random
import evaluate
from torchinfo import summary
from typing import List, Tuple, Union, Dict
from datasets import load_dataset
from tokenizers import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

In [3]:
def print_line(*args):
    """Print the given values on the current line, starting at column zero.

    Each argument is converted with ``str`` and the results are joined with
    single spaces. The text is prefixed with a carriage return and written
    without a trailing newline.
    """
    message = ' '.join(map(str, args))
    print(f'\r{message}', end='')

In [4]:
# Important pip commands for installing some of the necessary libraries.

#!pip install torch
#!pip uninstall torch torchvision torchaudio

#!pip install -U datasets

#!pip install tokenizers

#!pip install torchinfo

#!pip install evaluate
#!pip install sacrebleu

In [5]:
# Report whether CUDA is usable on this machine and, if so, which GPU is device 0.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when CUDA is present; the call raises otherwise.
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA not available; running on CPU")

True
NVIDIA GeForce RTX 4070 Laptop GPU


In [6]:
# Data extraction

# Load the post and comment CSVs for each of the four Reddit listing types.
controversialPosts = pa.read_csv('data/controversial_posts.csv')
controversialPostComments = pa.read_csv('data/controversial_posts_comments.csv')
hotPost = pa.read_csv('data/hot_posts.csv')
hotPostComments = pa.read_csv('data/hot_posts_comments.csv')
newPost = pa.read_csv('data/new_posts.csv')
newPostComments = pa.read_csv('data/new_posts_comments.csv')
topPost = pa.read_csv('data/top_posts.csv')
topPostComments = pa.read_csv('data/top_posts_comments.csv')

# Stack the four post frames into one. Use the frames loaded above: the names
# previously used here (controversial, hot, new, top) are never defined in this
# notebook, so the cell raised NameError on a fresh kernel.
posts_df = pa.concat([controversialPosts, hotPost, newPost, topPost], ignore_index=True)

# Drop rows without a subreddit label BEFORE filling NaNs; once fillna('') has
# run, notna() is True for every row and filters nothing.
posts_df = posts_df[posts_df['subreddit'].notna()]

# Build a single text field from title and body (missing values become '').
posts_df = posts_df[['title', 'body', 'subreddit']].fillna('')
posts_df['text'] = posts_df['title'] + ' ' + posts_df['body']
posts_df = posts_df[['text', 'subreddit']]

# Discard posts whose combined text is empty or whitespace only.
posts_df = posts_df[posts_df['text'].str.strip() != '']
print(posts_df['subreddit'].value_counts().head())


subreddit
RussiaUkraineWar2022    3792
Name: count, dtype: int64


In [7]:
# Code Body

# Inputs are the combined title/body text; labels are the subreddit column.
X, y = posts_df['text'], posts_df['subreddit']

# Hold out 20% of the rows, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training split only, then apply it to the test split.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
# Logistic Regression for multi-class topic classification

# Word-count features feed the LDA topic model (fitted on the training split only).
cv = CountVectorizer(stop_words='english', max_features=5000)
X_train_counts = cv.fit_transform(X_train)
X_test_counts = cv.transform(X_test)

K = 5
lda = LatentDirichletAllocation(random_state=42, n_components=K)
lda.fit(X_train_counts)

train_topic_dist = lda.transform(X_train_counts)
test_topic_dist = lda.transform(X_test_counts)

# Use the index of the largest value in each row of the LDA output as that document's label.
y_train_topics = np.argmax(train_topic_dist, axis=1)
y_test_topics = np.argmax(test_topic_dist, axis=1)

print("Derived topics in train:", np.unique(y_train_topics))

# Train the classifier on TF-IDF features to predict the LDA-derived labels.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train_topics)
y_pred_topics = clf.predict(X_test_tfidf)

print(classification_report(y_test_topics, y_pred_topics))

Derived topics in train: [0 1 2 3 4]
              precision    recall  f1-score   support

           0       0.70      0.88      0.78       208
           1       0.82      0.55      0.66       109
           2       0.83      0.93      0.88       302
           3       0.88      0.69      0.77        84
           4       0.96      0.39      0.56        56

    accuracy                           0.80       759
   macro avg       0.84      0.69      0.73       759
weighted avg       0.81      0.80      0.79       759



In [9]:
# Results



In [10]:
# Final conclusion