# Subreddit Classification

## Import necessary libraries and initialize Reddit object with Reddit developer credentials

In [None]:
import pandas as pd
import praw

# Replace the placeholder values below with your own Reddit developer credentials
credentials = {
    "client_id": "my-client-id",
    "client_secret": "my-client-secret",
    "password": "my-password",
    "user_agent": "my-user-agent",
    "username": "my-username",
}

# Reddit client used by the later cells to look up subreddit metadata
reddit = praw.Reddit(**credentials)

## Load datasets saved from get_active_users_data.ipynb 

In [None]:
# Subreddit whose poster/commenter activity datasets are loaded; change as needed
sub = "sgexams"

# NOTE(review): the subreddit name follows "Datasets/Cleaned" with no separator —
# confirm this matches the file names written by get_active_users_data.ipynb
post_df = pd.read_csv(f"Datasets/Cleaned{sub}_posters_activity.csv")
comment_df = pd.read_csv(f"Datasets/Cleaned{sub}_commenters_activity.csv")

In [None]:
# Collect every distinct subreddit name seen in either dataset.
# Each .unique() result is a NumPy array, so it must be turned into a set of
# its elements; wrapping the arrays in lists and calling set() on that would
# try to hash the arrays themselves and raise TypeError.
subreddits = set(post_df['subreddit'].unique()) | set(comment_df['subreddit'].unique())

In [None]:
# Second subreddit whose activity datasets are loaded; change as needed
sub = "teenagers"

# NOTE(review): the subreddit name follows "Datasets/Cleaned" with no separator —
# confirm this matches the file names written by get_active_users_data.ipynb
post_df = pd.read_csv(f"Datasets/Cleaned{sub}_posters_activity.csv")
comment_df = pd.read_csv(f"Datasets/Cleaned{sub}_commenters_activity.csv")

In [None]:
# Distinct subreddit names in the second pair of datasets. The arrays returned
# by .unique() are converted element-wise into sets; a set of the arrays
# themselves would raise TypeError because NumPy arrays are unhashable.
temp = set(post_df['subreddit'].unique()) | set(comment_df['subreddit'].unique())

# Merge with the names gathered earlier and freeze the result as a list so it
# can be iterated in a fixed order and used as a DataFrame column
subreddits = list(subreddits.union(temp))

## Get subreddit descriptions and use PRAW over18 attribute to identify NSFW subreddits

In [None]:
# Parallel lists: descriptions[i] and topics[i] describe subreddits[i]
descriptions = []
topics = []

# The loop variable is deliberately not called `sub`, so the subreddit name
# chosen in the loading cells is not overwritten.
for name in subreddits:
    # Defaults used when the lookup fails part-way through:
    # no description, and '?' meaning "still requires classification"
    description = ''
    topic = '?'

    try:
        subreddit = reddit.subreddit(name)
        description = subreddit.public_description
        if subreddit.over18:
            topic = 'NSFW'
    except Exception as err:
        # Best effort: a subreddit that cannot be fetched must not abort the
        # whole run, and both lists must still receive exactly one entry.
        # `Exception` (rather than a bare except) lets Ctrl-C interrupt the loop.
        print(f"Could not fetch r/{name}: {err!r}")

    descriptions.append(description)
    topics.append(topic)

In [None]:
# One row per subreddit: its name, its provisional topic ('NSFW' or '?')
# and its public description
subs = pd.DataFrame({
    'subreddit': subreddits,
    'topic': topics,
    'description': descriptions,
})

## Save to csv so manual classification can be done in Excel
Remember to leave some of them as '?' for the model to classify :)

In [None]:
# Write the table without the index column so it can be edited by hand
subs.to_csv('subreddits.csv', index=False)

## Load the dataframe back after classifying some subreddits   
The original classifications are available in the native repository as manual_subreddits.csv

In [None]:
# Reload the table after some topics have been filled in manually
subs = pd.read_csv('subreddits.csv')

## Concatenate subreddit name to description
This was added after reflecting on the research process 

In [None]:
# Prefix each description with the subreddit name so the name itself becomes
# part of the text that is embedded. This must operate on the `subs`
# DataFrame; `sub` is only a subreddit-name string.
# NOTE: run this cell once per load — re-running it prepends the name again.
subs['description'] = subs['subreddit'] + ' ' + subs['description']

## Separate the manually classified subreddits and the subreddits to be classified by the model

In [None]:
# Manually classified rows are those whose topic is NEITHER 'NSFW' NOR '?'.
# (Combining the two inequality tests with `|` would be true for every row,
# because no topic can equal both values at once.)
manual = subs.loc[~subs['topic'].isin(['NSFW', '?'])]
# Training rows need complete data and a description long enough to embed
manual = manual.dropna()
manual = manual[manual['description'].str.len()>50]

# Rows flagged NSFW, and rows still to be classified by the model
nsfw = subs[subs['topic'] == 'NSFW']
tbc = subs[subs['topic'] == '?']

## Import BERT and embed subreddit descriptions 

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used below to encode the subreddit descriptions
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Embed the descriptions of the manually classified subreddits;
# one DataFrame row per description
sentences = manual['description'].tolist()
embeddings = model.encode(sentences)
train_descriptions = pd.DataFrame(embeddings)

In [None]:
# Attach name and topic by position (plain lists), since the embedding frame
# has a fresh index that does not match the index of `manual`
train_descriptions['subreddit'] = manual['subreddit'].tolist()
train_descriptions['topic'] = manual['topic'].tolist()

In [None]:
# Persist the training embeddings together with their labels
train_descriptions.to_csv('train_descriptions.csv', index=False)

In [None]:
# Embed the descriptions of the subreddits that still need a topic
sentences = tbc['description'].tolist()
embeddings = model.encode(sentences)
goal_descriptions = pd.DataFrame(embeddings)

In [None]:
# Attach the subreddit names by position (plain list, no index alignment)
goal_descriptions['subreddit'] = tbc['subreddit'].tolist()

In [None]:
# Persist the embeddings of the subreddits awaiting classification
goal_descriptions.to_csv('goal_descriptions.csv', index=False)

## Import pycaret and set up classification model environment

In [None]:
from pycaret.classification import *

# Train on the embedding columns only: the subreddit name is an identifier,
# not a feature, and 'topic' is the label to predict
training_frame = train_descriptions.drop(columns='subreddit')
s = setup(training_frame, target='topic')

## Train models and print a comparison grid of performance metrics
The excluded models took very long to run in the first couple of iterations and consistently returned terrible performance

In [None]:
# Estimators left out of the comparison
excluded_models = ['gbc', 'lightgbm', 'ada', 'qda']

# Keep the three best candidates, ranked by F1
models = compare_models(n_select=3, exclude=excluded_models, sort='F1')

## Tune the hyperparameters of the 3 best performing models

In [None]:
# Replace each selected model, in place, with its tuned counterpart
for position, candidate in enumerate(models):
    models[position] = tune_model(candidate)

## View confusion matrices for each tuned model and choose the best one
Linear discriminant analysis (LDA) was chosen for its strong performance across 15 out of 19 classes

In [None]:
# Inspect the first candidate model
evaluate_model(models[0])

In [None]:
# Inspect the second candidate model
evaluate_model(models[1])

In [None]:
# Inspect the third candidate model
evaluate_model(models[2])

## Finalize and save the chosen model

In [None]:
# Index of the chosen model within `models`; change to your chosen model
chosen_index = 0
final_model = finalize_model(models[chosen_index])

# Change the model file name as desired
save_model(final_model, 'final_lda_description')

## Use the finalized model to classify the unclassified subreddits

In [None]:
# Predict a topic for every unclassified subreddit. The 'subreddit' column is
# dropped because the model was trained on the embedding columns only.
# (The call previously lacked its closing parenthesis and did not parse.)
prediction = predict_model(final_model, data=goal_descriptions.drop(columns='subreddit'))

In [None]:
# tbc was obtained by boolean-filtering subs; take an explicit copy before
# adding columns so the assignment is unambiguous and cannot trigger pandas'
# SettingWithCopyWarning
tbc = tbc.copy()

# Values are assigned by position (plain lists) rather than aligned on index
tbc['topic'] = prediction['prediction_label'].tolist()
tbc['confidence'] = prediction['prediction_score'].tolist()

## Inspect predictions with low confidence scores
Manually classify later if needed

In [None]:
# Predictions scoring below this threshold are shown for manual review
min_confidence = 0.5
is_low_confidence = tbc['confidence'] < min_confidence
tbc.loc[is_low_confidence]

## Save the dataframe with all subreddits classified

In [None]:
# Stack the three groups back into one table; the confidence column exists
# only on the model-classified rows, so it is dropped before concatenating
classified_parts = [manual, tbc.drop(columns='confidence'), nsfw]
subs = pd.concat(classified_parts, axis=0)

# Overwrites the earlier subreddits.csv with the fully classified table
subs.to_csv('subreddits.csv', index=False)