In [1]:
#Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

import torch
import transformers
import praw
import warnings

from pmaw import PushshiftAPI

warnings.filterwarnings('ignore')

In [2]:
#Scraping Reddit
reddit = praw.Reddit(client_id='BIYwp3HhdUEh9_uOUVr0Tg', client_secret='JXTjLPONmrctnE6CCx5Z9BjYuaSXkw', user_agent='wsbScrape')

#Collection of Initial DataSet
posts = []
wsbData = reddit.subreddit('WallStreetBets')

#This fails around 100 posts. Not robust enough for large data collection.
for post in wsbData.top(limit=10):
    posts.append([post.title, post.score, post.id, post.url, post.num_comments, post.selftext, post.created])
   
dataSet = pd.DataFrame(posts,columns=['title', 'score', 'id', 'url', 'num_comments', 'body', 'created'])
dataSet.head()

Unnamed: 0,title,score,id,url,num_comments,body,created
0,Times Square right now,467860,l8rf4k,https://v.redd.it/x64z70f7eie61,13702,,1612030000.0
1,UPVOTE so everyone sees we got SUPPORT,331527,l6wu59,https://i.redd.it/sgoqy8nyt2e61.png,12932,,1611841000.0
2,GME YOLO update — Jan 28 2021,293629,l78uct,https://i.redd.it/opzucppb15e61.png,23298,,1611868000.0
3,GME YOLO month-end update — Jan 2021,260252,l846a1,https://i.redd.it/r557em3t5ce61.png,20232,,1611954000.0
4,It’s treason then,239236,l881ia,https://i.redd.it/d3t66lv1yce61.jpg,4643,,1611964000.0


In [3]:
#This API works better for large data set collections and is multi threaded.
api = PushshiftAPI()

#Set the range and subreddit for data collection
#In this case 1000000 posts from r/wallstreetbets January 1, 2021 to September 1, 2022.
before = int(dt.datetime(2022,9,1,0,0).timestamp())
after = int(dt.datetime(2021,1,1,0,0).timestamp())
subreddit="wallstreetbets"
limit=1000000

#Function to filter out posts with less than 10 upvotes.
def fxn(item):
  return item['score'] >= 10

#Collect the posts and print total number when done.
#Completed in 5 Hours 34 Minutes.
api_praw = PushshiftAPI(praw=reddit, num_workers = 18)
posts = api_praw.search_submissions(subreddit=subreddit, limit=limit, before=before, after=after, filter_fn=fxn)
print(f'Retrieved {len(posts)} posts from Pushshift')

#Save Example as CSV to inspect output format.
postsDF = pd.DataFrame(posts)
postsDF.to_csv('./wsb_posts.csv', header=True, index=False, columns=list(postsDF.axes[1]))

Retrieved 125053 posts from Pushshift


In [78]:
#I am running this after restarting Jupyter Notebook so I will read from the CSV file now.
postsDF = pd.read_csv('wsb_posts.csv', header=0)

#Convert UTC Epoch to standard date and time.
postsDF['date'] = pd.to_datetime(postsDF['created_utc'], utc=True, unit='s')

#Create new DataFrame only with info that is required to reduce file size.
dataset = postsDF[['title', 'selftext', 'link_flair_css_class','score', 'upvote_ratio', 'num_comments', 'date','permalink']]
#dataset.to_csv('./wsb_dataset.csv', header=True, index=False, columns=list(dataset.axes[1]))

#Show the raw DataFrame.
dataset.head()

Unnamed: 0,title,selftext,link_flair_css_class,score,upvote_ratio,num_comments,date,permalink
0,Yolo loss waiting to picked up at the moon. Am...,,yolo,14,0.83,16,2021-04-09 19:42:24+00:00,/r/wallstreetbets/comments/mnp084/yolo_loss_wa...
1,"So, I didn’t tell the wife I used the equity f...",[deleted],yolo,7768,0.81,1304,2021-04-09 19:39:53+00:00,/r/wallstreetbets/comments/mnoyga/so_i_didnt_t...
2,... Then You Haven't Met The Apes...,,meme,70,0.88,19,2021-04-09 19:38:01+00:00,/r/wallstreetbets/comments/mnox4t/then_you_hav...
3,I know it's a difficult day for GME apes. I`m ...,,meme,40,0.92,13,2021-04-09 19:36:04+00:00,/r/wallstreetbets/comments/mnovrc/i_know_its_a...
4,You see this my fellow apes? ITS RISING KEEP T...,,profit,246,0.96,16,2021-02-03 18:36:23+00:00,/r/wallstreetbets/comments/lbuln3/you_see_this...


In [79]:
profit = dataset[dataset.link_flair_css_class == 'profit']
loss = dataset[dataset.link_flair_css_class == 'loss']

#We have 10869 posts tagged 'profit'.
#These will make up the positive sentiment data set.
#Now to create the positive sentiment data set.
profit = profit[['title','link_flair_css_class']]
profit = profit.rename(columns={'title':'text', 'link_flair_css_class' : 'label'})
profit = profit.assign(label=1)
profit.sample(7500).to_csv('./positive.csv', header=True, index=False, columns=list(profit.axes[1]))

#We have 7924 posts tagged 'loss'.
#These will make up the negative sentiment data set.
loss = loss[['title','link_flair_css_class']]
loss = loss.rename(columns={'title':'text', 'link_flair_css_class' : 'label'})
loss = loss.assign(label=0)
loss.sample(7500).to_csv('./negative.csv', header=True, index=False, columns=list(loss.axes[1]))

#A validation data set containing 500 random posts will be selected from the remaining posts
#I will classify this data myself
test = dataset[dataset.score >=1000]
test = test.drop(test.loc[test['link_flair_css_class']=='profit'].index)
test = test.drop(test.loc[test['link_flair_css_class']=='loss'].index)
test = test.drop(test.loc[test['link_flair_css_class']==''].index)
test = test.drop(test.loc[test['title']=='[deleted by user]'].index)
test = test[['title','link_flair_css_class']]
test = test.rename(columns={'title':'text', 'link_flair_css_class' : 'label'})
test = test.assign(label=1)
test.sample(500).to_csv('./test.csv', header=True, index=False, columns=list(test.axes[1]))

#The Data will now be checked over to ensure proper formating as some post titles cause issues with CSV files

In [2]:
#Imports
from datasets import load_dataset, Dataset, load_metric
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer as at
from transformers import DataCollatorWithPadding as dcwp
from transformers import AutoModelForSequenceClassification as amfsc
from transformers import DataCollatorWithPadding as dcwp

#Compute the accuacy of the model once completed training.
def compute_metrics(eval_pred):
   load_accuracy = load_metric("accuracy")
   load_f1 = load_metric("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
   return {"accuracy": accuracy, "f1": f1}

#The function that tokenizes the text.
def encode(data):
    return tokenizer(data["text"], truncation=True, padding=True)

#Load and split the data sets.
dataset = load_dataset('csv', data_files=['positive_clean.csv', 'negative_clean.csv'])['train'].shuffle(seed=41).train_test_split(test_size=0.2)

#Specify the model to be used for fine tuning.
model_name = 'roberta-large'
model = amfsc.from_pretrained(model_name, num_labels=2)
tokenizer = at.from_pretrained(model_name)
data_collator = dcwp(tokenizer=tokenizer)


Using custom data configuration default-7809931090712c0e
Reusing dataset csv (/home/reilly/.cache/huggingface/datasets/csv/default-7809931090712c0e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/reilly/.cache/huggingface/datasets/csv/default-7809931090712c0e/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-618203d2865f6021.arrow
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from

In [3]:
#Tokenize the data using a map function to speed up the process.
token_train = dataset['train'].map(encode, batched=True)
token_test = dataset['test'].map(encode, batched=True)

  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [4]:
#Setting the Paramaters for fine tunning.
training_args = TrainingArguments(
   output_dir='roberta15000',
   learning_rate=2e-5,
   per_device_train_batch_size=14,
   per_device_eval_batch_size=14,
   num_train_epochs=12,
   weight_decay=0.01,
   save_strategy="epoch",
)


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=token_train,
   eval_dataset=token_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [5]:
#Train the model. Took roughly 1 hour with two rtx 3090's
#Had trouble fitting the models in VRAM. It was close and dependant on other system processes.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12000
  Num Epochs = 12
  Instantaneous batch size per device = 14
  Total train batch size (w. parallel, distributed & accumulation) = 28
  Gradient Accumulation steps = 1
  Total optimization steps = 5148


Step,Training Loss
500,0.5463
1000,0.3964
1500,0.3158
2000,0.2325
2500,0.1798
3000,0.1427
3500,0.1106
4000,0.0936
4500,0.0667
5000,0.058


Saving model checkpoint to test_15000_samples/checkpoint-429
Configuration saved in test_15000_samples/checkpoint-429/config.json
Model weights saved in test_15000_samples/checkpoint-429/pytorch_model.bin
tokenizer config file saved in test_15000_samples/checkpoint-429/tokenizer_config.json
Special tokens file saved in test_15000_samples/checkpoint-429/special_tokens_map.json
Saving model checkpoint to test_15000_samples/checkpoint-858
Configuration saved in test_15000_samples/checkpoint-858/config.json
Model weights saved in test_15000_samples/checkpoint-858/pytorch_model.bin
tokenizer config file saved in test_15000_samples/checkpoint-858/tokenizer_config.json
Special tokens file saved in test_15000_samples/checkpoint-858/special_tokens_map.json
Saving model checkpoint to test_15000_samples/checkpoint-1287
Configuration saved in test_15000_samples/checkpoint-1287/config.json
Model weights saved in test_15000_samples/checkpoint-1287/pytorch_model.bin
tokenizer config file saved in tes

TrainOutput(global_step=5148, training_loss=0.20953205452469215, metrics={'train_runtime': 3954.152, 'train_samples_per_second': 36.417, 'train_steps_per_second': 1.302, 'total_flos': 9.301279305256848e+16, 'train_loss': 0.20953205452469215, 'epoch': 12.0})

In [7]:
#Evaluate the model using the evaluate function
#Initial performace on distil-bert and bert was lower at 0.76 accuracy and f1 score
#roBERTa preformed approximatley 5% better at .81 accuracy and f1 score, this is on the low side of accceptable.
#The f1 score is a good metric to use as it accounts for false positives in its calculations.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3000
  Batch size = 28


{'eval_loss': 1.2885855436325073,
 'eval_accuracy': 0.81,
 'eval_f1': 0.8105053191489362,
 'eval_runtime': 14.8971,
 'eval_samples_per_second': 201.382,
 'eval_steps_per_second': 7.25,
 'epoch': 12.0}

In [None]:
#I will continue to explore using other models on my collected data set