# Body data

In [1]:
# Importing libriaries and functions
import pandas as pd
import re #https://docs.python.org/3/library/re.html

# Load data
wsb_data = pd.read_csv("../data/reddit_wsb.csv")

# Considering the dataset available I am extracting only data from 2021 
# because of one outlier in 2020 4 months before the next post
wsb_data = wsb_data[pd.to_datetime(wsb_data.timestamp).dt.year>=2021] 

In [2]:
# There are 28,449 rows with missing values in the 'body' column. 
# I will first work with the column 'body' and drop the rows with missing values (version 1).
# I will then do the same analysis on the 'title' data (version 2).
body_data = wsb_data[['body', 'timestamp']].copy()
body_data = body_data.dropna(subset=['body'])

# Display data
body_data

Unnamed: 0,body,timestamp
2,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
6,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27
7,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31
10,I believe right now is one of those rare oppo...,2021-01-28 21:18:25
12,You guys are champs. GME... who would have tho...,2021-01-28 21:17:10
...,...,...
53181,"It’s all contingent on them mastering FSD, but...",2021-08-02 17:11:36
53182,***TLDR: Three bitter scientists partnered up ...,2021-08-02 15:03:27
53183,\nYour daily hype thread. Please keep the shit...,2021-08-02 15:01:03
53185,Your daily trading discussion thread. Please k...,2021-08-02 13:00:16


# Finding tickers in posts (body)

In [3]:
# Loading a list of tickers from NYSE
# https://github.com/datasets/nyse-other-listings/blob/main/data/other-listed.csv
nyse_tickers = pd.read_csv("../data/nyse-listed.csv")

# Extract the tickers from the 'ACT Symbol' column
ticker_list = list(nyse_tickers['ACT Symbol'])

# Function to extract unique tickers from text
def extract_tickers(text, ticker_list):
    # Find all uppercase words (possible tickers)
    words = re.findall(r'\b[A-Z]{1,4}\b', text)
    # Match against the ticker list and deduplicate
    return list(set(word for word in words if word in ticker_list))

# Apply the function to the 'body' column
body_data['tickers'] = body_data['body'].apply(lambda x: extract_tickers(x, ticker_list))

# Display data
body_data

Unnamed: 0,body,timestamp,tickers
2,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,[GME]
6,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,"[GME, BE, T]"
7,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31,"[ARE, GME, A, BB]"
10,I believe right now is one of those rare oppo...,2021-01-28 21:18:25,"[GME, AMC]"
12,You guys are champs. GME... who would have tho...,2021-01-28 21:17:10,"[AG, OUT, GME, BE, JPM]"
...,...,...,...
53181,"It’s all contingent on them mastering FSD, but...",2021-08-02 17:11:36,[]
53182,***TLDR: Three bitter scientists partnered up ...,2021-08-02 15:03:27,"[A, F, DD]"
53183,\nYour daily hype thread. Please keep the shit...,2021-08-02 15:01:03,[DD]
53185,Your daily trading discussion thread. Please k...,2021-08-02 13:00:16,[DD]


# Text cleaning

In [4]:
# with inspiration from this code: 
# https://github.com/ida-code88/WallStreetBets-Sentiment-Analysis/blob/main/WallStreetBets%20Sentiment%20Analysis%20Final.ipynb

def clean_text(text):
    # Ensure it's a string and converts to lower case
    text = str(text).lower()
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', ' ', text)
    # Remove handles (fx @name)
    text = re.sub(r'@[^\s]+', ' ', text)
    # Keep only word characters
    text = ' '.join(re.findall(r'\w+', text))
    # Remove single-character words
    text = re.sub(r'\s+[a-zA-Z]\s', ' ', text)
    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    return text.strip()

# Apply this function to DataFrame column
body_data['body'] = body_data['body'].apply(clean_text)

# Display data
body_data


Unnamed: 0,body,timestamp,tickers
2,the ceo of nasdaq pushed to halt trading to gi...,2021-01-28 21:30:35,[GME]
6,hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,"[GME, BE, T]"
7,life isn fair my mother always told me that wh...,2021-01-28 21:19:31,"[ARE, GME, A, BB]"
10,i believe right now is one of those rare oppor...,2021-01-28 21:18:25,"[GME, AMC]"
12,you guys are champs gme who would have thought...,2021-01-28 21:17:10,"[AG, OUT, GME, BE, JPM]"
...,...,...,...
53181,it all contingent on them mastering fsd but if...,2021-08-02 17:11:36,[]
53182,tldr three bitter scientists partnered up with...,2021-08-02 15:03:27,"[A, F, DD]"
53183,your daily hype thread please keep the shitpos...,2021-08-02 15:01:03,[DD]
53185,your daily trading discussion thread please ke...,2021-08-02 13:00:16,[DD]


In [5]:
# Save data to CSV
body_data.to_csv("../data/cleaned_wsb_body_withtime.csv", index=False)

In [6]:
# Remove time from 'timestamp' and only keeping the date
body_data.timestamp = pd.to_datetime(body_data.timestamp).dt.date

# Display data
body_data

Unnamed: 0,body,timestamp,tickers
2,the ceo of nasdaq pushed to halt trading to gi...,2021-01-28,[GME]
6,hedgefund whales are spreading disinfo saying ...,2021-01-28,"[GME, BE, T]"
7,life isn fair my mother always told me that wh...,2021-01-28,"[ARE, GME, A, BB]"
10,i believe right now is one of those rare oppor...,2021-01-28,"[GME, AMC]"
12,you guys are champs gme who would have thought...,2021-01-28,"[AG, OUT, GME, BE, JPM]"
...,...,...,...
53181,it all contingent on them mastering fsd but if...,2021-08-02,[]
53182,tldr three bitter scientists partnered up with...,2021-08-02,"[A, F, DD]"
53183,your daily hype thread please keep the shitpos...,2021-08-02,[DD]
53185,your daily trading discussion thread please ke...,2021-08-02,[DD]


In [7]:
# Save to CSV file
body_data.to_csv("../data/cleaned_wsb_body_withouttime.csv", index=False)