# Title data

In [1]:
# Importing libraries and functions
import pandas as pd
import re #https://docs.python.org/3/library/re.html

# Read the raw r/wallstreetbets posts
wsb_data = pd.read_csv("../data/reddit_wsb.csv")

# Keep only posts from 2021 onwards: the dataset contains a single outlier
# post from 2020, roughly four months before the next one.
post_years = pd.to_datetime(wsb_data["timestamp"]).dt.year
wsb_data = wsb_data.loc[post_years >= 2021]

In [2]:
# The 'body' column has 28,449 rows with missing values, so the 'body' analysis
# (version 1) drops those rows first. The same analysis is then done on the
# 'title' data (version 2): keep the title and timestamp columns and drop any
# row whose title is missing.
title_data = (
    wsb_data.loc[:, ['title', 'timestamp']]
    .dropna(subset=['title'])
    .copy()
)

# Display data
title_data

Unnamed: 0,title,timestamp
0,"It's not about the money, it's about sending a...",2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,2021-01-28 21:32:10
2,Exit the system,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",2021-01-28 21:26:56
...,...,...
53182,What I Learned Investigating SAVA FUD Spreaders,2021-08-02 15:03:27
53183,"Daily Popular Tickers Thread for August 02, 20...",2021-08-02 15:01:03
53184,Hitler reacts to the market being irrational,2021-08-02 13:59:35
53185,"Daily Discussion Thread for August 02, 2021",2021-08-02 13:00:16


# Finding tickers in posts (title)

In [3]:
# Read the list of NYSE tickers
# https://github.com/datasets/nyse-other-listings/blob/main/data/other-listed.csv
nyse_tickers = pd.read_csv("../data/nyse-listed.csv")

# Collect the symbols found in the 'ACT Symbol' column into a plain Python list
ticker_list = [symbol for symbol in nyse_tickers['ACT Symbol']]

# Function to extract unique tickers from text
def extract_tickers(text, ticker_list):
    """Return the known tickers mentioned in ``text``, without duplicates.

    A candidate is any standalone word made of one to four uppercase ASCII
    letters. A candidate is kept only if it is contained in ``ticker_list``.

    Parameters
    ----------
    text : str
        Raw (not lower-cased) text to scan. Any non-string value, such as
        ``None`` or a NaN from a missing cell, yields an empty list.
    ticker_list : collection of str
        Known ticker symbols. Any container supporting ``in`` works; passing a
        ``set`` makes each lookup constant-time instead of a linear scan.

    Returns
    -------
    list of str
        Matching tickers in order of first appearance in ``text``. The order
        is deterministic, so repeated runs produce identical output.

    Notes
    -----
    Matching is purely lexical: an uppercase English word that is also a
    listed symbol (e.g. "FOR") is reported as a ticker.
    """
    # Missing values reach this function as float NaN / None; re.findall would
    # raise TypeError on them.
    if not isinstance(text, str):
        return []
    # dict.fromkeys de-duplicates while preserving first-appearance order,
    # unlike set(), whose iteration order can differ between interpreter runs.
    candidates = dict.fromkeys(re.findall(r'\b[A-Z]{1,4}\b', text))
    return [symbol for symbol in candidates if symbol in ticker_list]

# Extract tickers from each post title (done on the raw titles, before they
# are lower-cased, because the matcher looks for uppercase words)
title_data['tickers'] = title_data['title'].apply(extract_tickers, args=(ticker_list,))

# Display data
title_data

Unnamed: 0,title,timestamp,tickers
0,"It's not about the money, it's about sending a...",2021-01-28 21:37:41,[]
1,Math Professor Scott Steiner says the numbers ...,2021-01-28 21:32:10,[]
2,Exit the system,2021-01-28 21:30:35,[]
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,2021-01-28 21:28:57,"[GME, FOR]"
4,"Not to distract from GME, just thought our AMC...",2021-01-28 21:26:56,"[GME, AMC]"
...,...,...,...
53182,What I Learned Investigating SAVA FUD Spreaders,2021-08-02 15:03:27,[]
53183,"Daily Popular Tickers Thread for August 02, 20...",2021-08-02 15:01:03,"[GME, BABA]"
53184,Hitler reacts to the market being irrational,2021-08-02 13:59:35,[]
53185,"Daily Discussion Thread for August 02, 2021",2021-08-02 13:00:16,[]


# Text cleaning

In [4]:
# with inspiration from this code: 
# https://github.com/ida-code88/WallStreetBets-Sentiment-Analysis/blob/main/WallStreetBets%20Sentiment%20Analysis%20Final.ipynb

def clean_text(text):
    """Normalise a piece of post text into lowercase, space-separated words.

    Steps, in order:
      1. convert to ``str`` and lower-case;
      2. drop anything enclosed in square brackets;
      3. drop URLs (anything starting with ``http`` up to the next whitespace);
      4. drop HTML tags;
      5. drop handles such as ``@name``;
      6. keep only runs of word characters (letters, digits, underscore);
      7. drop words that consist of a single ASCII letter.

    Parameters
    ----------
    text : object
        Value to clean. ``None`` and float NaN (missing values) give ``''``.

    Returns
    -------
    str
        Cleaned text with words separated by exactly one space and no
        leading or trailing whitespace.
    """
    # Missing values would otherwise be stringified into the words
    # "none" / "nan" and pollute the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Ensure it's a string and convert to lower case
    text = str(text).lower()
    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', ' ', text)
    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', ' ', text)
    # Remove handles (e.g. @name)
    text = re.sub(r'@[^\s]+', ' ', text)
    # Keep only word characters
    tokens = re.findall(r'\w+', text)
    # Remove single-letter words. Filtering tokens (rather than substituting
    # on whitespace-delimited matches) also catches a single letter at the
    # start or end of the text and every letter in a run such as "a b c".
    kept = [token for token in tokens if not re.fullmatch(r'[a-zA-Z]', token)]
    # Joining with one space leaves no repeated or surrounding whitespace
    return ' '.join(kept)

# Replace every title with its cleaned version
cleaned_titles = title_data['title'].map(clean_text)
title_data['title'] = cleaned_titles

# Display data
title_data


Unnamed: 0,title,timestamp,tickers
0,it not about the money it about sending message,2021-01-28 21:37:41,[]
1,math professor scott steiner says the numbers ...,2021-01-28 21:32:10,[]
2,exit the system,2021-01-28 21:30:35,[]
3,new sec filing for gme can someone less retard...,2021-01-28 21:28:57,"[GME, FOR]"
4,not to distract from gme just thought our amc ...,2021-01-28 21:26:56,"[GME, AMC]"
...,...,...,...
53182,what learned investigating sava fud spreaders,2021-08-02 15:03:27,[]
53183,daily popular tickers thread for august 02 202...,2021-08-02 15:01:03,"[GME, BABA]"
53184,hitler reacts to the market being irrational,2021-08-02 13:59:35,[]
53185,daily discussion thread for august 02 2021,2021-08-02 13:00:16,[]


In [5]:
# Write the cleaned titles (full timestamps kept) to CSV, without the index
title_data.to_csv(
    "../data/cleaned_wsb_title_withtime.csv",
    index=False,
)

In [6]:
# Drop the time of day from 'timestamp', keeping only the calendar date
parsed_timestamps = pd.to_datetime(title_data['timestamp'])
title_data['timestamp'] = parsed_timestamps.dt.date

# Display data
title_data

Unnamed: 0,title,timestamp,tickers
0,it not about the money it about sending message,2021-01-28,[]
1,math professor scott steiner says the numbers ...,2021-01-28,[]
2,exit the system,2021-01-28,[]
3,new sec filing for gme can someone less retard...,2021-01-28,"[GME, FOR]"
4,not to distract from gme just thought our amc ...,2021-01-28,"[GME, AMC]"
...,...,...,...
53182,what learned investigating sava fud spreaders,2021-08-02,[]
53183,daily popular tickers thread for august 02 202...,2021-08-02,"[GME, BABA]"
53184,hitler reacts to the market being irrational,2021-08-02,[]
53185,daily discussion thread for august 02 2021,2021-08-02,[]


In [7]:
# Write the date-only version of the cleaned titles to CSV, without the index
title_data.to_csv(
    "../data/cleaned_wsb_title_withouttime.csv",
    index=False,
)