In [40]:
#Imports here, no need for structure
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /Users/trevorle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/trevorle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/trevorle/nltk_data...


True

In [41]:
#pull in the data 
import pandas as pd

df = pd.read_csv('reddit_wsb.csv')
df.dropna(inplace=True)

df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1611862000.0,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31
10,"We need to keep this movement going, we all ca...",222,l6uao1,https://www.reddit.com/r/wallstreetbets/commen...,70,1611862000.0,I believe right now is one of those rare oppo...,2021-01-28 21:18:25
12,"Once you're done with GME - $AG and $SLV, the ...",0,l6u9wu,https://www.reddit.com/r/wallstreetbets/commen...,16,1611861000.0,You guys are champs. GME... who would have tho...,2021-01-28 21:17:10


In [42]:
#tokenization
def tokenize_title(text):
    return word_tokenize(text)

def tokenize_body(text):
    return sent_tokenize(text)

df['tokenized_title'] = df['title'].apply(tokenize_title)

df['tokenized_body'] = df['body'].apply(tokenize_body)

df


Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,tokenized_title,tokenized_body
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1.611862e+09,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,"[Exit, the, system]",[The CEO of NASDAQ pushed to halt trading “to ...
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1.611862e+09,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,"[SHORT, STOCK, DOES, N'T, HAVE, AN, EXPIRATION...",[Hedgefund whales are spreading disinfo saying...
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1.611862e+09,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31,"[THIS, IS, THE, MOMENT]","[Life isn't fair., My mother always told me th..."
10,"We need to keep this movement going, we all ca...",222,l6uao1,https://www.reddit.com/r/wallstreetbets/commen...,70,1.611862e+09,I believe right now is one of those rare oppo...,2021-01-28 21:18:25,"[We, need, to, keep, this, movement, going, ,,...",[ I believe right now is one of those rare opp...
12,"Once you're done with GME - $AG and $SLV, the ...",0,l6u9wu,https://www.reddit.com/r/wallstreetbets/commen...,16,1.611861e+09,You guys are champs. GME... who would have tho...,2021-01-28 21:17:10,"[Once, you, 're, done, with, GME, -, $, AG, an...","[You guys are champs., GME... who would have t..."
...,...,...,...,...,...,...,...,...,...,...
53181,Ten Year Price Prediction for TSLA,156,owfbxp,https://www.reddit.com/r/wallstreetbets/commen...,204,1.627913e+09,"It’s all contingent on them mastering FSD, but...",2021-08-02 17:11:36,"[Ten, Year, Price, Prediction, for, TSLA]","[It’s all contingent on them mastering FSD, bu..."
53182,What I Learned Investigating SAVA FUD Spreaders,238,owd2pn,https://www.reddit.com/r/wallstreetbets/commen...,87,1.627906e+09,***TLDR: Three bitter scientists partnered up ...,2021-08-02 15:03:27,"[What, I, Learned, Investigating, SAVA, FUD, S...",[***TLDR: Three bitter scientists partnered up...
53183,"Daily Popular Tickers Thread for August 02, 20...",228,owd1a5,https://www.reddit.com/r/wallstreetbets/commen...,1070,1.627906e+09,\nYour daily hype thread. Please keep the shit...,2021-08-02 15:01:03,"[Daily, Popular, Tickers, Thread, for, August,...","[\nYour daily hype thread., Please keep the sh..."
53185,"Daily Discussion Thread for August 02, 2021",338,owbfjf,https://www.reddit.com/r/wallstreetbets/commen...,11688,1.627898e+09,Your daily trading discussion thread. Please k...,2021-08-02 13:00:16,"[Daily, Discussion, Thread, for, August, 02, ,...","[Your daily trading discussion thread., Please..."


In [44]:
#normalization
#1st apply everything into lower case
df['title_lower'] = df['tokenized_title'].apply(lambda x: [token.lower() for token in x])

df['body_lower'] = df['tokenized_body'].apply(lambda x: [token.lower() for token in x])

#2nd remove all of the stop words.
stop_words = set(stopwords.words('english'))

df['title_no_stopwords'] = df['tokenized_title'].apply(lambda x: [token for token in x if token.lower() not in stop_words])

df['body_no_stopwords'] = df['tokenized_body'].apply(lambda x: [token for token in x if token.lower() not in stop_words])

#3rd apply lemmatizer
lemmatizer = WordNetLemmatizer()
df['title_lemmatized'] = df['tokenized_title'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])

#printing and each column contains a seperate normalization
df

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp,tokenized_title,tokenized_body,title_lower,body_lower,title_no_stopwords,body_no_stopwords,title_lemmatized
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1.611862e+09,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35,"[Exit, the, system]",[The CEO of NASDAQ pushed to halt trading “to ...,"[exit, the, system]",[the ceo of nasdaq pushed to halt trading “to ...,"[Exit, system]",[The CEO of NASDAQ pushed to halt trading “to ...,"[Exit, the, system]"
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1.611862e+09,Hedgefund whales are spreading disinfo saying ...,2021-01-28 21:26:27,"[SHORT, STOCK, DOES, N'T, HAVE, AN, EXPIRATION...",[Hedgefund whales are spreading disinfo saying...,"[short, stock, does, n't, have, an, expiration...",[hedgefund whales are spreading disinfo saying...,"[SHORT, STOCK, N'T, EXPIRATION, DATE]",[Hedgefund whales are spreading disinfo saying...,"[SHORT, STOCK, DOES, N'T, HAVE, AN, EXPIRATION..."
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1.611862e+09,Life isn't fair. My mother always told me that...,2021-01-28 21:19:31,"[THIS, IS, THE, MOMENT]","[Life isn't fair., My mother always told me th...","[this, is, the, moment]","[life isn't fair., my mother always told me th...",[MOMENT],"[Life isn't fair., My mother always told me th...","[THIS, IS, THE, MOMENT]"
10,"We need to keep this movement going, we all ca...",222,l6uao1,https://www.reddit.com/r/wallstreetbets/commen...,70,1.611862e+09,I believe right now is one of those rare oppo...,2021-01-28 21:18:25,"[We, need, to, keep, this, movement, going, ,,...",[ I believe right now is one of those rare opp...,"[we, need, to, keep, this, movement, going, ,,...",[ i believe right now is one of those rare opp...,"[need, keep, movement, going, ,, make, history...",[ I believe right now is one of those rare opp...,"[We, need, to, keep, this, movement, going, ,,..."
12,"Once you're done with GME - $AG and $SLV, the ...",0,l6u9wu,https://www.reddit.com/r/wallstreetbets/commen...,16,1.611861e+09,You guys are champs. GME... who would have tho...,2021-01-28 21:17:10,"[Once, you, 're, done, with, GME, -, $, AG, an...","[You guys are champs., GME... who would have t...","[once, you, 're, done, with, gme, -, $, ag, an...","[you guys are champs., gme... who would have t...","['re, done, GME, -, $, AG, $, SLV, ,, gentlema...","[You guys are champs., GME... who would have t...","[Once, you, 're, done, with, GME, -, $, AG, an..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53181,Ten Year Price Prediction for TSLA,156,owfbxp,https://www.reddit.com/r/wallstreetbets/commen...,204,1.627913e+09,"It’s all contingent on them mastering FSD, but...",2021-08-02 17:11:36,"[Ten, Year, Price, Prediction, for, TSLA]","[It’s all contingent on them mastering FSD, bu...","[ten, year, price, prediction, for, tsla]","[it’s all contingent on them mastering fsd, bu...","[Ten, Year, Price, Prediction, TSLA]","[It’s all contingent on them mastering FSD, bu...","[Ten, Year, Price, Prediction, for, TSLA]"
53182,What I Learned Investigating SAVA FUD Spreaders,238,owd2pn,https://www.reddit.com/r/wallstreetbets/commen...,87,1.627906e+09,***TLDR: Three bitter scientists partnered up ...,2021-08-02 15:03:27,"[What, I, Learned, Investigating, SAVA, FUD, S...",[***TLDR: Three bitter scientists partnered up...,"[what, i, learned, investigating, sava, fud, s...",[***tldr: three bitter scientists partnered up...,"[Learned, Investigating, SAVA, FUD, Spreaders]",[***TLDR: Three bitter scientists partnered up...,"[What, I, Learned, Investigating, SAVA, FUD, S..."
53183,"Daily Popular Tickers Thread for August 02, 20...",228,owd1a5,https://www.reddit.com/r/wallstreetbets/commen...,1070,1.627906e+09,\nYour daily hype thread. Please keep the shit...,2021-08-02 15:01:03,"[Daily, Popular, Tickers, Thread, for, August,...","[\nYour daily hype thread., Please keep the sh...","[daily, popular, tickers, thread, for, august,...","[\nyour daily hype thread., please keep the sh...","[Daily, Popular, Tickers, Thread, August, 02, ...","[\nYour daily hype thread., Please keep the sh...","[Daily, Popular, Tickers, Thread, for, August,..."
53185,"Daily Discussion Thread for August 02, 2021",338,owbfjf,https://www.reddit.com/r/wallstreetbets/commen...,11688,1.627898e+09,Your daily trading discussion thread. Please k...,2021-08-02 13:00:16,"[Daily, Discussion, Thread, for, August, 02, ,...","[Your daily trading discussion thread., Please...","[daily, discussion, thread, for, august, 02, ,...","[your daily trading discussion thread., please...","[Daily, Discussion, Thread, August, 02, ,, 2021]","[Your daily trading discussion thread., Please...","[Daily, Discussion, Thread, for, August, 02, ,..."


In [None]:
#I think this spot needs to do some TF-IDF stuff 
from sklearn.feature_extraction.text import TfidfVectorizer
