In [18]:
import pandas as pd
import glob
import datetime
import import_ipynb
from NER import get_entities

In [19]:
# Read every CSV file in the all_data_streams folder into a single DataFrame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
# sorted() makes the row order reproducible, because glob returns paths in
# arbitrary (filesystem-dependent) order.
csv_paths = sorted(glob.glob('../Data_Collecting/data_files/all_data_streams/'+'*.csv'))
frames = [pd.read_csv(file_name, lineterminator='\n', index_col=0) for file_name in csv_paths]
# pd.concat raises on an empty list, so fall back to an empty frame when no file matches.
data = pd.concat(frames) if frames else pd.DataFrame()

In [20]:
# Size of the combined frame as (rows, columns).
data.shape

(643771, 9)

In [26]:
# Peek at three random rows (no random_state is set, so the rows differ on every run).
data.sample(3)

Unnamed: 0,comment_id,created,fullname,parent_id,subreddit,text,post_id,title,Unnamed: 0.1
628,gzo5pen,2021-05-27 19:08:52,t1_gzo5pen,t1_gzo5oh1,wallstreetbets,Cash,,,
7134,gyqutte,2021-05-19 21:11:27,t1_gyqutte,t1_gyqmde3,wallstreetbets,Mine are up 16% today.,,,
6156,,2021-05-24 19:44:26,t3_nk6h2q,,wallstreetbets,,nk6h2q,An interesting angle on E.S.G. Investing,


## Clean data from data set

In [15]:
# Convert a timestamp to a plain calendar date (datetime.date)
def datetime_to_date(timestamp):
    """Return the calendar date of ``timestamp``.

    The value is parsed with ``pd.to_datetime`` and the result of its
    ``.date()`` method is returned, so the time-of-day part is discarded.
    """
    parsed = pd.to_datetime(timestamp)
    return parsed.date()

In [16]:
# Tidy the raw frame. Every step assigns a new frame/column instead of using inplace=True.
# 'Unnamed: 0.1' is presumably a leftover index column from an earlier export -- TODO confirm.
data = data.drop(columns=['Unnamed: 0.1', 'title'])
# Keep only the two-character prefix of the fullname (t3 = submission, t1 = comment).
# The vectorised .str accessor replaces a per-row lambda and passes missing values through.
data['fullname'] = data['fullname'].str[:2]
# Parse the whole column in one call rather than once per row, then keep only the
# datetime.date part.
# NOTE(review): assumes every 'created' value shares one timestamp format -- confirm.
data['created'] = pd.to_datetime(data['created']).dt.date
data = data.rename(columns={'fullname': 'type', 'post_id': 'submission_id'})
data.head()

Unnamed: 0,comment_id,created,type,parent_id,subreddit,text,submission_id
0,gylrunc,2021-05-18,t1,t3_nf66zs,stocks,And it’s all going green again,
1,gylrwn7,2021-05-18,t1,t1_gylh7q1,stocks,Uh no. I already signed up forv$10K add on to ...,
2,gylrzsa,2021-05-18,t1,t1_gylhiir,stocks,Billionaire investors aren’t actually any smar...,
3,gylrzv8,2021-05-18,t1,t1_gylhiir,stocks,"Well, he’s a monkey, so...",
4,gyls2uk,2021-05-18,t1,t3_nfj008,stocks,There are only 2 rules i follow in investing:\...,


In [17]:
# Summary statistics per column (count / unique / top / freq for these object columns).
data.describe()

Unnamed: 0,comment_id,created,type,parent_id,subreddit,text,submission_id
count,593201,643771,643771,593201,643771,615296,50570
unique,591032,33,2,243814,7,539955,5795
top,gzfmh9p,2021-05-28,t1,t3_nmue5k,wallstreetbets,I am a bot from /r/wallstreetbets. Your submis...,nera1s
freq,4,93654,593201,18336,555055,3982,83


In [18]:
# Number of rows per subreddit, most frequent first.
data['subreddit'].value_counts()

wallstreetbets    555055
stocks             35377
investing          22308
StockMarket        14014
pennystocks        12012
algotrading         4004
RobinHood           1001
Name: subreddit, dtype: int64

In [29]:
# Not every row carries every ID column (see the describe() counts), so blank
# the missing IDs instead of letting dropna() discard those rows.
# The result is assigned back to `data`: calling column.fillna(..., inplace=True)
# works on a temporary Series and is not guaranteed to write through to the
# frame (it is a no-op under pandas Copy-on-Write), which would make the
# dropna() below remove the affected rows.
id_columns = ['submission_id', 'comment_id', 'parent_id']
data = data.fillna({column: "" for column in id_columns})
# Drop the rows that still contain a missing value in any remaining column.
data = data.dropna()
# Count the remaining missing values; kept as the last expression so the cell displays it.
data.isnull().sum()

comment_id       0
created          0
type             0
parent_id        0
subreddit        0
text             0
submission_id    0
dtype: int64

## Add stock mentions to data set

In [74]:
# Symbols that are kept as-is when they appear among the extracted entities.
TOP_20 = [
    'AMC', 'GME', 'PLTR', 'AAPL', 'SPCE',
    'AMZN', 'FORD', 'AMD', 'TSLA', 'SEC',
    'ATH', 'NOK', 'OTM', 'ITM', 'AI',
    'COIN', 'VIX', 'BB', 'RBLX', 'HODL',
]

# Company name -> symbol; several spellings of a name may map to the same symbol.
TICKER_DICT = {
    "GameStop": "GME",
    "Palantir Technologies": "PLTR",
    "Apple": "AAPL",
    "Virgin Galactic": "SPCE",
    "Amazon": "AMZN",
    "Ford": "FORD",
    "Advanced Micro Devices": "AMD",
    "Tesla": "TSLA",
    "Senvest Capital": "SEC",
    "Senvest": "SEC",
    "Athene Holding": "ATH",
    "Athene": "ATH",
    "Nokia": "NOK",
    "ZPC Otmuchow SA": "OTM",
    "ZPC Otmuchow": "OTM",
    "ITM Power": "ITM",
    "BlackBerry": "BB",
    "c3.AI": "AI",
    "Coinbase Global": "COIN",
    "Coinbase": "COIN",
    "Roblox": "RBLX",
    "Cypherpunk": "HODL",
}

In [75]:
# cleaning the orgs lists (no duplicates, only top 20 stocks, only tickers)
def clean_orgs(organizations, allowed_tickers=None, name_to_ticker=None):
    """Reduce a list of extracted organizations to unique tracked tickers.

    An entry is kept unchanged when it is one of ``allowed_tickers``; otherwise,
    if it is a key of ``name_to_ticker``, it is replaced by the mapped ticker.
    Every other entry is discarded.

    Parameters
    ----------
    organizations : iterable of str or None
        Organization names/symbols for one text. ``None`` is treated as empty.
    allowed_tickers : collection of str, optional
        Symbols accepted as-is. Defaults to the module-level ``TOP_20``.
    name_to_ticker : dict, optional
        Company name -> symbol mapping. Defaults to the module-level ``TICKER_DICT``.

    Returns
    -------
    list of str
        The matched tickers without duplicates, in order of first mention.
    """
    # Resolve the defaults at call time so the module-level tables are picked up.
    if allowed_tickers is None:
        allowed_tickers = TOP_20
    if name_to_ticker is None:
        name_to_ticker = TICKER_DICT
    if organizations is None:
        return []

    # A dict is used as an insertion-ordered set: it removes duplicates in one
    # pass and keeps the result order stable between runs (a plain set does not).
    tickers = {}
    for org in organizations:
        if org in allowed_tickers:
            tickers[org] = None
        elif org in name_to_ticker:
            tickers[name_to_ticker[org]] = None
    return list(tickers)

In [78]:
# Run get_entities (imported from the NER notebook) on every text, then pass
# each result through clean_orgs and store the outcome in a new column.
extracted_entities = data['text'].apply(get_entities)
data['Organizations'] = extracted_entities.apply(clean_orgs)

## Save to csv

In [81]:
# Write the cleaned frame, including its index, to a CSV file.
output_path = './data_files/clean_data_with_orgs.csv'
data.to_csv(output_path)