In [None]:
!pip install ipynb --upgrade

In [1]:
from ipynb.fs.full.NER import get_entities
import pandas as pd 
import datetime
import requests
from collections import Counter

### Load CSV

In [2]:
# Read the raw export that the rest of the notebook works on.
# Later cells access the columns 'Unnamed: 0', 'comment_id', 'fullname', 'parent_id',
# 'post_id', 'title', 'created', 'text' and 'subreddit'.
data = pd.read_csv('data_files/wallstreetbets_26.csv')

----

## Transforming the data

In [3]:
# Splits the raw frame into a submissions frame and a comments frame
def transform_reddit_data(data: pd.DataFrame):
    """Build the submissions and comments tables from the raw frame.

    Both tables drop the columns they do not need, convert 'created' with
    datetime_to_date, discard every row that contains a missing value and
    receive an 'Organizations' column computed from 'text' by get_entities
    followed by clean_orgs. The submissions table additionally renames
    'post_id' to 'id'.

    Returns a two-element list: [submissions, comments].
    """
    submission_unused = ['Unnamed: 0', 'comment_id', 'fullname', 'parent_id', 'title']
    comment_unused = ['Unnamed: 0', 'fullname', 'post_id', 'title']

    submissions = (
        data.drop(columns=submission_unused)
        .assign(created=lambda frame: frame['created'].apply(datetime_to_date))
        .dropna()
        .rename(columns={'post_id': 'id'})
        .assign(Organizations=lambda frame: frame['text'].apply(get_entities).apply(clean_orgs))
    )

    comments = (
        data.drop(columns=comment_unused)
        .assign(created=lambda frame: frame['created'].apply(datetime_to_date))
        .dropna()
        .assign(Organizations=lambda frame: frame['text'].apply(get_entities).apply(clean_orgs))
    )

    return [submissions, comments]

In [4]:
# Converts a timestamp value to its calendar date
def datetime_to_date(timestamp):
    """Parse ``timestamp`` with ``pd.to_datetime`` and return the date part of the result."""
    parsed = pd.to_datetime(timestamp)
    return parsed.date()

In [5]:
# Mentions that we are interested in: company names and ticker symbols.
# clean_orgs only consults orgs_dict for mentions that are listed here, so every
# orgs_dict key has to appear in this list as well (previously 'GameStop'/'Gamestop'
# were missing and 'C3.AI' did not match the 'c3.AI' dictionary key).
selected_orgs = [
    'HKG', 'Alibaba', 'AMC', 'Palantir Technologies', 'PLTR', 'FORD',
    'Lordstown Motors', 'RIDE', 'Virgin Galactic', 'SPCE', 'AI', 'C3.AI', 'c3.AI',
    'TSLA', 'GE', 'GME', 'GameStop', 'Gamestop', 'AAPL', 'Tesla', 'Apple',
    'General Electric', 'NOK', 'Nokia',
]
# Company name -> ticker symbol under which the mention is stored
orgs_dict = {
    "Alibaba": "HKG",
    "AMC": "AMC",
    "Palantir Technologies": "PLTR",
    "FORD": "FORD",
    "Lordstown Motors": "RIDE",
    "Virgin Galactic": "SPCE",
    "C3.AI": "AI",
    "c3.AI": "AI",
    "Tesla": "TSLA",
    "General Electric": "GE",
    "Apple": "AAPL",
    "GameStop": "GME",
    "Gamestop": "GME",
    "Nokia": "NOK",
}


In [6]:
# returning a list of mentioned tickers
def clean_orgs(organizations, selected=None, mapping=None):
    """Reduce recognised entities to a duplicate-free list of tracked tickers.

    Parameters
    ----------
    organizations : iterable of str
        Entity names found in one text.
    selected : container of str, optional
        Names that are kept; everything else is discarded. Defaults to the
        notebook-level ``selected_orgs``.
    mapping : dict, optional
        Translation from a kept name to its ticker; names without an entry are
        kept unchanged. Defaults to the notebook-level ``orgs_dict``.

    Returns
    -------
    list of str
        Each ticker once, in order of first appearance.
    """
    if selected is None:
        selected = selected_orgs
    if mapping is None:
        mapping = orgs_dict

    tickers = (mapping.get(org, org) for org in organizations if org in selected)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the result
    # is deterministic (a set round-trip yields an arbitrary order for strings).
    return list(dict.fromkeys(tickers))

In [7]:
# Quick check: a selected company name is translated to its ticker via orgs_dict
clean_orgs(['Virgin Galactic'])

['SPCE']

## A Look at the mentions

In [None]:
# Creates a list of mentioned entities
data['Organizations'] = data['text'].apply(get_entities)
orgs = data['Organizations'].to_list()
orgs_flat = [org for sublist in orgs for org in sublist] # Pulls out entities from the nested lists in orgs => new flat list
# Show the 20 most frequently mentioned ORGs (Counter comes from the import cell at the top)
org_freq = Counter(orgs_flat)
org_freq.most_common(20)

## Transform and save the data to database

In [8]:
# Run the transformation once and unpack its two results: submissions first, comments second
dfs = transform_reddit_data(data)
submissions, comments = dfs

In [None]:
# NOTE(review): insert_submissions and insert_comments are defined in cells further
# down in this notebook; on a fresh kernel those cells must be run before this one.
# SUBMISSIONS subset
# NOTE(review): the slice is assigned to df but never used - the full frame is inserted.
df = submissions[1:10]
insert_submissions(submissions)
#COMMENTS subset
# NOTE(review): same here - df_c is never used and all comments are inserted.
df_c = comments[1:29]
#df_c
insert_comments(comments)

#insert_c(df_c)

connection/replies/comment/gz8lz7o/gz8ljj3
connection/replies/submission/gz8lz8g/njp6ow
connection/replies/submission/gz8m00o/njfshj
connection/replies/comment/gz8m0b0/gz8kqap
connection/replies/comment/gz8m0bm/gz8ltpb
connection/replies/submission/gz8m0o0/njp9qf
connection/replies/comment/gz8m0q4/gz8lqpt
connection/replies/comment/gz8m0qi/gz72pqq
connection/replies/comment/gz8m0y1/gz8lssq
connection/replies/comment/gz8m10q/gz8l2qq
connection/replies/comment/gz8m17w/gz8lvyz
connection/replies/comment/gz8m18z/gz8lx0l
connection/replies/submission/gz8m192/njfshj
connection/replies/comment/gz8m1da/gz8l2m2
connection/replies/submission/gz8m1fw/njp9qf
connection/replies/comment/gz8m1p8/gz8lwpc
connection/replies/comment/gz8m1s4/gz8lyl6
connection/replies/submission/gz8m1tz/njfshj
connection/replies/comment/gz8m1y2/gz8lpmh
connection/replies/comment/gz8m1z0/gz8ktkz
connection/replies/comment/gz8m2ek/gz8lumf
connection/replies/submission/gz8m2j5/njfshj
connection/replies/comment/gz8m2kf/gz8ek

In [10]:
# Base address that insert_submissions / insert_comments send their POST requests to.
# The trailing slash is required: request paths are appended by plain concatenation.
url = 'http://localhost:5050/'

In [149]:
def insert_submissions(dataframe: pd.DataFrame, base_url=None, session=None):
    """Create a submission node per row and a 'mentions' relationship per organization.

    Every row must provide 'subreddit', 'id', 'created' (an object offering
    ``strftime``) and 'Organizations' (an iterable of strings). All requests are
    HTTP POSTs; a response whose status code is not 200 is reported with
    ``print`` and processing continues with the next request.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Rows to insert.
    base_url : str, optional
        Prefix of every request URL, including the trailing slash. Defaults to
        the notebook-level ``url``.
    session : object with a ``post(url)`` method, optional
        HTTP client to use, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module.
    """
    if base_url is None:
        base_url = url
    if session is None:
        session = requests

    for _, row in dataframe.iterrows():
        submission_id = row['id']

        # create submission nodes
        node_path = 'submission/' + row['subreddit'] + '/' + row['created'].strftime('%m-%d-%Y') + '/' + submission_id
        response = session.post(base_url + node_path)
        if response.status_code != 200:
            print ('submission id '+ submission_id + ' failed insertion')

        # create mentions relationships
        for org in row['Organizations']:
            mention_path = 'connection/mentions/submission/' + submission_id + '/' + org
            response = session.post(base_url + mention_path)
            if response.status_code != 200:
                print ('mentions between '+ submission_id + ' and ' + org + ' failed creation')

In [154]:
def insert_comments(dataframe: pd.DataFrame, base_url=None, session=None):
    """Create a comment node per row plus its 'mentions' and 'replies' relationships.

    Every row must provide 'subreddit', 'comment_id', 'parent_id' (a string such
    as ``'t1_<id>'`` or ``'t3_<id>'``), 'created' (an object offering
    ``strftime``) and 'Organizations' (an iterable of strings). All requests are
    HTTP POSTs; a response whose status code is not 200 is reported with
    ``print`` and processing continues.

    A 't1' prefix on parent_id creates a reply to a comment, a 't3' prefix a
    reply to a submission. For any other prefix no replies relationship is
    created and a message is printed instead.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Rows to insert.
    base_url : str, optional
        Prefix of every request URL, including the trailing slash. Defaults to
        the notebook-level ``url``.
    session : object with a ``post(url)`` method, optional
        HTTP client to use, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module.
    """
    if base_url is None:
        base_url = url
    if session is None:
        session = requests

    # parent_id prefix -> kind of node the comment replies to
    reply_targets = {'t1': 'comment', 't3': 'submission'}

    for _, row in dataframe.iterrows():
        comment_id = row['comment_id']
        parent_id = row['parent_id']

        # create comment nodes
        node_path = ('comment/' + row['subreddit'] + '/' + row['created'].strftime('%m-%d-%Y')
                     + '/' + comment_id + '/' + parent_id)
        response = session.post(base_url + node_path)
        if response.status_code != 200:
            print ('comment id '+ comment_id + ' failed insertion')

        # create mentions relationships
        for org in row['Organizations']:
            mention_path = 'connection/mentions/comment/' + comment_id + '/' + org
            response = session.post(base_url + mention_path)
            if response.status_code != 200:
                print ('mention between '+ comment_id + ' and ' + org + ' failed creation')

        # create replies relationships
        target_kind = reply_targets.get(parent_id[:2])
        if target_kind is None:
            # Without this guard the request would reuse the URL built for the previous
            # row (or fail with a NameError on the very first row).
            print ('replies relationship for ' + comment_id + ' skipped: unknown parent_id ' + parent_id)
            continue
        p_id = parent_id[3:]  # strip the 'tN_' prefix
        reply_path = 'connection/replies/' + target_kind + '/' + comment_id + '/' + p_id
        response = session.post(base_url + reply_path)
        if response.status_code != 200:
            print ('replies relationship between '+ comment_id + ' and ' + p_id + ' failed creation')