In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
tweets_main = pd.read_csv(f'../datasets/tweets_full.csv')
analyzer = SentimentIntensityAnalyzer()

In [189]:
company = "TSLA"
tweets = tweets_main[tweets_main["ticker_symbol"] == company].copy().reset_index(drop=True)

In [3]:
def clean_body(col):
    import re

    # usun urle
    # usun hashtagi
    pattern_url = r"https?://[a-z.]+/[?a-z0-9./]+"
    pattern_hash_dolla = r"[\$#][a-z.]+"
    col = re.sub(pattern_url, "", col, flags=re.I)
    col = re.sub(pattern_hash_dolla, "", col, flags=re.I)

    return col


def convert_dict_keyval_to_col(dict, key):
    new_col = []
    for i in dict:
        new_col.append(i[key])
    return new_col


def calc_weight(tweet):
    retweets = tweet.retweet_num
    likes = tweet.like_num
    comms = tweet.comment_num
    weight = 0
    if tweet.retweet_num == 0:
        weight = 1 + 0.05 * likes + 0.2 * comms
    else:
        weight = retweets * (1 + 0.05 * likes + 0.2 * comms)
    return weight


def calc_weight_2(tweet):
    retweets = tweet.retweet_num
    likes = tweet.like_num
    comms = tweet.comment_num

    weight = 1 + 0.5 * retweets + 0.05 * likes + 0.2 * comms
    return weight


def explore_weights(df, col):
    print(df[df[col] == max(df[col])].body)
    print(tweets[col].describe(percentiles=[0.05 * (i + 1) for i in range(19)]))


def percentage_weights(df, col):
    base = round(len(df.loc[df[col] == 1]) / len(df), 3) * 100
    one_three = len(df.loc[(df[col] > 1) & (df[col] < 3)]) / len(tweets) * 100
    three_five = len(df.loc[(df[col] > 3) & (df[col] < 10)]) / len(tweets) * 100

    print("Worth 1 tweet:", round(base, 3))
    print("Worth 1 to 3 tweets:", round(one_three, 3))
    print("Worth 3 to 5 tweets:", round(three_five, 3))
    print("Total: ", round(base, 3) + round(one_three, 3) + round(three_five, 3))


In [14]:
test = pd.DataFrame(
    {"body": tweets_main.body.apply(clean_body)}
)


In [15]:
test["len"] = test.body.str.len()

In [23]:
too_long = test.loc[test["len"] > 280, "body"].iloc[5]

In [24]:
too_long

'RT @businessEMA: U.S Patent: 8927176Current collector plates of bulk-solidifying amorphous alloys $AAPL $LQMT -Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=1&f=G&l=50&d=PTXT&p=1&S1=%22Crucible+Intellectual+Property%22&OS=%22Crucible+Intellectual+Property%22&RS=%22Crucible+Intellectual+Property%22…'

In [None]:
to_clean = '''RT @fakeusemame1: @fakeusemame2 Just bought
30.6 BTC from https://fakeexchange.com &amp; I am
hating the crash 😟 :(
Hope it recovers so...'''

In [191]:
def calc_sentiment(df, to_preserve):
    df = df.copy()
    
    sent = df["body"].apply(analyzer.polarity_scores)

    sentiment_val = pd.DataFrame()
    sentiment_val["pos"] = convert_dict_keyval_to_col(sent, "pos")
    sentiment_val["neu"] = convert_dict_keyval_to_col(sent, "neu")
    sentiment_val["neg"] = convert_dict_keyval_to_col(sent, "neg")
    sentiment_val["comp"] = convert_dict_keyval_to_col(sent, "compound")

    for col in to_preserve:
        sentiment_val[col] = df[col]

    return sentiment_val

In [192]:
tweets["weight"] = tweets.apply(calc_weight, axis=1)
#tweets["weight2"] = tweets.apply(calc_weight_2, axis=1)

In [193]:
tweets.columns

Index(['tweet_id', 'post_date', 'body', 'comment_num', 'retweet_num',
       'like_num', 'ticker_symbol', 'weight'],
      dtype='object')

In [194]:
sentiment = calc_sentiment(
    tweets,
    to_preserve=[
        "post_date",
        "ticker_symbol",
        "weight",
        # "weight2",
        "comment_num",
        "retweet_num",
        "like_num",
    ],
)


In [195]:
sentiment.columns

Index(['pos', 'neu', 'neg', 'comp', 'post_date', 'ticker_symbol', 'weight',
       'comment_num', 'retweet_num', 'like_num'],
      dtype='object')

In [196]:
# round down
def round_down_hour(col):
    try:
        date, time = col.split(" ")
        time = f"{time[0:2]}:00:00"
    except:
        print(col)
        raise Exception
    return date + " " + time


def aggregate_simple(df):
    aggregated = pd.pivot_table(
        df,
        index=["post_date"],
        aggfunc={
            "pos": ["max", "min", "std", "mean", "median", "count"],
            "neu": ["max", "min", "std", "mean", "median"],
            "neg": ["max", "min", "std", "mean", "median"],
            "comp": ["max", "min", "std", "mean", "median"],
            "comment_num": ["sum"],
            "retweet_num": ["sum"],
            "like_num": ["sum"],
        },
    ).reset_index()

    aggregated.columns = [f"{i}_{j}" for i, j in aggregated.columns]
    # clean columns
    aggregated.rename(
        columns={
            f"post_date_": f"post_date",
            f"pos_count": f"count",
        },
        inplace=True,
    )

    return aggregated


def reweight(df, weight):
    cols = ["pos", "neu", "neg", "comp"]
    for col in cols:
        df[col] *= df[weight]
    return df


def get_total_tweets(df):
    df["post_date"] = df["post_date"].apply(round_down_hour)
    grouped = pd.pivot_table(
        df,
        index=["post_date"],
        aggfunc=["count"],
    ).reset_index()
    grouped.columns = [f"{i}_{j}" for i, j in grouped.columns]
    grouped.rename(
        columns={
            f"post_date_": f"post_date",
            f"count_body": f"total_count",
        },
        inplace=True,
    )
    return grouped[["post_date", "total_count"]]


def get_relative_count(df, df_total):

    mrg = df.merge(df_total, how="left", on="post_date")
    df["relative_count"] = mrg["count"] / mrg["total_count"]
    return df


In [197]:
total_tweets = get_total_tweets(tweets_main)
total_tweets.head()

In [198]:
sentiment["post_date"] = sentiment["post_date"].apply(round_down_hour)

In [199]:
sentiment_w1 = reweight(sentiment.copy(), "weight")
#sentiment_w1 = sentiment_w1.drop(["weight", "weight2"], axis=1)
sentiment_w1 = sentiment_w1.drop(["weight"], axis=1)
sentiment_w1.head()


Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol,comment_num,retweet_num,like_num
0,0.0,1.05,0.0,0.0,2015-01-01 01:00:00,TSLA,0,0,1
1,0.0,1.05,0.0,0.0,2015-01-01 01:00:00,TSLA,0,0,1
2,0.0,1.05,0.0,0.0,2015-01-01 01:00:00,TSLA,0,0,1
3,0.0,1.05,0.0,0.0,2015-01-01 01:00:00,TSLA,0,0,1
4,0.0,1.05,0.0,0.0,2015-01-01 01:00:00,TSLA,0,0,1


In [200]:
#sentiment_w2 = reweight(sentiment.copy(), "weight2")
#sentiment_w2 = sentiment_w2.drop(["weight", "weight2"], axis=1)
#sentiment_w2.head()

In [201]:
sentiment_no_weight = sentiment.drop(["weight"], axis=1)

In [202]:
agg_w1 = aggregate_simple(sentiment_w1)
agg_w1 = get_relative_count(agg_w1, total_tweets)
agg_w1 = agg_w1.fillna(0)

In [203]:
agg_w1.head()

Unnamed: 0,post_date,comment_num_sum,comp_max,comp_mean,comp_median,comp_min,comp_std,like_num_sum,neg_max,neg_mean,...,neu_min,neu_std,count,pos_max,pos_mean,pos_median,pos_min,pos_std,retweet_num_sum,relative_count
0,2015-01-01 01:00:00,0,0.22253,-0.000377,0.0,-0.2263,0.105793,10,0.16,0.016,...,0.84,0.066781,10,0.1056,0.01056,0.0,0.0,0.033394,0,0.2
1,2015-01-01 02:00:00,0,0.5106,0.026872,0.0,-0.5719,0.3558,3,0.15,0.025,...,0.708,0.127404,6,0.292,0.071033,0.0,0.0,0.12083,0,0.157895
2,2015-01-01 03:00:00,0,0.8065,0.397575,0.3919,0.0,0.343283,0,0.0,0.0,...,0.687,0.134869,4,0.313,0.1875,0.2185,0.0,0.134869,0,0.137931
3,2015-01-01 04:00:00,0,0.4404,0.0396,0.0396,-0.3612,0.566817,0,0.185,0.0925,...,0.734,0.057276,2,0.266,0.133,0.133,0.0,0.18809,0,0.066667
4,2015-01-01 05:00:00,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0,0.041667


In [204]:
sentiment[sentiment["post_date"] == "2015-01-01 05:00:00"].head()

Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol,weight,comment_num,retweet_num,like_num
22,0.0,1.0,0.0,0.0,2015-01-01 05:00:00,TSLA,1.0,0,0,0


In [205]:
agg_no_w = aggregate_simple(sentiment_no_weight)
agg_no_w = get_relative_count(agg_no_w, total_tweets)
# std is NaN when count is 1
agg_no_w = agg_no_w.fillna(0)

In [207]:
#agg_w1.to_csv(f"../datasets/v2/sentiment_weighted/w1/{company}.csv", index=False)
#agg_w2.to_csv(f"../datasets/v2/sentiment_weighted/w2/{company}.csv", index=False)

In [208]:
agg_w1.to_csv(f"../datasets/v3/more_cols/w1/{company}.csv", index=False)
agg_no_w.to_csv(f"../datasets/v3/more_cols/no_weight/{company}.csv", index=False)