In [4]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [5]:
tweets_main = pd.read_csv(f'../datasets/tweets_full.csv')
analyzer = SentimentIntensityAnalyzer()

In [74]:
company = "GOOGL"
tweets = tweets_main[tweets_main["ticker_symbol"] == company]

In [75]:
tweets = tweets.sort_values("post_date")
tweets.head()

Unnamed: 0,tweet_id,post_date,body,comment_num,retweet_num,like_num,ticker_symbol
30,550447998577426433,2015-01-01 01:26:44,2014 The Year in Review (Part II - THE END) ht...,0,2,2,GOOGL
59,550461555423584257,2015-01-01 02:20:36,Prediction: $TWTR $GRPN $YELP are acquired as ...,0,0,1,GOOGL
62,550462670353494016,2015-01-01 02:25:02,Prediction: PayPal post-spinoff and $PAY are n...,0,0,0,GOOGL
72,550466497655877633,2015-01-01 02:40:14,Trailing Stop taken out on my $GOOGL #trade ta...,0,0,0,GOOGL
87,550471417754845184,2015-01-01 02:59:47,#SENTISHIFTUP $X $T $GOOGL $AMRN $UPIP $CNAT $...,0,0,0,GOOGL


In [76]:
def clean_body(col):
    import re

    # usun urle
    # usun hashtagi
    pattern_url = r"https?://[a-z.]+/[?a-z0-9./]+"
    pattern_hash = r"#"
    col = re.sub(pattern_url, "", col, flags=re.I)
    col = re.sub(pattern_hash, "", col, flags=re.I)

    return col


def convert_dict_keyval_to_col(dict, key):
    new_col = []
    for i in dict:
        new_col.append(i[key])
    return new_col


def calc_sentiment(df, other_cols):
    df = df.copy()
    sent = df["body"].apply(analyzer.polarity_scores)

    sentiment_val = pd.DataFrame()
    sentiment_val["pos"] = convert_dict_keyval_to_col(sent, "pos")
    sentiment_val["neu"] = convert_dict_keyval_to_col(sent, "neu")
    sentiment_val["neg"] = convert_dict_keyval_to_col(sent, "neg")
    sentiment_val["comp"] = convert_dict_keyval_to_col(sent, "compound")

    for col in other_cols:
        sentiment_val[col] = df[col]

    return sentiment_val

In [77]:
tweets_clean = tweets['body'].apply(clean_body)

In [78]:
tweets_has_comm = tweets[tweets["comment_num"] > 0].reset_index(drop=True)
tweets_has_rtwts = tweets[tweets["retweet_num"] > 0].reset_index(drop=True)
tweets_has_likes = tweets[tweets["like_num"] > 0].reset_index(drop=True)
tweets_has_all = tweets[
    (tweets["like_num"] > 0) & (tweets["retweet_num"] > 0) & (tweets["comment_num"] > 0)
].reset_index(drop=True)


In [79]:
is_tweet_has_all = tweets.tweet_id.isin(tweets_has_all.tweet_id)
tweets_rest = tweets[~is_tweet_has_all].reset_index()

In [80]:
assert len(tweets_rest)+len(tweets_has_all) == len(tweets)

In [81]:
tweets_has_comm.head()


Unnamed: 0,tweet_id,post_date,body,comment_num,retweet_num,like_num,ticker_symbol
0,550759232455585792,2015-01-01 22:03:28,"@downsidecapital $FB can't afford it, so $GOOG...",1,0,1,GOOGL
1,550847677227749376,2015-01-02 03:54:55,@traderstewie what's ur trade direction call o...,2,0,0,GOOGL
2,551037180706295809,2015-01-02 16:27:56,@sassyoptions $aapl $googl not trying to be be...,1,0,0,GOOGL
3,551105886337654784,2015-01-02 21:00:57,@downsidecapital @taralach From a BS perspect...,1,0,0,GOOGL
4,551211352703135744,2015-01-03 04:00:02,2015 Stocks to Love and Hate http://optionmill...,1,2,9,GOOGL


In [82]:
has_comms = calc_sentiment(tweets_has_comm, ["post_date", "ticker_symbol"])


In [83]:
has_rtwts = calc_sentiment(tweets_has_rtwts, ["post_date", "ticker_symbol"])


In [84]:
has_likes = calc_sentiment(tweets_has_likes, ["post_date", "ticker_symbol"])


In [85]:
has_all = calc_sentiment(tweets_has_all, ["post_date", "ticker_symbol"])


In [86]:
rest = calc_sentiment(tweets_rest, ["post_date", "ticker_symbol"])

## Grouping

1. Group by hour
2. Merge tables using proper naming 
3. Save to csv

In [87]:
# round down
def round_down_hour(col):
    try:
        date, time = col.split(" ")
        time = f"{time[0:2]}:00:00"
    except:
        print(col)
        raise Exception
    return date + " " + time


def aggregate(df, name):
    aggregated = pd.pivot_table(
        df,
        index=["post_date"],
        aggfunc=["max", "min", "std", "mean", "median", "count"],
    ).reset_index()

    aggregated.columns = [f"{i}_{j}_{name}" for i, j in aggregated.columns]
    # clean columns
    aggregated.rename(
        columns={
            f"post_date__{name}": f"post_date",
            f"count_pos_{name}": f"count_{name}",
        },
        inplace=True,
    )

    # drop useless
    aggregated = aggregated.drop(
        [
            f"count_neg_{name}",
            f"count_neu_{name}",
            f"count_comp_{name}",
            f"min_ticker_symbol_{name}",
            f"count_ticker_symbol_{name}",
            f"max_ticker_symbol_{name}",
        ],
        axis=1,
    )

    return aggregated


In [88]:
has_comms.post_date = has_comms.post_date.apply(round_down_hour)
has_rtwts.post_date = has_rtwts.post_date.apply(round_down_hour)
has_likes.post_date = has_likes.post_date.apply(round_down_hour)
has_all.post_date = has_all.post_date.apply(round_down_hour)
rest.post_date = rest.post_date.apply(round_down_hour)

In [89]:
comms_agg = aggregate(has_comms, "comms")
rtwts_agg = aggregate(has_rtwts, "rtwts")
likes_agg = aggregate(has_likes, "likes")
all_agg = aggregate(has_all, "all")
rest_agg = aggregate(rest, "rest")

In [90]:
merged_sentiment = comms_agg
for df in [rtwts_agg, likes_agg, all_agg, rest_agg]:
    merged_sentiment = merged_sentiment.merge(df, how="outer", on="post_date")

In [91]:
merged_sentiment.to_csv(f"../datasets/sentiment_leveled/{company}.csv", index=False)