In [125]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [126]:
# Load the complete tweet table (all tickers) and create the VADER analyzer
# whose polarity_scores() is applied to tweet bodies further down.
tweets_main = pd.read_csv(f'../datasets/tweets_full.csv')
analyzer = SentimentIntensityAnalyzer()

In [141]:
# Ticker processed by this run; it also names the output CSV files at the end.
company = "GOOG"
# Keep only this ticker's tweets as an independent copy with a fresh 0..n-1 index.
tweets = tweets_main[tweets_main["ticker_symbol"] == company].copy().reset_index(drop=True)

In [142]:
def clean_body(col):
    """Strip URLs and hash signs from a single piece of text.

    Args:
        col: The text to clean (one string, despite the parameter name).

    Returns:
        The text with every URL matched by the pattern below removed and every
        "#" character deleted. Only the "#" is dropped; the word that followed
        it stays in place.
    """
    import re

    cleaned = col
    # Order matters only for readability: first remove URLs, then the hash signs.
    for pattern in (r"https?://[a-z.]+/[?a-z0-9./]+", r"#"):
        cleaned = re.compile(pattern, flags=re.I).sub("", cleaned)
    return cleaned


def convert_dict_keyval_to_col(dict, key):
    """Collect one key's value from every mapping in an iterable.

    Args:
        dict: Iterable of mappings (despite the name, not a single dict).
        key: Key looked up in each mapping.

    Returns:
        A list holding ``mapping[key]`` for each mapping, in iteration order.

    Raises:
        KeyError: If any mapping lacks ``key``.
    """
    return [entry[key] for entry in dict]


def calc_weight(tweet):
    """Weight a tweet by engagement, with retweets acting as a multiplier.

    Args:
        tweet: Object exposing ``retweet_num``, ``like_num`` and ``comment_num``
            attributes (for example a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        ``1 + 0.05 * likes + 0.2 * comments`` when there are no retweets,
        otherwise that same value multiplied by the retweet count.
    """
    engagement = 1 + 0.05 * tweet.like_num + 0.2 * tweet.comment_num

    # Without retweets the multiplier would zero the weight, so use the base value.
    if tweet.retweet_num == 0:
        return engagement
    return tweet.retweet_num * engagement


def calc_weight_2(tweet):
    """Weight a tweet as a purely additive combination of its engagement counts.

    Args:
        tweet: Object exposing ``retweet_num``, ``like_num`` and ``comment_num``
            attributes (for example a row passed by ``DataFrame.apply(axis=1)``).

    Returns:
        ``1 + 0.5 * retweets + 0.05 * likes + 0.2 * comments``.
    """
    return (
        1
        + 0.5 * tweet.retweet_num
        + 0.05 * tweet.like_num
        + 0.2 * tweet.comment_num
    )


def explore_weights(df, col):
    """Print the heaviest tweet(s) and the distribution of a weight column.

    Both printouts are computed from ``df``; the function does not read any
    notebook-level frame.

    Args:
        df: DataFrame containing a ``body`` column and the column named ``col``.
        col: Name of the weight column to inspect.

    Returns:
        None. Two blocks are printed: the ``body`` of every row whose weight
        equals the column maximum, then ``describe()`` of the column with
        percentiles at 5 % steps from 5 % to 95 %.
    """
    weights = df[col]

    # Series.max() skips NaN; the builtin max() gives order-dependent results
    # when NaN is present.
    print(df.loc[weights == weights.max(), "body"])

    # round() removes float noise such as 0.15000000000000002 from the list.
    percentiles = [round(0.05 * step, 2) for step in range(1, 20)]
    print(weights.describe(percentiles=percentiles))


def percentage_weights(df, col):
    """Print what percentage of rows falls into each weight bucket.

    Buckets are ``weight == 1``, ``1 < weight < 3`` and ``3 <= weight < 10``.
    All percentages are relative to ``len(df)``; an empty frame reports 0.0
    for every bucket instead of dividing by zero.

    Args:
        df: DataFrame containing the column named ``col``.
        col: Name of the weight column.

    Returns:
        None. Four lines are printed: one per bucket and their total.
    """
    total = len(df)
    weights = df[col]

    def share(mask):
        # Percentage of rows selected by a boolean mask.
        return 100 * int(mask.sum()) / total if total else 0.0

    base = share(weights == 1)
    one_three = share((weights > 1) & (weights < 3))
    # The lower bound is inclusive so a weight of exactly 3 is counted once.
    # NOTE(review): the label below says "3 to 5" but the upper bound is 10;
    # confirm which of the two is intended.
    three_five = share((weights >= 3) & (weights < 10))

    print("Worth 1 tweet:", round(base, 3))
    print("Worth 1 to 3 tweets:", round(one_three, 3))
    print("Worth 3 to 5 tweets:", round(three_five, 3))
    print("Total: ", round(base, 3) + round(one_three, 3) + round(three_five, 3))


In [143]:
def calc_sentiment(df, to_preserve):
    """Score every tweet body and return the scores with selected columns.

    ``analyzer.polarity_scores`` (the notebook-level analyzer) is applied to
    each value of ``df["body"]``; the ``pos``, ``neu``, ``neg`` and
    ``compound`` entries of each result become the columns ``pos``, ``neu``,
    ``neg`` and ``comp``.

    Args:
        df: DataFrame with a ``body`` column plus every column listed in
            ``to_preserve``. It is only read, never modified.
        to_preserve: Iterable of column names copied from ``df`` into the result.

    Returns:
        A new DataFrame with a default 0..n-1 index, the four score columns
        and the preserved columns, row-aligned with ``df`` by position.
    """
    scores = df["body"].apply(analyzer.polarity_scores)

    # Build all score columns in one pass; `columns=` also fixes their order
    # and keeps the columns present when `df` is empty.
    sentiment_val = pd.DataFrame(
        scores.tolist(), columns=["pos", "neu", "neg", "compound"]
    ).rename(columns={"compound": "comp"})

    for col in to_preserve:
        # Copy by position: assigning the Series itself would align on index
        # labels and yield NaN/misplaced values when `df` has a non-default index.
        sentiment_val[col] = df[col].to_numpy()

    return sentiment_val

In [144]:
# Compute both weighting schemes row by row and store them next to each tweet.
tweets["weight"] = tweets.apply(calc_weight, axis=1)
tweets["weight2"] = tweets.apply(calc_weight_2, axis=1)

In [145]:
# Score every tweet body and keep the columns needed later for
# hourly grouping (post_date) and re-weighting (weight, weight2).
sentiment = calc_sentiment(
    tweets,
    [
        "post_date",
        "ticker_symbol",
        "weight",
        "weight2",
    ],
)


In [146]:
# round down
def round_down_hour(col):
    """Truncate a ``"<date> <time>"`` timestamp string to the start of its hour.

    Args:
        col: A single timestamp string made of a date and a time separated by
            exactly one space, e.g. ``"2015-01-01 01:23:45"``.

    Returns:
        The same date followed by the first two characters of the time and
        ``":00:00"``, e.g. ``"2015-01-01 01:00:00"``.

    Raises:
        ValueError: If ``col`` is not a string or does not split into exactly
            two space-separated parts. The offending value is included in the
            message and the original error is chained as the cause.
    """
    try:
        date, time = col.split(" ")
    except (AttributeError, ValueError) as err:
        # AttributeError: not a string (e.g. NaN/None); ValueError: wrong number of parts.
        raise ValueError(
            f"cannot round down timestamp {col!r}: expected a '<date> <time>' string"
        ) from err
    return f"{date} {time[0:2]}:00:00"


def aggregate_simple(df):
    """Aggregate the numeric columns of ``df`` per ``post_date`` value.

    For every numeric column the max, min, std, mean, median and count are
    computed per ``post_date`` group. Result columns are named
    ``<stat>_<column>``. ``count_pos`` is renamed to ``count`` and the other
    sentiment counts (``count_neg``, ``count_neu``, ``count_comp``) are dropped.

    Non-numeric columns such as ``ticker_symbol`` are excluded before
    aggregating: std/mean/median are undefined for strings and raise
    ``TypeError`` on recent pandas versions.

    Args:
        df: DataFrame with a ``post_date`` column and the numeric columns to
            aggregate. It is not modified.

    Returns:
        A new DataFrame with one row per distinct ``post_date``.
    """
    stats = ["max", "min", "std", "mean", "median", "count"]

    # Index.difference() returns the names sorted, matching pivot_table's
    # own alphabetical column order.
    numeric_cols = list(
        df.select_dtypes(include="number").columns.difference(["post_date"])
    )

    aggregated = pd.pivot_table(
        df,
        index=["post_date"],
        values=numeric_cols,
        aggfunc=stats,
    ).reset_index()

    # Flatten the (stat, column) MultiIndex; the index column comes out of
    # reset_index() as ("post_date", ""), hence the trailing underscore.
    aggregated.columns = [f"{i}_{j}" for i, j in aggregated.columns]
    aggregated = aggregated.rename(
        columns={"post_date_": "post_date", "count_pos": "count"}
    )

    # One count per group is enough; errors="ignore" keeps this working when
    # one of the sentiment columns is absent from the input.
    return aggregated.drop(
        columns=["count_neg", "count_neu", "count_comp"], errors="ignore"
    )


def reweight(df, weight):
    """Scale the four sentiment columns by a weight column, in place.

    Args:
        df: DataFrame with ``pos``, ``neu``, ``neg`` and ``comp`` columns plus
            the column named ``weight``. It is modified in place, so pass a
            copy to keep the original values.
        weight: Name of the column holding the per-row multiplier.

    Returns:
        The same ``df`` object, with each sentiment column multiplied by the
        weight column.
    """
    for name in ("pos", "neu", "neg", "comp"):
        df[name] = df[name] * df[weight]
    return df


In [147]:
# Truncate every timestamp to its hour so tweets can be grouped per hour.
# This overwrites post_date in place; re-running the cell leaves already-rounded values unchanged.
sentiment["post_date"] = sentiment["post_date"].apply(round_down_hour)

In [148]:
# Scheme 1: scale the scores by "weight". reweight() mutates its argument,
# so it is given a copy to keep `sentiment` unweighted.
sentiment_w1 = reweight(sentiment.copy(), "weight")
# The weight columns have served their purpose and must not be aggregated.
sentiment_w1 = sentiment_w1.drop(["weight", "weight2"], axis=1)
sentiment_w1.head()


Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol
0,0.1056,0.9944,0.0,0.22253,2015-01-01 01:00:00,GOOG
1,0.0,0.654,0.346,-0.6486,2015-01-01 01:00:00,GOOG
2,1.16955,2.28045,0.0,2.155905,2015-01-01 01:00:00,GOOG
3,0.1932,0.8568,0.0,0.569415,2015-01-01 02:00:00,GOOG
4,0.244,0.756,0.0,0.4404,2015-01-01 02:00:00,GOOG


In [149]:
# Scheme 2: scale the scores by "weight2". reweight() mutates its argument,
# so it is given a copy to keep `sentiment` unweighted.
sentiment_w2 = reweight(sentiment.copy(), "weight2")
# The weight columns have served their purpose and must not be aggregated.
sentiment_w2 = sentiment_w2.drop(["weight", "weight2"], axis=1)
sentiment_w2.head()

Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol
0,0.1056,0.9944,0.0,0.22253,2015-01-01 01:00:00,GOOG
1,0.0,0.654,0.346,-0.6486,2015-01-01 01:00:00,GOOG
2,0.89835,1.75165,0.0,1.655985,2015-01-01 01:00:00,GOOG
3,0.1932,0.8568,0.0,0.569415,2015-01-01 02:00:00,GOOG
4,0.244,0.756,0.0,0.4404,2015-01-01 02:00:00,GOOG


In [150]:
# Per-post_date (hourly) statistics of the scheme-1 weighted scores.
agg_w1 = aggregate_simple(sentiment_w1)

In [151]:
# Preview the first rows of the scheme-1 aggregates.
agg_w1.head()

Unnamed: 0,post_date,max_comp,max_neg,max_neu,max_pos,min_comp,min_neg,min_neu,min_pos,std_comp,...,std_pos,mean_comp,mean_neg,mean_neu,mean_pos,median_comp,median_neg,median_neu,median_pos,count
0,2015-01-01 01:00:00,2.155905,0.346,2.28045,1.16955,-0.6486,0.0,0.654,0.0,1.435389,...,0.646914,0.576612,0.115333,1.309617,0.42505,0.22253,0.0,0.9944,0.1056,3
1,2015-01-01 02:00:00,0.569415,0.0,0.8568,0.244,0.4404,0.0,0.756,0.1932,0.091227,...,0.035921,0.504908,0.0,0.8064,0.2186,0.504908,0.0,0.8064,0.2186,2
2,2015-01-01 03:00:00,0.8065,0.237,1.0,0.313,-0.4767,0.0,0.687,0.0,0.622818,...,0.174551,0.273813,0.05925,0.802288,0.150962,0.382725,0.0,0.761075,0.145425,4
3,2015-01-01 04:00:00,0.0,0.231,1.0,0.0,-0.5106,0.0,0.769,0.0,0.361049,...,0.0,-0.2553,0.1155,0.8845,0.0,-0.2553,0.1155,0.8845,0.0,2
4,2015-01-01 05:00:00,0.4389,0.0,0.874,0.126,0.4389,0.0,0.874,0.126,,...,,0.4389,0.0,0.874,0.126,0.4389,0.0,0.874,0.126,1


In [152]:
# Per-post_date (hourly) statistics of the scheme-2 weighted scores.
agg_w2 = aggregate_simple(sentiment_w2)

In [153]:
# Preview the first rows of the scheme-2 aggregates.
agg_w2.head()

Unnamed: 0,post_date,max_comp,max_neg,max_neu,max_pos,min_comp,min_neg,min_neu,min_pos,std_comp,...,std_pos,mean_comp,mean_neg,mean_neu,mean_pos,median_comp,median_neg,median_neu,median_pos,count
0,2015-01-01 01:00:00,1.655985,0.346,1.75165,0.89835,-0.6486,0.0,0.654,0.0,1.16367,...,0.491026,0.409972,0.115333,1.13335,0.33465,0.22253,0.0,0.9944,0.1056,3
1,2015-01-01 02:00:00,0.569415,0.0,0.8568,0.244,0.4404,0.0,0.756,0.1932,0.091227,...,0.035921,0.504908,0.0,0.8064,0.2186,0.504908,0.0,0.8064,0.2186,2
2,2015-01-01 03:00:00,1.12995,0.237,1.5,0.42935,-0.4767,0.0,0.687,0.0,0.735245,...,0.219499,0.364938,0.05925,1.017662,0.185588,0.40325,0.0,0.941825,0.1565,4
3,2015-01-01 04:00:00,0.0,0.231,1.0,0.0,-0.5106,0.0,0.769,0.0,0.361049,...,0.0,-0.2553,0.1155,0.8845,0.0,-0.2553,0.1155,0.8845,0.0,2
4,2015-01-01 05:00:00,0.4389,0.0,0.874,0.126,0.4389,0.0,0.874,0.126,,...,,0.4389,0.0,0.874,0.126,0.4389,0.0,0.874,0.126,1


In [154]:
# Write one CSV per weighting scheme, named after the ticker; the row index is omitted.
agg_w1.to_csv(f"../datasets/v2/sentiment_weighted/w1/{company}.csv", index=False)
agg_w2.to_csv(f"../datasets/v2/sentiment_weighted/w2/{company}.csv", index=False)