In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
tweets_main = pd.read_csv(f'../datasets/tweets_full.csv')
analyzer = SentimentIntensityAnalyzer()

In [243]:
company = "GOOGL"
tweets = tweets_main[tweets_main["ticker_symbol"] == company].copy().reset_index(drop=True)

In [244]:
def clean_body_v2(col):
    import re

    # usun urle
    # usun hashtagi
    # usun RT @username
    pattern_url = r"https?://[a-z0-9.]+/[?a-z0-9./-|]+"
    pattern_url_weak = r"https?://[a-z0-9.]+"
    pattern_www = r"www.[a-z0-9.]+/[?a-z0-9./-|]+"
    pattern_www_weak = r"www.[a-z0-9.]+"
    pattern_hash_dolla = r"[\$#][a-z0-9.]+"
    pattern_retweet = r"RT @[a-z0-9\S.]+"
    pattern_username = r"@[a-z0-9\S.]+"
    pattern_scraper_artifacts = r"[%-=_][\w+-\?\&|]+"

    col = col.str.replace(pattern_url, "", col, flags=re.I)
    col = col.str.replace(pattern_url_weak, "", col, flags=re.I)
    col = col.str.replace(pattern_www, "", col, flags=re.I)
    col = col.str.replace(pattern_www_weak, "", col, flags=re.I)
    col = col.str.replace(pattern_hash_dolla, "", col, flags=re.I)
    col = col.str.replace(pattern_retweet, "", col, flags=re.I)
    col = col.str.replace(pattern_username, "", col, flags=re.I)
    col = col.str.replace(pattern_scraper_artifacts, "", col, flags=re.I)
    col = col.str.replace("…", "", col, flags=re.I)


    return " ".join(col.split())

def clean_body(col):
    import re

    # usun urle
    # usun hashtagi
    # usun RT @username
    pattern_url = r"https?://[a-z.]+/[?a-z0-9./]+"
    pattern_hash_dolla = r"[\$#][a-z.]+"
    #pattern_retweet = r"RT @[a-z0-9\S.]+"
    #pattern_username = r"@[a-z0-9\S.]+"
    #pattern_scraper_artifacts = r"[%-=_][\w+-\?\&|]+"

    col = col.str.replace(pattern_url, "", col, flags=re.I)
    col = col.str.replace(pattern_hash_dolla, "", col, flags=re.I)
    #col = col.str.replace(pattern_retweet, "", col, flags=re.I)
    #col = col.str.replace(pattern_username, "", col, flags=re.I)
    #col = col.str.replace(pattern_scraper_artifacts, "", col, flags=re.I)
    #col = col.str.replace("…", "", col, flags=re.I)

    return col

def convert_dict_keyval_to_col(dict, key):
    new_col = []
    for i in dict:
        new_col.append(i[key])
    return new_col


def calc_weight(tweet):
    retweets = tweet.retweet_num
    likes = tweet.like_num
    comms = tweet.comment_num
    weight = 0
    if tweet.retweet_num == 0:
        weight = 1 + 0.05 * likes + 0.2 * comms
    else:
        weight = retweets * (1 + 0.05 * likes + 0.2 * comms)
    return weight


def calc_weight_2(tweet):
    retweets = tweet.retweet_num
    likes = tweet.like_num
    comms = tweet.comment_num

    weight = 1 + 0.5 * retweets + 0.05 * likes + 0.2 * comms
    return weight


def explore_weights(df, col):
    print(df[df[col] == max(df[col])].body)
    print(tweets[col].describe(percentiles=[0.05 * (i + 1) for i in range(19)]))


def percentage_weights(df, col):
    base = round(len(df.loc[df[col] == 1]) / len(df), 3) * 100
    one_three = len(df.loc[(df[col] > 1) & (df[col] < 3)]) / len(tweets) * 100
    three_five = len(df.loc[(df[col] > 3) & (df[col] < 10)]) / len(tweets) * 100

    print("Worth 1 tweet:", round(base, 3))
    print("Worth 1 to 3 tweets:", round(one_three, 3))
    print("Worth 3 to 5 tweets:", round(three_five, 3))
    print("Total: ", round(base, 3) + round(one_three, 3) + round(three_five, 3))


In [245]:
def clean_body_v3(col):
    import re

    # usun urle
    # usun hashtagi
    # usun RT @username
    pattern_url = r"https?://[a-z0-9.]+/[?a-z0-9./-|]+"
    pattern_url_weak = r"https?://[a-z0-9.]+"
    pattern_www = r"www.[a-z0-9.]+/[?a-z0-9./-|]+"
    pattern_www_weak = r"www.[a-z0-9.]+"
    pattern_hash_dolla = r"[\$#][a-z0-9.]+"
    pattern_retweet = r"RT @[a-z0-9\S.]+"
    pattern_username = r"@[a-z0-9\S.]+"
    pattern_scraper_artifacts = r"[%-=_][\w+-\?\&|]+"

    col = col.str.replace(re.compile(pattern_url, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_url_weak, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_www, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_www_weak, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_hash_dolla, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_retweet, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_username, flags=re.I), "", regex=True)
    col = col.str.replace(re.compile(pattern_scraper_artifacts, flags=re.I), "", regex=True)
    col = col.str.replace("…", "")

    return col.str.split().str.join(" ")

In [246]:
def calc_sentiment(df, to_preserve):
    df = df.copy()
    
    sent = df["body"].apply(analyzer.polarity_scores)

    sentiment_val = pd.DataFrame()
    sentiment_val["pos"] = convert_dict_keyval_to_col(sent, "pos")
    sentiment_val["neu"] = convert_dict_keyval_to_col(sent, "neu")
    sentiment_val["neg"] = convert_dict_keyval_to_col(sent, "neg")
    sentiment_val["comp"] = convert_dict_keyval_to_col(sent, "compound")

    for col in to_preserve:
        sentiment_val[col] = df[col]

    return sentiment_val

In [247]:
clean = tweets.copy()
clean.body = clean_body_v3(clean.body)


In [248]:
sentiment = calc_sentiment(
    tweets,
    to_preserve=[
        "post_date",
        "ticker_symbol",
        "comment_num",
        "retweet_num",
        "like_num",
    ],
)

In [249]:
sentiment["is_positive"] = 0 
sentiment.loc[sentiment["comp"] > 0.05, "is_positive"] = 1

sentiment["is_neutral"] = 0 
sentiment.loc[(sentiment["is_neutral"] > -0.05) & (sentiment["is_neutral"] < 0.05), "is_neutral"] = 1

sentiment["is_negative"] = 0 
sentiment.loc[sentiment["comp"] < -0.05, "is_negative"] = 1

In [250]:
sentiment.sample(10)

Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol,comment_num,retweet_num,like_num,is_positive,is_neutral,is_negative
321419,0.0,0.944,0.056,-0.1531,2019-11-10 15:00:00,GOOGL,0,0,0,0,1,1
255815,0.0,0.874,0.126,-0.4374,2018-10-20 20:00:00,GOOGL,0,0,0,0,1,1
208134,0.176,0.824,0.0,0.5994,2018-03-01 19:00:00,GOOGL,0,0,0,1,1,0
64077,0.0,1.0,0.0,0.0,2015-10-02 18:00:00,GOOGL,0,0,0,0,1,0
62404,0.0,1.0,0.0,0.0,2015-09-26 20:00:00,GOOGL,0,0,0,0,1,0
137020,0.299,0.642,0.059,0.6705,2017-01-20 11:00:00,GOOGL,0,0,0,1,1,0
106536,0.197,0.803,0.0,0.4019,2016-06-20 01:00:00,GOOGL,0,4,9,1,1,0
20636,0.0,1.0,0.0,0.0,2015-04-11 18:00:00,GOOGL,0,0,0,0,1,0
39120,0.18,0.82,0.0,0.296,2015-07-01 09:00:00,GOOGL,0,0,0,1,1,0
203439,0.202,0.798,0.0,0.6199,2018-02-02 21:00:00,GOOGL,0,0,0,1,1,0


In [251]:
tweets.iloc[196214].body

'#BREAKING Slow demand of iPhone X hits $AAPL as Analysts cut iPhone shipment forecast for 2018 in U.S. and Asia #DayAfterChristmas #stocks $GS $BAC $JPM $C $WFC $MS $PHK $BLK $NVDA $GOOGL $QCOM $AVGO $DB $RY $UBS $BCS $BMO $BX #BoxingDay #WallStreet #NYC'

In [252]:
clean.iloc[196214].body

'Slow demand of iPhone X hits as Analysts cut iPhone shipment forecast for in U and Asia'

In [253]:
# round down
def round_down_hour(col):
    try:
        date, time = col.split(" ")
        time = f"{time[0:2]}:00:00"
    except:
        print(col)
        raise Exception
    return date + " " + time


def aggregate_simple(df):
    aggregated = pd.pivot_table(
        df,
        index=["post_date"],
        aggfunc={
            "pos": ["max", "min", "std", "mean", "median", "count"],
            "neu": ["max", "min", "std", "mean", "median"],
            "neg": ["max", "min", "std", "mean", "median"],
            "comp": ["max", "min", "std", "mean", "median"],
            "comment_num": ["sum"],
            "retweet_num": ["sum"],
            "like_num": ["sum"],
            "is_positive": ["sum"],
            "is_negative": ["sum"],
            "is_neutral": ["sum"],
        },
    ).reset_index()

    aggregated.columns = [f"{i}_{j}" for i, j in aggregated.columns]
    # clean columns
    aggregated.rename(
        columns={
            f"post_date_": f"post_date",
            f"pos_count": f"count",
        },
        inplace=True,
    )

    return aggregated


def reweight(df, weight):
    cols = ["pos", "neu", "neg", "comp"]
    for col in cols:
        df[col] *= df[weight]
    return df


def get_total_tweets(df):
    df["post_date"] = df["post_date"].apply(round_down_hour)
    grouped = pd.pivot_table(
        df,
        index=["post_date"],
        aggfunc=["count"],
    ).reset_index()
    grouped.columns = [f"{i}_{j}" for i, j in grouped.columns]
    grouped.rename(
        columns={
            f"post_date_": f"post_date",
            f"count_body": f"total_count",
        },
        inplace=True,
    )
    return grouped[["post_date", "total_count"]]


def get_relative_count(df, df_total):

    mrg = df.merge(df_total, how="left", on="post_date")
    df["relative_count"] = mrg["count"] / mrg["total_count"]
    return df

def normalize_polarity(df):
    summed = df["is_negative_sum"] + df["is_neutral_sum"] + df["is_positive_sum"]
    df["is_negative_sum"] /= summed
    df["is_neutral_sum"] /= summed
    df["is_positive_sum"] /= summed
    df.fillna(0, inplace=True)
    return df

In [254]:
total_tweets = get_total_tweets(tweets_main)
total_tweets.head()

Unnamed: 0,post_date,total_count
0,2015-01-01 01:00:00,50
1,2015-01-01 02:00:00,38
2,2015-01-01 03:00:00,29
3,2015-01-01 04:00:00,30
4,2015-01-01 05:00:00,24


In [255]:
sentiment["post_date"] = sentiment["post_date"].apply(round_down_hour)

In [256]:
#sentiment_w1 = reweight(sentiment.copy(), "weight")
#sentiment_w1 = sentiment_w1.drop(["weight", "weight2"], axis=1)
#sentiment_w1 = sentiment_w1.drop(["weight"], axis=1)
#sentiment_w1.head()


In [257]:
#sentiment_w2 = reweight(sentiment.copy(), "weight2")
#sentiment_w2 = sentiment_w2.drop(["weight", "weight2"], axis=1)
#sentiment_w2.head()

In [258]:
#sentiment_no_weight = sentiment.drop(["weight"], axis=1)

In [259]:
# agg_w1 = aggregate_simple(sentiment_w1)
# agg_w1 = get_relative_count(agg_w1, total_tweets)
# agg_w1 = agg_w1.fillna(0)

In [260]:
# agg_w1.head()

In [261]:
sentiment[sentiment["post_date"] == "2015-01-01 05:00:00"].head()

Unnamed: 0,pos,neu,neg,comp,post_date,ticker_symbol,comment_num,retweet_num,like_num,is_positive,is_neutral,is_negative
7,0.565,0.435,0.0,0.5994,2015-01-01 05:00:00,GOOGL,0,0,0,1,1,0


In [262]:
agg_no_w = aggregate_simple(sentiment)
agg_no_w = get_relative_count(agg_no_w, total_tweets)
# std is NaN when count is 1
agg_no_w = agg_no_w.fillna(0)

In [263]:
agg_no_w = normalize_polarity(agg_no_w)
agg_no_w.head()

Unnamed: 0,post_date,comment_num_sum,comp_max,comp_mean,comp_median,comp_min,comp_std,is_negative_sum,is_neutral_sum,is_positive_sum,...,neu_min,neu_std,count,pos_max,pos_mean,pos_median,pos_min,pos_std,retweet_num_sum,relative_count
0,2015-01-01 01:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,2,0.02
1,2015-01-01 02:00:00,0,0.5423,0.0359,0.08895,-0.5766,0.466565,0.142857,0.571429,0.285714,...,0.777,0.101668,4,0.184,0.07375,0.0555,0.0,0.090223,0,0.105263
2,2015-01-01 03:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0,0.068966
3,2015-01-01 05:00:00,0,0.5994,0.5994,0.5994,0.5994,0.0,0.0,0.5,0.5,...,0.435,0.0,1,0.565,0.565,0.565,0.565,0.0,0,0.041667
4,2015-01-01 07:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0,0.166667


In [264]:
#agg_w1.to_csv(f"../datasets/v3/more_cols/w1/{company}.csv", index=False)
agg_no_w.to_csv(f"../datasets/v3/senti/{company}.csv", index=False)