In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

from utils.result_processing import process_dataframe

pd.set_option('display.max_rows', 150)

# Import and Format Predicted Data

In [None]:
# Read the four 2020 CSV exports stored under ./data/roberta/ (one per feed).
roberta_2020_reddit = pd.read_csv('./data/roberta/2020_roberta_reddit.csv')
roberta_2020_twitter = pd.read_csv('./data/roberta/2020_roberta_twitter.csv')
roberta_2020_conventional = pd.read_csv('./data/roberta/2020_roberta_conventional.csv')
roberta_2020_crypto = pd.read_csv('./data/roberta/2020_roberta_crypto.csv')

# Stack the conventional and crypto frames row-wise into a single "news" frame.
# Row labels are not reset, so the combined index can contain duplicates.
roberta_2020_news = pd.concat([roberta_2020_conventional, roberta_2020_crypto], axis=0)

In [None]:
# Run every per-source frame through the project helper process_dataframe
# (imported from utils.result_processing; its behaviour is not visible here).
# NOTE(review): each name is rebound to its processed version, so re-running
# this cell feeds already-processed frames back into process_dataframe —
# confirm the helper tolerates that, or re-run the loading cell first.
roberta_2020_news = process_dataframe(roberta_2020_news)
roberta_2020_twitter = process_dataframe(roberta_2020_twitter)
roberta_2020_reddit = process_dataframe(roberta_2020_reddit)

# Tag each frame with its origin; the risk_* functions filter on this "source" column.
roberta_2020_news["source"] = "news"
roberta_2020_twitter["source"] = "twitter"
roberta_2020_reddit["source"] = "reddit"

# Single combined frame used by all scoring functions below (row labels are not reset).
roberta_2020 = pd.concat([roberta_2020_news, roberta_2020_reddit, roberta_2020_twitter], axis=0)

# Risk Scoring

## Risk Scoring Methods

### Helper Functions

In [None]:
def filter_risk(risk_df, threshold=None, threshold_all=0, threshold_risk=0):
    """Zero out the risk score on rows whose volume is below a threshold.

    Only one threshold is applied per call: ``threshold_all`` takes precedence,
    and ``threshold_risk`` is consulted only when ``threshold_all`` is falsy.

    Parameters
    ----------
    risk_df : pandas.DataFrame
        Must contain a ``"risk"`` column, plus ``"count"`` (when
        ``threshold_all`` is used) or ``"risk_count"`` (when ``threshold_risk``
        is used).
    threshold : any, optional
        Retained from the original signature for backward compatibility; the
        body never used it, so it is still ignored.
    threshold_all : number, default 0
        Rows with ``count < threshold_all`` get ``risk`` set to 0.
    threshold_risk : number, default 0
        Rows with ``risk_count < threshold_risk`` get ``risk`` set to 0.

    Returns
    -------
    pandas.DataFrame
        The same ``risk_df`` object, with its ``"risk"`` column replaced when a
        threshold was applied (the input is modified in place).
    """
    if threshold_all:
        too_few = risk_df["count"] < threshold_all
        risk_df["risk"] = risk_df["risk"].mask(too_few, 0)
    elif threshold_risk:
        too_few = risk_df["risk_count"] < threshold_risk
        risk_df["risk"] = risk_df["risk"].mask(too_few, 0)

    return risk_df

In [None]:
def retrieve_counts(df):
    """Aggregate per-date totals of ``counter`` and of ``pred * counter``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"date"``, ``"pred"`` and ``"counter"`` columns.
        The frame is not modified (the per-row product is computed on a copy).

    Returns
    -------
    pandas.DataFrame
        One row per date with columns ``["date", "count", "risk_count"]``,
        where ``count`` is the sum of ``counter`` and ``risk_count`` is the sum
        of ``pred * counter`` for that date.
    """
    # assign() works on a copy, so the caller's (often filtered) frame is left untouched.
    scored = df.assign(risk_count=df["pred"] * df["counter"])

    combined_df = scored.groupby(by=["date"])[["counter", "risk_count"]].sum().reset_index()
    combined_df.columns = ["date", "count", "risk_count"]
    return combined_df

### Maximum

In [None]:
def risk_max(df, entity="", source="", threshold_all=0, threshold_risk=0):
    """Daily risk score defined as the maximum ``prob`` per date, times 100.

    ``source`` and ``entity`` (when non-empty) restrict the rows considered.
    The result carries the per-date counts from ``retrieve_counts`` and is
    passed through ``filter_risk`` with the given thresholds.
    """
    subset = df.copy(deep=True)

    if source:
        subset = subset[subset["source"] == source]
    if entity:
        subset = subset[subset["entity"] == entity]

    # highest probability seen on each date, scaled to 0-100
    daily_max = subset.groupby(by=["date"])["prob"].max() * 100
    risk = daily_max.to_frame(name="risk").reset_index()
    risk.columns = ["date", "risk"]

    # attach count / risk_count for the same dates
    risk = pd.merge(risk, retrieve_counts(subset), on="date")

    return filter_risk(risk, threshold_all=threshold_all, threshold_risk=threshold_risk)

### Average

In [None]:
def risk_avg(df, entity="", source="", threshold_all=0, threshold_risk=0):
    """Daily risk score defined as the unweighted mean ``prob`` per date, times 100.

    ``source`` and ``entity`` (when non-empty) restrict the rows considered.
    The result carries the per-date counts from ``retrieve_counts`` and is
    passed through ``filter_risk`` with the given thresholds.
    """
    subset = df.copy(deep=True)

    if source:
        subset = subset[subset["source"] == source]
    if entity:
        subset = subset[subset["entity"] == entity]

    # mean probability on each date, scaled to 0-100
    daily_mean = subset.groupby(by=["date"])["prob"].mean() * 100
    risk = daily_mean.to_frame(name="risk").reset_index()
    risk.columns = ["date", "risk"]

    # attach count / risk_count for the same dates
    risk = pd.merge(risk, retrieve_counts(subset), on="date")

    return filter_risk(risk, threshold_all=threshold_all, threshold_risk=threshold_risk)

### Weighted Average

In [None]:
def risk_weighted_avg(df, entity="", source="", threshold_all=0, threshold_risk=0):
    """Daily risk score defined as the ``counter``-weighted mean of ``prob``, times 100.

    Per date this is ``sum(prob * counter) * 100 / sum(counter)``. Thresholds
    are applied (via ``filter_risk``) to the weighted sum before the division,
    so filtered rows end up as ``0 / count``.
    """
    subset = df.copy(deep=True)
    subset["prob_counter"] = subset["prob"] * subset["counter"]

    if source:
        subset = subset[subset["source"] == source]
    if entity:
        subset = subset[subset["entity"] == entity]

    # numerator of the weighted average, scaled to 0-100
    weighted_sum = subset.groupby(by=["date"])["prob_counter"].sum() * 100
    risk = weighted_sum.to_frame(name="risk").reset_index()
    risk.columns = ["date", "risk"]

    # attach count / risk_count for the same dates
    risk = pd.merge(risk, retrieve_counts(subset), on="date")

    risk = filter_risk(risk, threshold_all=threshold_all, threshold_risk=threshold_risk)

    # divide by the total counter to turn the weighted sum into an average
    risk["risk"] = risk["risk"] / risk["count"]

    return risk

### Relative Sources

In [None]:
def risk_relative_sources(df, entity="", weights={"news":0.5, "reddit": 0.4, "twitter":0.1}, weighted=True,
                         threshold_all=0, threshold_risk=0):
    """Blend per-source daily risk scores into one score using ``weights``.

    Each of the three sources is scored separately (``risk_weighted_avg`` when
    ``weighted`` is true, otherwise ``risk_avg``) without thresholds, the
    results are outer-joined on date with missing values filled by 0, and the
    blended score is the weighted sum of the three. The thresholds are applied
    once, to the combined ``count`` / ``risk_count`` totals.

    Returns a frame with columns ``["date", "risk", "count", "risk_count"]``
    sorted by date.
    """
    score = risk_weighted_avg if weighted else risk_avg

    news = score(df, entity=entity, source="news")
    reddit = score(df, entity=entity, source="reddit")
    twitter = score(df, entity=entity, source="twitter")

    # give every source its own column names so the joins do not collide
    news.columns = ["date", "news", "news_count", "news_risk_count"]
    reddit.columns = ["date", "reddit", "reddit_count", "reddit_risk_count"]
    twitter.columns = ["date", "twitter", "twitter_count", "twitter_risk_count"]

    # dates missing from a source contribute 0 for that source
    combined = (
        news.merge(reddit, on="date", how="outer")
        .merge(twitter, on="date", how="outer")
        .fillna(0)
    )

    combined["risk"] = (
        weights["news"] * combined["news"]
        + weights["reddit"] * combined["reddit"]
        + weights["twitter"] * combined["twitter"]
    )
    combined["count"] = (
        combined["news_count"] + combined["reddit_count"] + combined["twitter_count"]
    )
    combined["risk_count"] = (
        combined["news_risk_count"] + combined["reddit_risk_count"] + combined["twitter_risk_count"]
    )

    combined = filter_risk(combined, threshold_all=threshold_all, threshold_risk=threshold_risk)

    return combined[["date", "risk", "count", "risk_count"]].sort_values(by="date")

In [None]:
def risk_relative_sources_split(df, entity="", weights={"news":0.5, "reddit": 0.4, "twitter":0.1}, weighted=True,
                                threshold_all={"news": 0, "reddit": 0, "twitter": 0}, 
                                threshold_risk={"news": 0, "reddit": 0, "twitter": 0}):
    """Blend per-source daily risk scores, thresholding each source separately.

    Same blending as ``risk_relative_sources``, except that ``threshold_all``
    and ``threshold_risk`` are dicts keyed by ``"news"``, ``"reddit"`` and
    ``"twitter"`` and are handed to the per-source scorer; no threshold is
    applied to the combined totals.

    Returns a frame with columns ``["date", "risk", "count", "risk_count"]``
    sorted by date.
    """
    score = risk_weighted_avg if weighted else risk_avg

    news = score(df, entity=entity, source="news",
                 threshold_all=threshold_all["news"], threshold_risk=threshold_risk["news"])
    reddit = score(df, entity=entity, source="reddit",
                   threshold_all=threshold_all["reddit"], threshold_risk=threshold_risk["reddit"])
    twitter = score(df, entity=entity, source="twitter",
                    threshold_all=threshold_all["twitter"], threshold_risk=threshold_risk["twitter"])

    # give every source its own column names so the joins do not collide
    news.columns = ["date", "news", "news_count", "news_risk_count"]
    reddit.columns = ["date", "reddit", "reddit_count", "reddit_risk_count"]
    twitter.columns = ["date", "twitter", "twitter_count", "twitter_risk_count"]

    # dates missing from a source contribute 0 for that source
    combined = (
        news.merge(reddit, on="date", how="outer")
        .merge(twitter, on="date", how="outer")
        .fillna(0)
    )

    combined["risk"] = (
        weights["news"] * combined["news"]
        + weights["reddit"] * combined["reddit"]
        + weights["twitter"] * combined["twitter"]
    )
    combined["count"] = (
        combined["news_count"] + combined["reddit_count"] + combined["twitter_count"]
    )
    combined["risk_count"] = (
        combined["news_risk_count"] + combined["reddit_risk_count"] + combined["twitter_risk_count"]
    )

    return combined[["date", "risk", "count", "risk_count"]].sort_values(by="date")

## Decay

In [None]:
def reindex_dataframe(df, start_date, end_date):
    """Return a copy of ``df`` indexed by every calendar day in the given range.

    The ``"date"`` column becomes the index, which is then reindexed onto a
    daily ``pd.date_range(start_date, end_date)``; days that are absent from
    ``df`` appear as all-NaN rows. The input frame is left unchanged.
    """
    daily_index = pd.date_range(start_date, end_date, freq="D")
    return df.copy(deep=True).set_index("date").reindex(daily_index)

In [None]:
def linear_decay(df, decay_rate = 0.5):
    """Fill missing risk values by decaying the last known value.

    Walking the rows in order, every row whose ``"risk"`` is missing receives
    the previous row's (possibly already decayed) value multiplied by
    ``decay_rate``; rows before the first observation decay from 0. Despite the
    name, the decay is therefore geometric: ``last * decay_rate ** k`` after
    ``k`` consecutive missing rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly the columns ``risk``, ``count``, ``risk_count`` (in
        that order) and the dates as index, e.g. the output of
        ``reindex_dataframe``. Not modified.
    decay_rate : float, default 0.5
        Multiplicative factor applied per missing row.

    Returns
    -------
    pandas.DataFrame
        Columns ``["date", "risk", "count", "risk_count"]``; remaining missing
        values (in ``count`` / ``risk_count``) are filled with 0.
    """
    df_copy = df.copy(deep=True)

    decayed_risk = []
    prev_value = 0
    for value in df["risk"]:
        if pd.isna(value):
            # no observation: carry the last value forward, shrunk by decay_rate
            prev_value = prev_value * decay_rate
        else:
            prev_value = value
        decayed_risk.append(prev_value)

    # whole-column assignment instead of chained indexing, which is unreliable
    # (and a no-op under pandas Copy-on-Write)
    df_copy["risk"] = decayed_risk

    df_copy = df_copy.fillna(0)

    df_copy = df_copy.reset_index()
    df_copy.columns = ["date", "risk", "count", "risk_count"]

    return df_copy

In [None]:
def exp_decay(df, span = 5):
    """Fill missing values with an exponentially weighted moving average.

    A zero-filled copy of ``df`` is smoothed with ``ewm(span=span).mean()``.
    Observed values in ``df`` are kept as they are; only the missing cells are
    replaced by the smoothed value at the same position. This applies to every
    column, not just ``risk``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly the columns ``risk``, ``count``, ``risk_count`` (in
        that order) and the dates as index, e.g. the output of
        ``reindex_dataframe``. Not modified.
    span : float, default 5
        Span passed to ``DataFrame.ewm`` (must be >= 1).

    Returns
    -------
    pandas.DataFrame
        Columns ``["date", "risk", "count", "risk_count"]``.
    """
    # smooth a zero-filled copy so that gaps pull the average towards 0
    smoothed = df.fillna(0).ewm(span=span).mean()

    # keep real observations, take the smoothed value wherever data is missing
    df_copy_final = df.where(df.notnull(), smoothed)

    df_copy_final = df_copy_final.reset_index()
    df_copy_final.columns = ["date", "risk", "count", "risk_count"]

    return df_copy_final

## Evaluation of Methods

In [None]:
def risk_graph(df, start_date, end_date, title="Risk Score Over Time", entity="", show_hacks=True):
    '''
    Plot the risk score over time, optionally marking known hack dates.

    start_date/end_date are strings in the format 'YYYY-MM-DD' (both inclusive).
    df needs "date" and "risk" columns; the "date" values are compared against
    datetime.date objects.
    When show_hacks is true, './data/hacks_list_2020.csv' is read and a red dashed
    vertical line is drawn for each row (only rows whose lower-cased Entity equals
    `entity`, if one is given).
    '''
    date_format = "%Y-%m-%d"
    start_date = datetime.strptime(start_date, date_format).date()
    end_date = datetime.strptime(end_date, date_format).date()

    def in_window(day):
        # inclusive on both ends
        return day <= end_date and day >= start_date

    filtered_df = df[df["date"].apply(in_window)]

    plt.figure(figsize=(15,5))

    if show_hacks:
        hacks_list = pd.read_csv('./data/hacks_list_2020.csv')
        hacks_list['Date'] = pd.to_datetime(hacks_list.Date, format="%d-%m-%Y")
        hacks_list['Entity'] = hacks_list['Entity'].map(lambda name: name.lower())

        if entity:
            hacks_list = hacks_list[hacks_list["Entity"] == entity]

        # one marker per recorded hack
        for hack_date in hacks_list.Date:
            plt.axvline(hack_date, linestyle="--", color="red")

    plt.plot(filtered_df.date, filtered_df.risk, color="black")

    plt.grid(True)
    plt.xlabel('Date')
    plt.ylabel('Risk Score')
    plt.title(title)
    plt.show()

# Entity Analysis

In [None]:
start_date = "2020-01-01"
end_date = "2020-06-30"

## Binance

In [None]:
entity = "binance"

### Maximum Risk

#### Threshold: None

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
risk_max_news = risk_max(roberta_2020, source="news", entity=entity)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity, threshold_all=10)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
# Use the section's `entity` variable rather than a hard-coded name, matching the sibling cells.
risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_all=1)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity, threshold_all=10)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity, threshold_all=2)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity, threshold_risk=4)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_risk=1)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity, threshold_risk=3)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity, threshold_risk=1)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

### Average Risk

#### Threshold: None

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity, threshold_all=10) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity, threshold_all=10) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity, threshold_all=1) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity, threshold_risk=2) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity, threshold_risk=4) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity, threshold_risk=1) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

### Weighted Average Risk

#### Threshold: None

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity, threshold_all=10) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity, threshold_all=10) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity, threshold_all=2) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity, threshold_risk=4) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity, threshold_risk=3) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity, threshold_risk=1) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

### Relative Source - Average Risk

#### Threshold: None

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: All (Overall)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_all = 10

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_all=threshold_all)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: Risk (Overall)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_risk = 5

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_risk=threshold_risk)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: All (By Source)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_all = {"news":1, "reddit":10, "twitter":1}

risk_relative = risk_relative_sources_split(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_all=threshold_all)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: Risk (By Source)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_risk = {"news":1, "reddit":4, "twitter":1}

risk_relative = risk_relative_sources_split(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_risk=threshold_risk)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

### Relative Source - Weighted Average Risk

## Bitfinex

In [None]:
entity = "bitfinex"

### Maximum Risk

#### Threshold: None

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
risk_max_news = risk_max(roberta_2020, source="news", entity=entity)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity, threshold_all=5)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_all=1)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity, threshold_all=4)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity, threshold_all=3)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_max_overall = risk_max(roberta_2020, entity=entity, threshold_risk=2)
risk_graph(risk_max_overall, start_date=start_date, end_date=end_date, title="Overall: Max Probability", entity=entity)

In [None]:
risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_risk=1)
risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
risk_max_reddit = risk_max(roberta_2020, source="reddit", entity=entity, threshold_risk=1)
risk_graph(risk_max_reddit, start_date=start_date, end_date=end_date, title="Reddit: Max Probability", entity=entity)

In [None]:
risk_max_twitter = risk_max(roberta_2020, source="twitter", entity=entity, threshold_risk=2)
risk_graph(risk_max_twitter, start_date=start_date, end_date=end_date, title="Twitter: Max Probability", entity=entity)

### Average Risk

#### Threshold: None

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity, threshold_all=5) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity, threshold_all=3) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity, threshold_all=2) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_avg_overall = risk_avg(roberta_2020, entity=entity, threshold_risk=2) 
risk_graph(risk_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Average Probability", entity=entity)

In [None]:
risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
risk_avg_reddit = risk_avg(roberta_2020, source="reddit", entity=entity, threshold_risk=1) 
risk_graph(risk_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Average Probability", entity=entity)

In [None]:
risk_avg_twitter = risk_avg(roberta_2020, source="twitter", entity=entity, threshold_risk=2) 
risk_graph(risk_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Average Probability", entity=entity)

### Weighted Average Risk

#### Threshold: None

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

#### Threshold: All Count

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity, threshold_all=5) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity, threshold_all=3) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity, threshold_all=2) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

#### Threshold: Risk Count

In [None]:
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity, threshold_risk=2) 
risk_graph(risk_weighted_avg_overall, start_date=start_date, end_date=end_date, title="Overall: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, source="reddit", entity=entity, threshold_risk=1) 
risk_graph(risk_weighted_avg_reddit, start_date=start_date, end_date=end_date, title="Reddit: Weighted Average Probability", entity=entity)

In [None]:
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, source="twitter", entity=entity, threshold_risk=1) 
risk_graph(risk_weighted_avg_twitter, start_date=start_date, end_date=end_date, title="Twitter: Weighted Average Probability", entity=entity)

### Relative Source - Average Risk

#### Threshold: None

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: All (Overall)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_all = 5

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_all=threshold_all)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: Risk (Overall)

In [None]:
weights={"news":0.5, "reddit": 0.25, "twitter":0.25}
threshold_risk = 1

risk_relative = risk_relative_sources(roberta_2020, entity=entity, weights=weights, weighted=False, 
                                     threshold_risk=threshold_risk)
risk_graph(risk_relative, start_date=start_date, end_date=end_date, 
           title="Relative Source Risk Over Time", entity=entity)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=False: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 1, "reddit": 3, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=False: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 1, "reddit": 3, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

### Relative Source - Weighted Average Risk

#### Threshold: None

In [None]:
# Relative-source risk with weighted=True and no threshold, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_all, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_risk, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 2
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=True: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 1, "reddit": 2, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=True: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 1, "reddit": 2, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: None

In [None]:
# NOTE(review): repeats the weighted=True / no-threshold call made earlier in this
# entity's "Relative Source - Weighted Average Risk" section; candidate for removal.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# NOTE(review): repeats the weighted=True / threshold_all=5 call made earlier in this
# entity's "Relative Source - Weighted Average Risk" section; candidate for removal.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# NOTE(review): repeats the weighted=True / threshold_risk=2 call made earlier in this
# entity's "Relative Source - Weighted Average Risk" section; candidate for removal.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 2
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=True, per-source threshold_all.
# NOTE(review): same call as the earlier weighted by-source cell except reddit is 3 here (2 there).
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 1, "reddit": 3, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=True, per-source threshold_risk.
# NOTE(review): same call as the earlier weighted by-source cell except reddit is 3 here (2 there).
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 1, "reddit": 3, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

## OKEx

In [None]:
# Notebook-level selector: every cell in this section passes `entity` to the risk
# and plotting helpers, so run this cell first when re-running the section.
entity = "okex"

### Maximum Risk

#### Threshold: None

In [None]:
# Overall (no source argument), no threshold: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# NO NEWS for this entity, so the news-only view is left disabled.
# risk_max_news = risk_max(roberta_2020, source="news", entity=entity)
# risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
# Reddit only, no threshold: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=1: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, threshold_all=1, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_all=1)
# risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
# Reddit only, threshold_all=1: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit", threshold_all=1)
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=3: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter", threshold_all=3)
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=1: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, threshold_risk=1, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_max_news = risk_max(roberta_2020, source="news", entity=entity, threshold_risk=0)
# risk_graph(risk_max_news, start_date=start_date, end_date=end_date, title="News: Max Probability", entity=entity)

In [None]:
# Reddit only, threshold_risk=1: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit", threshold_risk=1)
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=1: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter", threshold_risk=1)
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

### Average Risk

#### Threshold: None

In [None]:
# Overall, no threshold: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity) 
# risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
# Reddit only, no threshold: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=1: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, threshold_all=1, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
# risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
# Reddit only, threshold_all=1: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit", threshold_all=1)
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=2: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter", threshold_all=2)
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=1: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, threshold_risk=1, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_avg_news = risk_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
# risk_graph(risk_avg_news, start_date=start_date, end_date=end_date, title="News: Average Probability", entity=entity)

In [None]:
# Reddit only, threshold_risk=1: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit", threshold_risk=1)
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=1: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter", threshold_risk=1)
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

### Weighted Average Risk

#### Threshold: None

In [None]:
# Overall, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity) 
# risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
# Reddit only, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=2: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, threshold_all=2, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_all=1) 
# risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
# Reddit only, threshold_all=1: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit", threshold_all=1)
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=2: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter", threshold_all=2)
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=1: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, threshold_risk=1, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Disabled for this entity (see the "NO NEWS" note on the first news-only cell of this section).
# risk_weighted_avg_news = risk_weighted_avg(roberta_2020, source="news", entity=entity, threshold_risk=1) 
# risk_graph(risk_weighted_avg_news, start_date=start_date, end_date=end_date, title="News: Weighted Average Probability", entity=entity)

In [None]:
# Reddit only, threshold_risk=1: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit", threshold_risk=1)
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=1: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter", threshold_risk=1)
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

### Relative Source - Average Risk

#### Threshold: None

In [None]:
# Relative-source risk with weighted=False and no threshold, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# Relative-source risk with weighted=False and a single threshold_all, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 1
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# Relative-source risk with weighted=False and a single threshold_risk, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 1
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=False: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 0, "reddit": 1, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=False: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 0, "reddit": 1, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

### Relative Source - Weighted Average Risk

#### Threshold: None

In [None]:
# Relative-source risk with weighted=True and no threshold, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_all, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 1
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_risk, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 1
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=True: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 0, "reddit": 1, "twitter": 1}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=True: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 0, "reddit": 1, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

## Bancor

In [None]:
# Notebook-level selector: every cell in this section passes `entity` to the risk
# and plotting helpers, so run this cell first when re-running the section.
entity = "bancor"

### Maximum Risk

#### Threshold: None

In [None]:
# Overall (no source argument), no threshold: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, no threshold: risk_max series, then plot.
risk_max_news = risk_max(roberta_2020, entity=entity, source="news")
risk_graph(risk_max_news, entity=entity, title="News: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, no threshold: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=10: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, threshold_all=10, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_all=1: risk_max series, then plot.
risk_max_news = risk_max(roberta_2020, entity=entity, source="news", threshold_all=1)
risk_graph(risk_max_news, entity=entity, title="News: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_all=10: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit", threshold_all=10)
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=10: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter", threshold_all=10)
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=10: risk_max series, then plot.
risk_max_overall = risk_max(roberta_2020, threshold_risk=10, entity=entity)
risk_graph(risk_max_overall, entity=entity, title="Overall: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_risk=1: risk_max series, then plot.
risk_max_news = risk_max(roberta_2020, entity=entity, source="news", threshold_risk=1)
risk_graph(risk_max_news, entity=entity, title="News: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_risk=10: risk_max series, then plot.
risk_max_reddit = risk_max(roberta_2020, entity=entity, source="reddit", threshold_risk=10)
risk_graph(risk_max_reddit, entity=entity, title="Reddit: Max Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=5: risk_max series, then plot.
risk_max_twitter = risk_max(roberta_2020, entity=entity, source="twitter", threshold_risk=5)
risk_graph(risk_max_twitter, entity=entity, title="Twitter: Max Probability",
           start_date=start_date, end_date=end_date)

### Average Risk

#### Threshold: None

In [None]:
# Overall, no threshold: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, no threshold: risk_avg series, then plot.
risk_avg_news = risk_avg(roberta_2020, entity=entity, source="news")
risk_graph(risk_avg_news, entity=entity, title="News: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, no threshold: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=5: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, threshold_all=5, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_all=1: risk_avg series, then plot.
risk_avg_news = risk_avg(roberta_2020, entity=entity, source="news", threshold_all=1)
risk_graph(risk_avg_news, entity=entity, title="News: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_all=5: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit", threshold_all=5)
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=5: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter", threshold_all=5)
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=5: risk_avg series, then plot.
risk_avg_overall = risk_avg(roberta_2020, threshold_risk=5, entity=entity)
risk_graph(risk_avg_overall, entity=entity, title="Overall: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_risk=1: risk_avg series, then plot.
risk_avg_news = risk_avg(roberta_2020, entity=entity, source="news", threshold_risk=1)
risk_graph(risk_avg_news, entity=entity, title="News: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_risk=5: risk_avg series, then plot.
risk_avg_reddit = risk_avg(roberta_2020, entity=entity, source="reddit", threshold_risk=5)
risk_graph(risk_avg_reddit, entity=entity, title="Reddit: Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=5: risk_avg series, then plot.
risk_avg_twitter = risk_avg(roberta_2020, entity=entity, source="twitter", threshold_risk=5)
risk_graph(risk_avg_twitter, entity=entity, title="Twitter: Average Probability",
           start_date=start_date, end_date=end_date)

### Weighted Average Risk

#### Threshold: None

In [None]:
# Overall, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, entity=entity, source="news")
risk_graph(risk_weighted_avg_news, entity=entity, title="News: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit")
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, no threshold: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter")
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: All Count

In [None]:
# Overall, threshold_all=10: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, threshold_all=10, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_all=1: risk_weighted_avg series, then plot.
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, entity=entity, source="news", threshold_all=1)
risk_graph(risk_weighted_avg_news, entity=entity, title="News: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_all=5: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit", threshold_all=5)
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_all=5: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter", threshold_all=5)
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk Count

In [None]:
# Overall, threshold_risk=5: risk_weighted_avg series, then plot.
risk_weighted_avg_overall = risk_weighted_avg(roberta_2020, threshold_risk=5, entity=entity)
risk_graph(risk_weighted_avg_overall, entity=entity, title="Overall: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# News only, threshold_risk=1: risk_weighted_avg series, then plot.
risk_weighted_avg_news = risk_weighted_avg(roberta_2020, entity=entity, source="news", threshold_risk=1)
risk_graph(risk_weighted_avg_news, entity=entity, title="News: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Reddit only, threshold_risk=5: risk_weighted_avg series, then plot.
risk_weighted_avg_reddit = risk_weighted_avg(roberta_2020, entity=entity, source="reddit", threshold_risk=5)
risk_graph(risk_weighted_avg_reddit, entity=entity, title="Reddit: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

In [None]:
# Twitter only, threshold_risk=5: risk_weighted_avg series, then plot.
risk_weighted_avg_twitter = risk_weighted_avg(roberta_2020, entity=entity, source="twitter", threshold_risk=5)
risk_graph(risk_weighted_avg_twitter, entity=entity, title="Twitter: Weighted Average Probability",
           start_date=start_date, end_date=end_date)

### Relative Source - Average Risk

#### Threshold: None

In [None]:
# Relative-source risk with weighted=False and no threshold, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# Relative-source risk with weighted=False and a single threshold_all, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# Relative-source risk with weighted=False and a single threshold_risk, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=False,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=False: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 1, "reddit": 5, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=False: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 1, "reddit": 5, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=False,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

### Relative Source - Weighted Average Risk

#### Threshold: None

In [None]:
# Relative-source risk with weighted=True and no threshold, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_all, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (Overall)

In [None]:
# Relative-source risk with weighted=True and a single threshold_risk, then plot.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = 5
risk_relative = risk_relative_sources(roberta_2020, weights=weights, weighted=True,
                                      threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: All (By Source)

In [None]:
# Split variant, weighted=True: threshold_all is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_all = {"news": 1, "reddit": 5, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_all=threshold_all, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

#### Threshold: Risk (By Source)

In [None]:
# Split variant, weighted=True: threshold_risk is given per source as a dict.
weights = {"news": 0.5, "reddit": 0.25, "twitter": 0.25}
threshold_risk = {"news": 1, "reddit": 5, "twitter": 0}
risk_relative = risk_relative_sources_split(roberta_2020, weights=weights, weighted=True,
                                            threshold_risk=threshold_risk, entity=entity)
risk_graph(risk_relative, entity=entity, title="Relative Source Risk Over Time",
           start_date=start_date, end_date=end_date)

# Count Exploration

## Count

### Average Number of Articles (filtered for days with articles)

In [None]:
def _avg_count_on_active_days(frame, label):
    """Average the per-day `counter` totals for each entity.

    `counter` is summed per (entity, date) and those daily totals are then
    averaged per entity. Only dates that actually have rows for an entity
    enter the average; missing dates are not counted as zero.

    Returns a two-column frame: ``entity`` and ``label``.
    """
    daily_totals = frame.groupby(by=["entity", "date"])["counter"].sum()
    # Series.mean(level=...) was deprecated in pandas 1.3 and removed in 2.0;
    # grouping on the index level is the supported equivalent.
    per_entity = daily_totals.groupby(level="entity").mean().reset_index()
    per_entity.columns = ["entity", label]
    return per_entity


total = _avg_count_on_active_days(roberta_2020, "total")
news = _avg_count_on_active_days(roberta_2020_news, "news")
reddit = _avg_count_on_active_days(roberta_2020_reddit, "reddit")
twitter = _avg_count_on_active_days(roberta_2020_twitter, "twitter")

# Outer merges keep entities that are absent from a source; those gaps become 0.
combined = pd.merge(total, news, on="entity", how="outer")
combined = pd.merge(combined, reddit, on="entity", how="outer")
combined = pd.merge(combined, twitter, on="entity", how="outer")
combined = combined.fillna(0)

combined.to_csv("data/data_exploration/avg_count_filtered.csv")

combined

### Average Number of Articles (unfiltered days)

In [None]:
# Study window used as the divisor for the per-calendar-day averages.
start_date_datetime = datetime(2020, 1, 1)
end_date_datetime = datetime(2020, 6, 30)
# Count both endpoints: 1 Jan through 30 Jun 2020 is 182 calendar days, whereas
# the bare timedelta is 181 and would slightly overstate every daily average.
# NOTE(review): assumes rows dated on end_date_datetime are part of the data - confirm.
delta = (end_date_datetime - start_date_datetime).days + 1

In [None]:
def _avg_count_per_calendar_day(frame, label, n_days):
    """Total `counter` per entity divided by a fixed number of days.

    `counter` is summed per (entity, date), the daily totals are summed per
    entity, and the result is divided by ``n_days``, so days without rows
    count as zero.

    Returns a two-column frame: ``entity`` and ``label``.
    """
    daily_totals = frame.groupby(by=["entity", "date"])["counter"].sum()
    # Series.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
    # grouping on the index level is the supported equivalent.
    per_entity = (daily_totals.groupby(level="entity").sum() / n_days).reset_index()
    per_entity.columns = ["entity", label]
    return per_entity


# `delta` (length of the study window in days) comes from the cell above.
total = _avg_count_per_calendar_day(roberta_2020, "total", delta)
news = _avg_count_per_calendar_day(roberta_2020_news, "news", delta)
reddit = _avg_count_per_calendar_day(roberta_2020_reddit, "reddit", delta)
twitter = _avg_count_per_calendar_day(roberta_2020_twitter, "twitter", delta)

# Outer merges keep entities that are absent from a source; those gaps become 0.
combined = pd.merge(total, news, on="entity", how="outer")
combined = pd.merge(combined, reddit, on="entity", how="outer")
combined = pd.merge(combined, twitter, on="entity", how="outer")
combined = combined.fillna(0)

combined.to_csv("data/data_exploration/avg_count.csv")

combined

## Risk Count

### Average Number of Risky Articles (filtered for days with risky articles)

In [None]:
# Average daily `counter` total of rows predicted risky (pred == 1) per entity,
# taken only over the days on which the entity has at least one such row.
#
# `Series.mean(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# `.groupby(level="entity").mean()` is the supported equivalent.
total = (
    roberta_2020[roberta_2020["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").mean()
    .reset_index()
)
total.columns = ["entity", "total"]

news = (
    roberta_2020_news[roberta_2020_news["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").mean()
    .reset_index()
)
news.columns = ["entity", "news"]

reddit = (
    roberta_2020_reddit[roberta_2020_reddit["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").mean()
    .reset_index()
)
reddit.columns = ["entity", "reddit"]

twitter = (
    roberta_2020_twitter[roberta_2020_twitter["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").mean()
    .reset_index()
)
twitter.columns = ["entity", "twitter"]

# Outer merges keep entities that have no risky rows in some source; the
# missing per-source averages are then reported as 0 instead of NaN.
combined = pd.merge(total, news, on="entity", how="outer")
combined = pd.merge(combined, reddit, on="entity", how="outer")
combined = pd.merge(combined, twitter, on="entity", how="outer")
combined = combined.fillna(0)

combined.to_csv("data/data_exploration/avg_risk_count_filtered.csv")

combined

### Average Number of Risky Articles (unfiltered days)

In [None]:
# Average daily `counter` total of rows predicted risky (pred == 1) per entity
# over the whole observation window: the per-entity grand total is divided by
# `delta` (the number of days in the window), so days without risky rows
# count as zero.
#
# `Series.sum(level=...)` was deprecated in pandas 1.3 and removed in 2.0;
# `.groupby(level="entity").sum()` is the supported equivalent.
total = (
    roberta_2020[roberta_2020["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").sum()
    / delta
).reset_index()
total.columns = ["entity", "total"]

news = (
    roberta_2020_news[roberta_2020_news["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").sum()
    / delta
).reset_index()
news.columns = ["entity", "news"]

reddit = (
    roberta_2020_reddit[roberta_2020_reddit["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").sum()
    / delta
).reset_index()
reddit.columns = ["entity", "reddit"]

twitter = (
    roberta_2020_twitter[roberta_2020_twitter["pred"] == 1]
    .groupby(by=["entity", "date"]).counter.sum()
    .groupby(level="entity").sum()
    / delta
).reset_index()
twitter.columns = ["entity", "twitter"]

# Outer merges keep entities that have no risky rows in some source; the
# missing per-source averages are then reported as 0 instead of NaN.
combined = pd.merge(total, news, on="entity", how="outer")
combined = pd.merge(combined, reddit, on="entity", how="outer")
combined = pd.merge(combined, twitter, on="entity", how="outer")
combined = combined.fillna(0)

combined.to_csv("data/data_exploration/avg_risk_count.csv")

combined

# Decay - Example

## No Decay

In [None]:
# Baseline for the decay examples: the Twitter weighted-average risk for
# "binance", plotted without any decay applied.
risk_weighted_avg_twitter = risk_weighted_avg(
    roberta_2020,
    source="twitter",
    entity="binance",
)

# Reindex over the plotting window and move the index into a regular column so
# the frame can be given the column layout used for plotting.
reindexed = reindex_dataframe(
    risk_weighted_avg_twitter,
    start_date=start_date,
    end_date=end_date,
).reset_index()
reindexed.columns = ["date", "risk", "count", "risk_count"]

# Missing values are shown as zero on the graph.
reindexed = reindexed.fillna(0)

risk_graph(
    reindexed,
    start_date=start_date,
    end_date=end_date,
    title="Twitter: Weighted Average Probability",
    entity="binance",
)

## Linear Decay

In [None]:
# Rebuild the Twitter weighted-average risk for "binance" and reindex it over
# the plotting window; `df_copy` is the input to the decay examples that follow.
risk_weighted_avg_twitter = risk_weighted_avg(
    roberta_2020,
    source="twitter",
    entity="binance",
)
df_copy = reindex_dataframe(
    risk_weighted_avg_twitter,
    start_date=start_date,
    end_date=end_date,
)

In [None]:
# Apply linear decay (decay_rate=0.05) to the reindexed frame and plot the result.
linear_decayed = linear_decay(df_copy, decay_rate=0.05)
risk_graph(linear_decayed, start_date, end_date, entity="binance")

## Exponential Decay

In [None]:
# Apply exponential decay (span=10) to the reindexed frame and plot the result.
# NOTE(review): `df_copy` was already passed to `linear_decay` in the cell
# above; if that helper mutates its argument, this plot starts from decayed
# values — confirm that the decay helpers return a new frame.
exp_decayed = exp_decay(df_copy, span=10)
risk_graph(exp_decayed, start_date, end_date, entity="binance")