In [1]:
%%capture
import matplotlib
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from datetime import timedelta, datetime, time, date
from matplotlib import rc
from tqdm.notebook import tqdm
from cycler import cycler

# Register tqdm's progress-bar integration with pandas.
# NOTE(review): tqdm documents the class-level call `tqdm.pandas()`; calling it on a freshly
# created instance also instantiates a bar without an iterable -- confirm and switch if so.
tqdm().pandas()

# Matplotlib text settings: serif font (Computer Modern), rendered through LaTeX (usetex=True).
fontsize = 12
rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': fontsize})
rc('text', usetex=True)

# Colour cycle and rcParams overrides that are applied globally via rcParams.update below.
color_list = ["#348ABD","#A60628","#7A68A6","#467821","#CF4457","#188487","#E24A33"]
style = {
  "lines.linewidth": 2.0,
  "axes.edgecolor": "#bcbcbc",
  "patch.linewidth": 0.5,
  "legend.fancybox": True,
  "axes.prop_cycle": cycler('color', color_list),
  "axes.facecolor": "#ffffff",
  "axes.labelsize": "large",
  "axes.grid": True,
  "patch.edgecolor": "#eeeeee",
  "axes.titlesize": "x-large",
  "svg.fonttype": "path"}

matplotlib.rcParams.update(style)
# Cap how many rows / columns pandas shows when a frame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 30)


# Input / output locations used throughout the notebook.
# NOTE(review): these are absolute paths on one specific machine; other cells additionally use
# the relative './Data/' folder. Consider a single configurable base directory.
data_path           = 'C:/Users/Stefa/Documents/Uni/Projektassistenz/Python/Data/'
output_path         = 'C:/Users/Stefa/Documents/Uni/Projektassistenz/Auswertung/'
fin_data_path       = 'C:/Users/Stefa/Documents/Uni/Projektassistenz/Financial Data/'   
extended_model_path = data_path+'Classification/DagoBERT/SCE_Loss_minw_25_lr_5e5_3ep_bs32_wd_1e2_a0_5_b3_NN1_w2v_topics/' 


def load_data(file):
    """
    Read a CSV (UTF-8 with optional BOM) and return it indexed by (Ticker, Date).

    The 'Date' column is parsed to datetimes and a column called
    'Prediction value', if present, is renamed to 'prediction_value'.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with a ('Ticker', 'Date') MultiIndex.
    """
    raw = pd.read_csv(file, encoding='utf-8-sig')
    parsed = raw.assign(Date=lambda frame: pd.to_datetime(frame.Date))
    indexed = parsed.set_index(['Ticker', 'Date'])
    return indexed.rename(columns={'Prediction value': 'prediction_value'})


def get_zscores(x, idiosync_z_scores, rw=int(255/2)):
    """
    Standardise each observation against the rolling window that precedes it.

    For every row the mean and standard deviation are taken over the previous
    ``rw`` rows (``closed='left'`` excludes the current row); at least
    ``int(0.9*rw)`` observations are required, otherwise the result is NaN.

    Parameters
    ----------
    x : pandas DataFrame/Series of returns.
    idiosync_z_scores : accepted for call compatibility; it does not influence the result.
    rw : int, rolling window length in rows (default 127).

    Returns
    -------
    Object of the same shape as ``x`` holding (x - rolling mean) / rolling std,
    with the index named 'Date'.
    """
    window = x.rolling(window=rw, closed='left', min_periods=int(0.9*rw))
    rolling_mean, rolling_std = window.mean(), window.std()
    z = (x - rolling_mean) / rolling_std
    z.index.name = 'Date'
    return z


def correctTickers(x):
    """
    Map an outdated or differently formatted ticker to the symbol used in this project.

    Renames applied (anything else is returned unchanged):
    ADS   -> BFH
    DISCA -> WBD
    HFC   -> DINO
    LB    -> BBWI
    VIAC  -> PARA
    BRK.B -> BRK
    BF.B  -> BF
    BLL   -> BALL
    COG   -> CTRA
    """
    renamed = {
        'ADS': 'BFH',
        'DISCA': 'WBD',
        'HFC': 'DINO',
        'LB': 'BBWI',
        'VIAC': 'PARA',
        'BRK.B': 'BRK',
        'BF.B': 'BF',
        'BLL': 'BALL',
        'COG': 'CTRA',
    }
    # Fall back to the input itself when no rename is registered.
    return renamed.get(x, x)

Load Data

In [16]:
no_dupl = True                # Load news events without duplicates
return_period = 'mo_to_mc'    # mo_to_mo, mc_to_mc 
                              # mo_to_mc, mc_to_mo

# The de-duplicated news file carries a '_freshness' suffix when freshness was considered.
consider_freshness = True
freshness = '_freshness' if consider_freshness else ''

# Column labels for the z-score / return windows built further down
# (the functions below assign r_cols to a 14-date return window).
z_cols = ['z_value_t_III', 'z_value_t_II', 'z_value_t_I', 'z_value_t', 'z_value_t_1']
r_cols = ['return_t_V', 'return_t_IV', 'return_t_III', 'return_t_II', 'return_t_I', 'return_t', 'return_t_1', 
          'return_t_2', 'return_t_3', 'return_t_4', 'return_t_5', 'return_t_6', 'return_t_7', 'return_t_8']
n_cols = []


# Load News Events Data
if no_dupl:
    # De-duplicated overnight news: indexed by (Date, Ticker), no exact timestamps.
    news_event_df = pd.read_csv(f'./Data/news_event_df_no_dupl{freshness}.csv', encoding='utf-8', index_col=0)
    news_event_df.Date = pd.to_datetime(news_event_df.Date)
    news_event_df.Ticker = news_event_df.Ticker.apply(correctTickers)
    news_event_df = news_event_df.set_index(['Date', 'Ticker']) 
    news_event_df = news_event_df.drop(['prev_close_date'], axis=1)    
else:
    # Full news file: indexed by (Timestamp_ET, Ticker).
    news_event_df = pd.read_csv('./Data/news_event_df.csv', encoding='utf-8')
    news_event_df.Timestamp_ET = pd.DatetimeIndex(news_event_df.Timestamp_ET)
    news_event_df.Ticker = news_event_df.Ticker.apply(correctTickers)
    news_event_df.Date = pd.to_datetime(news_event_df.Date)
    news_event_df = news_event_df.set_index(['Timestamp_ET', 'Ticker'])


# Return data of all constituents (file reference: Stock Return Calculation.ipynb)     
asset_returns = pd.read_csv(fin_data_path+f"Datastream/Data/return_{return_period}.csv", index_col=0)                 
asset_returns.index = pd.to_datetime(asset_returns.index)                                                        
# Infinite and exactly-zero returns are treated as missing.
asset_returns = asset_returns.replace([np.inf, -np.inf, 0.00000], np.nan)              
# Drop rows with more than 80% NaN values (vectorised NaN count per row instead of a Python-level apply).
asset_returns = asset_returns[asset_returns.isna().sum(axis=1) < 0.8*asset_returns.shape[1]]
asset_returns = asset_returns.loc[asset_returns.index >= datetime(1994,1,1)]
# Sample window: from one day before the first news index entry up to the last news index entry.
return_data = asset_returns.loc[news_event_df.index.get_level_values(0)[0]-timedelta(days=1) : news_event_df.index.get_level_values(0)[-1]]

# Load Beta values (file reference: Volatility adjusted labels.ipynb)
beta = pd.read_csv(fin_data_path+'beta_mc_to_mc_df_2022.csv', index_col=0)     
beta.index = pd.to_datetime(beta.index)
# Put the betas on the return dates: every return date gets the latest beta row at or before it.
beta = pd.merge_asof(left=asset_returns.loc[:, []], right=beta, left_index=True, right_index=True, direction='backward')

def TickerInSP500(date):
    """
    Return the tickers flagged with 1 in the latest ``TickerIX_W`` row at or before ``date``.

    ``TickerIX_W`` is the module-level membership matrix (dates in the index,
    tickers in the columns). The lookup uses ``Index.get_indexer(..., method='pad')``
    because the ``method`` argument of ``Index.get_loc`` was removed in pandas 2.0.

    Raises
    ------
    KeyError
        If ``date`` lies before the first row of ``TickerIX_W`` (same exception type
        the former ``get_loc(..., method='pad')`` call raised).
    """
    idx = TickerIX_W.index.get_indexer([date], method='pad')[0]
    if idx == -1:
        # get_indexer signals "no match" with -1; iloc[-1] would silently pick the last row.
        raise KeyError(date)
    membership = TickerIX_W.iloc[idx]
    return list(membership.loc[membership == 1].index.dropna())

# Membership matrix read by TickerInSP500 (a value of 1 marks a ticker as selected for that row's date).
# NOTE(review): file name suggests weekly S&P 500 constituent flags -- confirm.
TickerIX_W = pd.read_csv(fin_data_path+"TickerIX_W_06_2022.csv", index_col=0)
TickerIX_W.index = pd.to_datetime(TickerIX_W.index)
TickerIX_W = TickerIX_W.loc[datetime(1990,1,1):]

# S&P 500 index levels (semicolon-separated file).
sp500 = pd.read_csv(fin_data_path+'SPCOMP_1990_2022.csv', sep=';', index_col=0)
sp500.index = pd.to_datetime(sp500.index) 
# Total-return index level at the open: the total-return index scaled by OPENING PRICE / PRICE INDEX.
# NOTE(review): presumably 'PRICE INDEX' and 'TOT RETURN IND' are closing levels -- confirm.
sp500['TOT RETURN OPEN'] = (sp500['OPENING PRICE']/sp500['PRICE INDEX'])*sp500['TOT RETURN IND']
# Row-over-row percentage change of every column.
sp500_r = sp500.pct_change()  
# Intraday (open -> same-day total-return index) and overnight (previous row's index -> open) returns.
sp500_r['TR OpenClose'] = (sp500['TOT RETURN IND']  - sp500['TOT RETURN OPEN'])/sp500['TOT RETURN OPEN']
sp500_r['TR CloseOpen'] = (sp500['TOT RETURN OPEN'] - sp500['TOT RETURN IND'].shift(1))/sp500['TOT RETURN IND'].shift(1)

# Risk-free rate series, aligned to the S&P 500 dates (latest value at or before each date).
# NOTE(review): the unit/frequency of 'rf_rate' (per day vs. annualised, percent vs. decimal) is not visible here.
rf_3M = pd.read_csv(data_path+'rf_3M.csv')
rf_3M.Date = pd.to_datetime(rf_3M.Date)
rf_3M = rf_3M.set_index('Date')
rf_3M = rf_3M.rename(columns={'Value': 'rf_rate'})
rf_3M = pd.merge_asof(left=sp500.loc[:, []], right=rf_3M, left_index=True, right_index=True, direction='backward')

In [17]:
# Tickers that have at least one non-missing return inside the return_data window.
ticker_with_return_data = set((return_data.loc[:, return_data.isna().sum(axis=0) < return_data.shape[0]]).columns)
# Tickers whose membership flags sum to more than 0 between 2002-01-01 and 2020-02-28.
ticker_in_sp500         = set((TickerIX_W.loc[datetime(2002,1,1):datetime(2020,2,28)].loc[:, TickerIX_W.loc[datetime(2002,1,1):datetime(2020,2,28)].sum(axis=0) > 0]).columns)

print(f"Number of unique Tickers in the News: {len(news_event_df.index.get_level_values(1).unique())}")
print(f"Number of unique Tickers since 2002:  {return_data.shape[1]}")
print(f"Number of unique Tickers with available return data within 2002 to 2020: {len(ticker_with_return_data)}")
print(f"Number of unique Tickers that were in the S&P500 within 2002 and 2020:   {len(ticker_in_sp500)}")
print(f"Number of unique Tickers with available return data that were in the S&P500 within 2002 to 2020: {len(ticker_with_return_data.intersection(ticker_in_sp500))}")

# The "overnight" label assumes news_event_df was loaded with no_dupl=True.
print(f"Number of overnight news events: {news_event_df.shape[0]}")
# Hard-coded count: daytime_news is only built in the "Daytime News" cell further down the notebook.
print(f"Number of daytime news events:   {109952}")  #daytime_news.shape[0]

Number of unique Tickers in the News: 812
Number of unique Tickers since 2002:  1122
Number of unique Tickers with available return data within 2002 to 2020: 967
Number of unique Tickers that were in the S&P500 within 2002 and 2020:   880
Number of unique Tickers with available return data that were in the S&P500 within 2002 to 2020: 875
Number of overnight news events: 168653
Number of daytime news events:   109952


Calculate idiosyncratic returns  
$r^{idiosync}_{j,t} = r_{j,t} - r_{f,t} - \beta_{j,t-1}\,\left(r_{SP500,t} - r_{f,t}\right)$

In [11]:
# Pick the S&P 500 return column that covers the same interval as the stock returns.
# NOTE(review): for any other value of return_period `col` is never assigned in this cell.
if return_period   == 'mc_to_mc':
    col = 'TOT RETURN IND'
elif return_period == 'mo_to_mo':
    col = 'TOT RETURN OPEN'
elif return_period == 'mo_to_mc':
    col = 'TR OpenClose'
elif return_period == 'mc_to_mo':
    col = 'TR CloseOpen'
    
# Expected return per asset and date: beta * (market return - risk-free rate) + risk-free rate.
dates = beta.index
expected_returns = beta.multiply(sp500_r.loc[dates, col] - rf_3M.loc[dates, 'rf_rate'], axis='index').add(rf_3M.loc[dates, 'rf_rate'], axis='index')
# Abnormal return = realised return - expected return (full history).
abnormal_returns_full = asset_returns.loc[dates].subtract(expected_returns, axis='index')
# Same window as return_data: one day before the first news index entry up to the last one.
abnormal_returns = abnormal_returns_full.loc[news_event_df.index.get_level_values(0)[0]-timedelta(days=1) : news_event_df.index.get_level_values(0)[-1]]
#abnormal_returns.to_csv(fin_data_path+f"Datastream/Data/return_{return_period}_idiosync.csv", encoding='utf-8')    

Calculate z-scores

In [37]:
# Choose whether z-scores are computed from abnormal (idiosyncratic) or raw returns;
# the choice is encoded in the output file name through the '_idiosync' suffix.
idiosync_z_scores = True
idiosync = '_idiosync' if idiosync_z_scores else ''

if idiosync_z_scores:
    z_scores = get_zscores(abnormal_returns_full, idiosync_z_scores)    
else: 
    z_scores = get_zscores(asset_returns, idiosync_z_scores)    
    
# The file name embeds return_period; '127d_rw' corresponds to get_zscores' default window int(255/2).
# NOTE(review): abnormal_returns_full / asset_returns must stem from the same return_period --
# this depends on which cells were executed before this one.
z_scores.to_csv(fin_data_path+f'z_values_{return_period}_127d_rw{idiosync}_2022.csv', encoding='utf-8', index=True)

Match z-scores with news events and asset returns

In [12]:
use_abnormal_returns  = True
idiosync_z_scores     = True

def match_events_with_returns(news_event_df, use_abnormal_returns, no_dupl, news_window=17.5, order_time='moo'): 
    """
    Pair, for every trading date t, the previous day's z-scores of the index members with a
    14-date return window (labelled by r_cols) and with the news events attributed to t.

    If no_dupl = True we have no exact timestamps of the news arrival available. In this case, the index column Date is the
    date at market open. Thus, overnight news are linked with the next market open date. 
    
    For daytime news with order_time = 'moc': no_dupl = False

    Reads the module-level names fin_data_path, idiosync, abnormal_returns, return_data, r_cols
    and TickerInSP500. The z-scores are always read from the 'z_values_mo_to_mc_127d_rw...' file.

    Parameters
    ----------
    news_event_df : DataFrame indexed by (Date, Ticker) if no_dupl else (Timestamp_ET, Ticker).
    use_abnormal_returns : bool, use abnormal_returns instead of return_data.
    no_dupl : bool, see above.
    news_window : float, look-back window in hours before the order time (only used if no_dupl is False).
    order_time : 'moo' (window ends 09:30) or 'moc' (window ends 16:00).

    Returns
    -------
    DataFrame with one row per (ticker, date[, news event]); the ticker column is named 'Ticker'.

    Raises
    ------
    ValueError
        If order_time is unsupported while no_dupl is False, or if no rows could be built.
    """
    z_values = pd.read_csv(fin_data_path+f'z_values_mo_to_mc_127d_rw{idiosync}_2022.csv', encoding='utf-8', index_col=0)
    z_values.index = pd.to_datetime(z_values.index)

    return_df = abnormal_returns if use_abnormal_returns else return_data

    # Split the fractional hour count into whole hours and minutes.
    news_window_h   = int(news_window)
    news_window_min = int((news_window % 1)*60)
    news_window_td  = timedelta(hours=news_window_h, minutes=news_window_min)

    dates = list(z_values.loc[datetime(2002,1,2):datetime(2020,1,31)].index)
    # NOTE(review): with no_dupl=False, level 0 holds full timestamps, so a trading date
    # (midnight) will rarely be contained in this set -- confirm the intended matching.
    news_dates = set(news_event_df.index.get_level_values(0).unique())

    z_values  = z_values.loc[dates]
    z_values  = z_values.stack(dropna=False)
    # Put the returns on exactly the z-score dates (missing dates become NaN rows).
    return_df = pd.merge(left=pd.DataFrame(index=dates), right=return_df, left_index=True, right_index=True, how='left')

    # Loop-invariant: the timestamp level is only available in the non-deduplicated layout.
    news_timestamps = None if no_dupl else news_event_df.index.get_level_values('Timestamp_ET')

    # Collect the per-date frames and concatenate once at the end; growing a DataFrame
    # with pd.concat inside the loop copies all previous rows on every iteration.
    frames = []
    for i, date in tqdm(enumerate(dates[:-8])):
        if i < 5:
            continue    # the return window reaches five dates back

        Z = z_values.loc[dates[i-1], slice(None)].to_frame(name='z_value').reset_index(level=0)
        Z = Z.loc[list(set(Z.index).intersection(set(TickerInSP500(dates[i-1]))))]     # Select only assets that are in the S&P500
        Z.Date = date
        R = return_df.loc[dates[i-5:i+8+1], Z.index].T
        R.columns = r_cols

        if date in news_dates:
            if no_dupl:
                if order_time != 'moo':
                    print("Use order_time='moo' or set no_dupl=False")
                    break
                N = news_event_df.loc[(dates[i], slice(None)), :]
            else:
                if order_time == 'moo':      # Market open order
                    rw_end = datetime.combine(dates[i], time(9,30))
                elif order_time == 'moc':    # Market close order
                    rw_end = datetime.combine(dates[i], time(16,0))
                else:
                    raise ValueError(f"Unsupported order_time {order_time!r}; use 'moo' or 'moc'")
                rw_start = rw_end - news_window_td
                N = news_event_df.loc[((rw_start <= news_timestamps) & (news_timestamps <= rw_end), slice(None)), :]
        else:
            N = pd.DataFrame(columns=news_event_df.columns)

        frames.append(Z.join([R, N.reset_index(level=0, drop=True)], how='left').reset_index())

    if not frames:
        raise ValueError("No event rows were generated; check the date range, order_time and no_dupl settings")

    dataset = pd.concat(frames, axis=0)
    dataset = dataset.rename(columns={'index':'Ticker'})
    return dataset


# File-name suffixes that encode the configuration of this run.
abn           = '_abn' if use_abnormal_returns else '' 
idiosync      = '_idiosync' if idiosync_z_scores else ''
no_duplicates = '_no_dupl'  if no_dupl else ''

# One branch per return interval. Only 'mc_to_mc' uses a 6.5h window with a market-close order;
# the other three use a 17.5h window with a market-open order. The window length is part of the file name.
# NOTE(review): with no_dupl=True, match_events_with_returns only supports order_time='moo',
# so the 'mc_to_mc' branch requires no_dupl=False.
if return_period == 'mc_to_mc':
    new_df = match_events_with_returns(news_event_df, use_abnormal_returns, no_dupl, news_window=6.5,  order_time='moc')    # order_time: 'moc' or 'moo'
    new_df.to_csv(f'./Data/event_data{abn}_returns_mc_to_mc_6_5h_z_val{idiosync}{no_duplicates}{freshness}_'+str(2002)+'-'+str(2021)+'_v2.csv', encoding='utf-8-sig', index=False)  
    
if return_period == 'mo_to_mo':
    new_df = match_events_with_returns(news_event_df, use_abnormal_returns, no_dupl, news_window=17.5, order_time='moo')  
    new_df.to_csv(f'./Data/event_data{abn}_returns_mo_to_mo_17_5h_z_val{idiosync}{no_duplicates}{freshness}_'+str(2002)+'-'+str(2021)+'_v2.csv', encoding='utf-8-sig', index=False)      
    
if return_period == 'mo_to_mc':
    new_df = match_events_with_returns(news_event_df, use_abnormal_returns, no_dupl, news_window=17.5, order_time='moo')  
    new_df.to_csv(f'./Data/event_data{abn}_returns_mo_to_mc_17_5h_z_val{idiosync}{no_duplicates}{freshness}_'+str(2002)+'-'+str(2021)+'_v2.csv', encoding='utf-8-sig', index=False)   
    
if return_period == 'mc_to_mo':
    new_df = match_events_with_returns(news_event_df, use_abnormal_returns, no_dupl, news_window=17.5, order_time='moo')  
    new_df.to_csv(f'./Data/event_data{abn}_returns_mc_to_mo_17_5h_z_val{idiosync}{no_duplicates}{freshness}_'+str(2002)+'-'+str(2021)+'_v2.csv', encoding='utf-8-sig', index=False)

0it [00:00, ?it/s]

Combine close-to-open and open-to-close returns in one file

In [36]:
# Duplicate the date-indexed betas onto two intraday timestamps per date:
# 09:30 (market open) and 16:00 (market close).
beta_o = beta.copy()
beta_c = beta.copy()
beta_o.index = beta_o.index + timedelta(hours=9, minutes=30)
beta_c.index = beta_c.index + timedelta(hours=16, minutes=0)
beta_oc = pd.concat([beta_o, beta_c]).sort_index()

# S&P 500 returns: the close-to-open return is stamped 09:30, the open-to-close return 16:00.
# Work on copies (as done for beta and rf_3M): assigning a new index to the Series object
# returned by sp500_r[...] can otherwise leak into sp500_r and makes re-running this cell
# shift the timestamps a second time.
sp500_close_open = sp500_r['TR CloseOpen'].copy()
sp500_open_close = sp500_r['TR OpenClose'].copy()
sp500_close_open.index = sp500_close_open.index + timedelta(hours=9, minutes=30)
sp500_open_close.index = sp500_open_close.index + timedelta(hours=16, minutes=0)
sp500_returns = pd.concat([sp500_close_open, sp500_open_close], axis=0).sort_index()
sp500_returns.name = 'TR_SP500'

# Risk-free rate on the same two intraday timestamps.
rf_3M_o = rf_3M.copy()
rf_3M_c = rf_3M.copy()
rf_3M_o.index = rf_3M_o.index + timedelta(hours=9, minutes=30)
rf_3M_c.index = rf_3M_c.index + timedelta(hours=16, minutes=0)
rf_3M_oc = pd.concat([rf_3M_o, rf_3M_c]).sort_index()

# Open-to-close stock returns, stamped at 16:00; inf and exact zeros are treated as missing.
R_mo_to_mc = pd.read_csv(fin_data_path+f"Datastream/Data/return_mo_to_mc.csv", index_col=0)          
R_mo_to_mc.index = pd.to_datetime(R_mo_to_mc.index)                                                        
R_mo_to_mc = R_mo_to_mc.replace([np.inf, -np.inf, 0.00000], np.nan)
R_mo_to_mc.insert(0, 'interval', 'mo_to_mc')
R_mo_to_mc.index = R_mo_to_mc.index + timedelta(hours=16)

# Close-to-open stock returns, stamped at 09:30.
R_mc_to_mo = pd.read_csv(fin_data_path+f"Datastream/Data/return_mc_to_mo.csv", index_col=0)          
R_mc_to_mo.index = pd.to_datetime(R_mc_to_mo.index)                                                        
R_mc_to_mo = R_mc_to_mo.replace([np.inf, -np.inf, 0.00000], np.nan)              
R_mc_to_mo.insert(0, 'interval', 'mc_to_mo')
R_mc_to_mo.index = R_mc_to_mo.index + timedelta(hours=9, minutes=30)

# NOTE: this rebinds the notebook-level asset_returns / return_data to the intraday
# (two rows per date, plus a string 'interval' column) layout; cells that expect the
# daily layout must not be run after this one without reloading.
asset_returns = pd.concat([R_mo_to_mc, R_mc_to_mo], axis=0).sort_index()
return_data = asset_returns.loc[news_event_df.index.get_level_values(0)[0]-timedelta(days=1) : news_event_df.index.get_level_values(0)[-1]]
#R_mc_to_mo = R_mc_to_mo[R_mc_to_mo.apply(lambda x: sum(x.isna()), axis=1) < 0.8*R_mc_to_mo.shape[1]] # Drop rows with more than 80% NaN values

In [37]:
# Intraday timestamps (09:30 and 16:00 of every beta date) on which expected returns are evaluated.
timestamps = beta_oc.index

# Expected return: beta * (market return - risk-free rate) + risk-free rate; abnormal = realised - expected.
# NOTE(review): asset_returns carries the string column 'interval' at this point -- confirm that the
# subtraction below treats it as intended.
expected_returns = beta_oc.multiply(sp500_returns[timestamps] - rf_3M_oc.loc[timestamps, 'rf_rate'], axis='index').add(rf_3M_oc.loc[timestamps, 'rf_rate'], axis='index')
abnormal_returns = asset_returns.loc[timestamps].subtract(expected_returns, axis='index')

In [38]:
def match_news_with_returns(news_event_df, use_abnormal_returns, news_window=17.5, order_time='moo'):    
    """
    Attach open- and close-stamped returns and z-scores around each news date to the news events.

    For every news date (index position i) the function collects, per ticker with news on that date,
    the 09:30 ('o') and 16:00 ('c') values of the returns on the 14 news dates i-5 .. i+8 (r_cols)
    and of the z-scores on the 5 news dates i-3 .. i+1 (z_cols), and inner-joins them with the news rows.

    Reads the module-level names no_dupl, idiosync, fin_data_path, abnormal_returns, return_data,
    r_cols and z_cols.
    NOTE(review): the offsets are taken over the news dates, not over the trading calendar.

    Parameters
    ----------
    news_event_df : DataFrame indexed by (Date, Ticker) if no_dupl else (Timestamp_ET, Ticker).
    use_abnormal_returns : bool, use abnormal_returns instead of return_data.
    news_window : float, look-back window in hours before 09:30 (only used if no_dupl is False).
    order_time : only 'moo' is supported.

    Returns
    -------
    DataFrame with the news columns followed by the '<r_col>o/c' and '<z_col>o/c' columns.

    Raises
    ------
    ValueError
        If order_time is not 'moo' while no_dupl is False, or if no rows could be built.
    """
    return_df = abnormal_returns if use_abnormal_returns else return_data

    news_window_h   = int(news_window)
    news_window_min = int((news_window % 1)*60)
    timestamps = list(return_data.index)
    dates = list(news_event_df.index.get_level_values(0).unique())
    print(len(dates))

    # Open-to-open z-scores are stamped at 09:30 ...
    z_scores_oo = pd.read_csv(fin_data_path+f'z_values_mo_to_mo_127d_rw{idiosync}_2022.csv', encoding='utf-8')
    z_scores_oo.Date = pd.to_datetime(z_scores_oo.Date)
    z_scores_oo = z_scores_oo.set_index('Date')
    z_scores_oo.index = z_scores_oo.index + timedelta(hours=9, minutes=30)
    z_scores_oo.insert(0, 'interval', 'mo_to_mo')

    # ... open-to-close z-scores at 16:00.
    z_scores_oc = pd.read_csv(fin_data_path+f'z_values_mo_to_mc_127d_rw{idiosync}_2022.csv', encoding='utf-8')
    z_scores_oc.Date = pd.to_datetime(z_scores_oc.Date)
    z_scores_oc = z_scores_oc.set_index('Date')
    z_scores_oc.index = z_scores_oc.index + timedelta(hours=16, minutes=0)
    z_scores_oc.insert(0, 'interval', 'mo_to_mc')

    z_scores = pd.concat([z_scores_oo, z_scores_oc]).sort_index()
    z_scores = pd.merge(left=pd.DataFrame(index=timestamps), right=z_scores, left_index=True, right_index=True, how='left')

    session_clock = (('o', time(9,30)), ('c', time(16,0)))

    # Collect per-date results and concatenate once; concatenating inside the loop is quadratic.
    merged_frames = []
    for i, date in tqdm(enumerate(dates[:-8])):
        if i < 5:
            continue    # the return window reaches five news dates back

        if no_dupl:
            if order_time != 'moo':
                print("Use order_time='moo' or set no_dupl=False")
                break
            df = news_event_df.loc[(dates[i], slice(None)), :]
        else:
            if order_time != 'moo':
                raise ValueError(f"Unsupported order_time {order_time!r}; only 'moo' is implemented")
            rw_end   = datetime.combine(date, time(9,30))
            rw_start = rw_end - timedelta(hours=news_window_h, minutes=news_window_min)

            df = news_event_df.loc[((rw_start <= news_event_df.index.get_level_values('Timestamp_ET')) & 
                                     (news_event_df.index.get_level_values('Timestamp_ET') <= rw_end), slice(None)), 
                                     ['Date', 'Sentiment', 'freshness', 'topic_1', 'topic_2', 'topic_3', 'topic_4']]

        ticker = df.index.get_level_values('Ticker').unique()

        # One Series per (column label, session); same column order as before:
        # all return columns first (open, close per date), then all z-score columns.
        pieces = []
        for j, return_j in enumerate(r_cols):
            for t, clock in session_clock:
                pieces.append(return_df.loc[datetime.combine(dates[i-5+j], clock), ticker].rename(return_j + t))

        for j, zval_j in enumerate(z_cols):
            for t, clock in session_clock:
                pieces.append(z_scores.loc[datetime.combine(dates[i-3+j], clock), ticker].rename(zval_j + t))

        values = pd.concat(pieces, axis=1)
        merged_frames.append(pd.merge(df.reset_index(), values, on='Ticker', how = 'inner'))

    if not merged_frames:
        raise ValueError("No event rows were generated; check the news dates, order_time and no_dupl settings")

    new_df = pd.concat(merged_frames)
    return new_df



# Run configuration and the file-name suffixes derived from it.
use_abnormal_returns  = True
idiosync_z_scores     = True
abn = '_abn' if use_abnormal_returns else '' 
idiosync = '_idiosync' if idiosync_z_scores else ''
no_duplicates = '_no_dupl'  if no_dupl else ''

# Build the combined open/close event table and save it ('oc' marks the combined file;
# unlike the per-interval files, no freshness suffix is added to this file name).
new_df = match_news_with_returns(news_event_df, use_abnormal_returns, news_window=17.5, order_time='moo')
new_df.to_csv(f'./Data/event_data{abn}_returns_oc_17_5h_z_val{idiosync}{no_duplicates}_'+str(2002)+'-'+str(2021)+'_v2.csv', encoding='utf-8-sig', index=False)  

4517


0it [00:00, ?it/s]

Calculate Beta values

In [3]:
# Calculate Betas
def calcBetas(X, rw=52*2):    
    """
    Rolling market betas of every column of ``X`` against the 'SP500Return' column.

    For row position t >= rw the beta is cov(asset, market) / var(market), both with ddof=1,
    computed on the ``rw`` rows that precede t (label slice dates[t-rw] .. dates[t-1]).
    A column is only estimated if it has fewer than 0.9*rw exact zeros inside the window;
    all other cells stay NaN.
    NOTE(review): the zero count presumably acts as a proxy for missing observations -- confirm
    that the 0.9 threshold is the intended one.

    Parameters
    ----------
    X : DataFrame of returns including a 'SP500Return' column.
    rw : int, window length in rows (default 104).

    Returns
    -------
    DataFrame (index of X, asset columns of X) with the estimated betas.
    """
    is_market = X.columns.isin(['SP500Return'])
    x = X.loc[:, ~is_market]
    y = X.loc[:,  is_market]
    dates   = x.index
    beta_df = pd.DataFrame(index=dates, columns=x.columns)
    ones    = np.ones(rw)    # reused for the window means

    for t in tqdm(range(rw, len(dates))):
        first, last = dates[t-rw], dates[t-1]
        window = x.loc[first:last, :]
        # select only those ticker with enough available observations
        sel_ticker = (window==0).sum(axis=0) < 0.9*rw
        assets = window.loc[:, sel_ticker].values
        market = y.loc[first:last, :].values.flatten()
        assets_mean = (ones@assets)/rw
        market_mean = (ones@market)/rw
        cov = ((market - market_mean) @ (assets - assets_mean))/(rw-1)
        var = np.var(market, ddof=1)
        beta_df.loc[dates[t], sel_ticker] = cov/var
        
    return beta_df

# Load return data
# Close-to-close stock returns, columns sorted by name.
# NOTE: this rebinds the notebook-level name return_data that other cells also use.
return_data = pd.read_csv(fin_data_path+'Datastream/Data/return_mc_to_mc.csv', index_col=0)
return_data.index = pd.to_datetime(return_data.index)
return_data = return_data.reindex(sorted(return_data.columns), axis=1)

# S&P total return data
sp500 = pd.read_csv(fin_data_path+'SPCOMP_1990_2022.csv', sep=';', index_col=0)
# Row-over-row return of the total-return index (computed in file order, before the index is parsed).
sp500['SP500Return'] = sp500['TOT RETURN IND'].pct_change()
sp500.index = pd.to_datetime(sp500.index)

def resample_data(df, freq='W'):
    """
    Compound simple returns to a lower frequency (weekly by default).

    Returns within each right-closed, right-labelled ``freq`` bin are chained as
    prod(1 + r) - 1.
    """
    gross = df + 1
    compounded = gross.resample(freq, label='right', closed='right').prod()
    return compounded - 1

# Stock returns and the market return on their common dates, compounded with resample_data's
# default frequency ('W').
X = pd.merge(left=return_data, right=sp500.SP500Return, left_index=True, right_index=True, how='inner')
X = resample_data(X)

# 104-row rolling betas (two years if the rows are weekly); this is the file loaded as `beta` in the "Load Data" cell.
beta_df = calcBetas(X, rw=52*2)
beta_df.to_csv(fin_data_path+'beta_mc_to_mc_df_2022.csv', index=True)

  0%|          | 0/1589 [00:00<?, ?it/s]

Calculate close-to-open return

In [None]:
# Total-return index levels at the open and at the close, indexed by Date.
tot_return_open  = pd.read_csv(fin_data_path+'Datastream/Data/tot_return_open_index.csv', low_memory=False)
tot_return_close = pd.read_csv(fin_data_path+'Datastream/Data/tot_return_close_index.csv', low_memory=False)
tot_return_open['Date']  = pd.to_datetime(tot_return_open['Date']) 
tot_return_close['Date'] = pd.to_datetime(tot_return_close['Date']) 
tot_return_open  = tot_return_open.set_index('Date')
tot_return_close = tot_return_close.set_index('Date')
# Overnight return: today's open level relative to the previous row's close level.
return_close_to_open = (tot_return_open - tot_return_close.shift(periods=1))/tot_return_close.shift(periods=1)
return_close_to_open.to_csv(fin_data_path+'Datastream/Data/return_mc_to_mo.csv', encoding='utf-8', index=True)

Calculate open-to-close return

In [None]:
# Total-return index levels at the open and at the close, indexed by Date
# (same inputs as the close-to-open cell; reloaded so this cell can run on its own).
tot_return_open  = pd.read_csv(fin_data_path+'Datastream/Data/tot_return_open_index.csv', low_memory=False)
tot_return_close = pd.read_csv(fin_data_path+'Datastream/Data/tot_return_close_index.csv', low_memory=False)
tot_return_open['Date']  = pd.to_datetime(tot_return_open['Date']) 
tot_return_close['Date'] = pd.to_datetime(tot_return_close['Date']) 
tot_return_open  = tot_return_open.set_index('Date')
tot_return_close = tot_return_close.set_index('Date')

# Intraday return: same-day close level relative to the open level.
return_open_to_close = (tot_return_close-tot_return_open)/tot_return_open
return_open_to_close.to_csv(fin_data_path+'Datastream/Data/return_mo_to_mc.csv', encoding='utf-8', index=True)

Create Simplified News Events Dataframe  
*Source File: 'train_valid_data_inkl_pred_fresh_w2v_topics_2002-2021.csv'*

In [None]:
# Load the model output file and index it by (timezone-naive) news timestamp, oldest first.
news_event_df = pd.read_csv(extended_model_path+'train_valid_data_inkl_pred_fresh_w2v_topics_'+str(2002)+'-'+str(2021)+'.csv', encoding='utf-8-sig', index_col=0)
news_event_df.Timestamp_ET = pd.DatetimeIndex(news_event_df.Timestamp_ET).tz_localize(None)
news_event_df.Date = pd.to_datetime(news_event_df.Date)
news_event_df = news_event_df.sort_values((['Timestamp_ET']),ascending=True)
# NOTE(review): the trailing [:-1] drops the last (latest) row; the reason is not visible here.
news_event_df = news_event_df.set_index(['Timestamp_ET'])[:-1]
# Replace the file's own 'Sentiment' column by the score positive - negative.
news_event_df = news_event_df.drop(['Sentiment'], axis=1)
news_event_df['Sentiment'] = news_event_df.positive-news_event_df.negative 

# Insert a TradingDate column: for news released after trading hours on date t it will
# hold the date t+1 of the next market open; it starts out as a copy of Date.
news_event_df['TradingDate'] = news_event_df.Date.copy()

cols = ['Date', 'TradingDate', 'Ticker', 'Sentiment', 'freshness', 'topic_1', 'topic_2', 'topic_3', 'topic_4']
news_event_df = news_event_df[cols]

# Keep only news whose Date is a known trading day; the last trading day is excluded so that a
# following trading day always exists.
# NOTE(review): return_data is taken from the kernel state -- it must be in the daily layout here.
trading_days   = np.array(list(return_data.index))
news_event_df  = news_event_df.loc[news_event_df.Date.isin(trading_days[:-1])]

def get_market_open_date(t):
    """
    Return the entry of the module-level ``trading_days`` array that directly follows ``t``.

    Raises IndexError if ``t`` is not contained in ``trading_days`` or is its last entry.
    """
    position = np.flatnonzero(trading_days == t)[0]
    return trading_days[position + 1]

# News stamped strictly after 16:00 is assigned to the next trading day.
# NOTE(review): news stamped exactly 16:00:00 keeps its own date here, while the de-duplication
# cell's overnight window starts at 16:00 inclusive -- confirm the intended boundary.
market_open_date = news_event_df.loc[news_event_df.index.time > time(16,0), 'Date'].apply(get_market_open_date)
news_event_df.loc[news_event_df.index.time > time(16,0), 'TradingDate'] = market_open_date.copy()

# Final layout: indexed by (Timestamp_ET, Ticker).
news_event_df = news_event_df.reset_index()
news_event_df = news_event_df.set_index(['Timestamp_ET', 'Ticker'])

news_event_df.to_csv('./Data/news_event_df.csv', encoding='utf-8')

Remove duplicate overnight news (pre- and after midnight) from news_event_df

In [95]:
# News published between 4pm and 12am are merged into one document and news published between
# 12am and 9:30am are merged into a separate document. Thus there can exist two separate overnight
# news documents of one company. This script collapses such duplicates by taking the mean of the
# numeric columns (e.g. the sentiment) of the news articles.
# NOTE(review): with consider_freshness=True the mean is taken per (Ticker, freshness), so a ticker
# whose overnight documents differ in freshness still yields more than one row -- confirm intent.

# Load News Events Data
news_event_df = pd.read_csv('./Data/news_event_df.csv', encoding='utf-8')
news_event_df.Timestamp_ET = pd.DatetimeIndex(news_event_df.Timestamp_ET)
news_event_df.Date = pd.to_datetime(news_event_df.Date)
news_event_df = news_event_df.set_index(['Timestamp_ET'])

consider_freshness = True
freshness = '_freshness' if consider_freshness else ''
if consider_freshness:
    cols = ['Ticker', 'Sentiment', 'freshness', 'topic_1', 'topic_2', 'topic_3', 'topic_4']
else:
    cols  = ['Ticker', 'Sentiment', 'topic_1', 'topic_2', 'topic_3', 'topic_4']
dates = np.unique(news_event_df.index.get_level_values(0).date) 

# Keys that identify one merged overnight document.
group_keys = ['Ticker', 'freshness'] if consider_freshness else 'Ticker'

# Collect the per-night frames and concatenate once after the loop; concatenating inside the
# loop copies all previously collected rows on every iteration.
# (The loop no longer rebinds the name `date`, which shadowed the imported datetime.date.)
overnight_frames = []
for i in tqdm(range(1, len(dates)-1)):
    # Overnight window: previous calendar news date 16:00 up to this date's 09:30 (both inclusive).
    c = datetime.combine(dates[i-1], time(16,0))
    o = datetime.combine(dates[i],   time(9,30))
    
    overnight = news_event_df.loc[c:o, cols]
    is_dupl   = overnight['Ticker'].duplicated(keep=False)

    not_duplicated = overnight.loc[~is_dupl].reset_index(drop=True)
    duplicated     = overnight.loc[is_dupl].groupby(group_keys).mean().reset_index()
    
    not_duplicated['Date'] = dates[i]
    duplicated['Date']     = dates[i]
    not_duplicated['prev_close_date'] = dates[i-1]
    duplicated['prev_close_date']     = dates[i-1]    
    
    overnight_frames.extend([not_duplicated, duplicated])
        
        
news_event_df_no_dupl = pd.concat(overnight_frames, axis=0)
news_event_df_no_dupl = news_event_df_no_dupl.reset_index(drop=True)
news_event_df_no_dupl.to_csv(f'./Data/news_event_df_no_dupl{freshness}.csv', encoding='utf-8')

  0%|          | 0/4520 [00:00<?, ?it/s]

Merge news_event_df with close-to-open and open-to-close returns

In [None]:
# Load the de-duplicated news indexed by (Ticker, Date).
# NOTE(review): this reads the file without the '_freshness' suffix and expects a
# 'prediction_value' (or 'Prediction value') column, whereas the de-duplication cell in this
# notebook writes a 'Sentiment' column -- confirm which file version this cell targets.
news_event_df_no_dupl = load_data('./Data/news_event_df_no_dupl.csv')

# Keep only the score (renamed to 'sentiment') and the previous close date, sorted by index.
df = news_event_df_no_dupl.loc[:, ['prediction_value', 'prev_close_date']]
df = df.rename(columns={'prediction_value':'sentiment'})
df = df.sort_index()
df.to_csv('./Data/news_event_df_no_dupl_minimal.csv', encoding='utf-8')

Match returns with z-score events

In [10]:
def z_scores_and_returns(z_values, use_abnormal_returns=False, idiosync_z_scores=False, exclude_conditional=False):    
    """
    Pair every (date, index member) z-score with a 14-date window of returns.

    For each date t (2002-01-02 .. 2020-01-31, skipping the first five and the last ten dates)
    the z-scores of the tickers returned by TickerInSP500(t) are joined with the returns on the
    dates t-5 .. t+8, labelled by r_cols. Rows where all 14 returns are missing are dropped.

    Reads the module-level names abnormal_returns / asset_returns, r_cols, TickerInSP500 and,
    if exclude_conditional is set, test_data_pred.

    Parameters
    ----------
    z_values : DataFrame of z-scores (dates in the index, tickers in the columns). Not modified.
    use_abnormal_returns : bool, use abnormal_returns instead of asset_returns.
    idiosync_z_scores : accepted for call compatibility; it does not influence the result.
    exclude_conditional : bool, blank out the z-scores of all (ticker, date) pairs in test_data_pred.index.

    Returns
    -------
    DataFrame indexed by ticker with the columns of the z-score frame (date level and 'z_value') and r_cols.

    Raises
    ------
    ValueError
        If the date range yields no rows.
    """
    if use_abnormal_returns:
        print('Load abnormal returns')
        return_df = abnormal_returns
    else:
        print('Load total returns')
        return_df = asset_returns
        
    if exclude_conditional:
        print('Exclude conditional observations ...')
        # Work on a copy so that the caller's frame is not altered as a side effect.
        z_values = z_values.copy()
        for (ticker, date) in tqdm(test_data_pred.index.values):
            z_values.loc[date, ticker] = np.nan
        
    dates    = list(z_values.loc[datetime(2002,1,2):datetime(2020,1,31)].index)
    z_values = z_values.loc[dates]
    z_values = z_values.stack(dropna=False)
    # Put the returns on exactly the z-score dates (missing dates become NaN rows).
    return_df= pd.merge(left=pd.DataFrame(index=dates), right=return_df, left_index=True, right_index=True, how='left')

    # Collect per-date frames and concatenate once; concatenating inside the loop is quadratic.
    frames = []
    for i, date in tqdm(enumerate(dates[:-10])):
        if i <= 4:
            continue    # the return window reaches five dates back

        ticker_in_sp500_t = set(TickerInSP500(dates[i]))
        Z = z_values.loc[dates[i], slice(None)].to_frame(name='z_value').reset_index(level=0)
        Z = Z.loc[list(set(Z.index).intersection(ticker_in_sp500_t))]          # Select only assets that are in the S&P500
        R = return_df.loc[dates[i-5:i+8+1], Z.index].T
        R.columns = r_cols
        frames.append(pd.merge(left=Z, right=R, left_index=True, right_index=True, how='inner'))

    if not frames:
        raise ValueError("No z-score rows were generated; the z-score frame covers too few dates in 2002-2020")

    z_values_returns = pd.concat(frames, axis=0)

    # Remove rows where all returns are missing        
    z_values_returns = z_values_returns.loc[z_values_returns[r_cols].isna().sum(axis=1)!=len(r_cols)]        
    
    return z_values_returns




# Run configuration and the file-name suffixes derived from it.
use_abnormal_returns = True
idiosync_z_scores    = True
exclude_conditional  = False

abn = '_abn' if use_abnormal_returns else '' 
idiosync = '_idiosync' if idiosync_z_scores else ''
excl_cond = '_excl_cond' if exclude_conditional else ''

# The z-scores always come from the open-to-close file, independent of return_period.
z_scores = pd.read_csv(fin_data_path+f'z_values_mo_to_mc_127d_rw{idiosync}_2022.csv', encoding='utf-8')      
z_scores.Date = pd.to_datetime(z_scores.Date)                                   
z_scores = z_scores.set_index('Date')
        

z_values_returns = z_scores_and_returns(z_scores, 
                                        use_abnormal_returns, 
                                        idiosync_z_scores,
                                        exclude_conditional           # exclude observations where news articles were released
                                       )

# Save the Data Frame
# NOTE(review): the file name is built from return_period, but the returns themselves are whatever
# abnormal_returns / asset_returns currently hold in the kernel -- make sure both match.
filename = f'z_values_127d_mo_to_mc{idiosync}{abn}_returns_{return_period}{excl_cond}_'+str(2002)+'-'+str(2021)+'.csv'
z_values_returns.to_csv('./Data/'+filename, encoding='utf-8-sig', index=True)
print(filename)

Load abnormal returns


0it [00:00, ?it/s]

z_values_127d_mo_to_mc_idiosync_abn_returns_mc_to_mo_2002-2021.csv


Generate z_score_event_df file containing all z-scores together with  close-to-open and open-to-close returns

In [12]:
# File 1: z-scores matched with close-to-open (mc_to_mo) returns, indexed by (Ticker, Date).
z_values_df_1 = pd.read_csv(f'./Data/z_values_127d_mo_to_mc_idiosync_abn_returns_mc_to_mo_2002-2021.csv', encoding='utf-8-sig', index_col=0)   
z_values_df_1.index.name = 'Ticker'
z_values_df_1.Date = pd.to_datetime(z_values_df_1.Date)
z_values_df_1 = z_values_df_1.set_index(['Date'], append=True)
z_values_df_1 = z_values_df_1.sort_index()
print(z_values_df_1.shape)

# File 2: the same z-scores matched with open-to-close (mo_to_mc) returns.
z_values_df_2 = pd.read_csv(f'./Data/z_values_127d_mo_to_mc_idiosync_abn_returns_mo_to_mc_2002-2021.csv', encoding='utf-8-sig', index_col=0)   
z_values_df_2.index.name = 'Ticker'
z_values_df_2.Date = pd.to_datetime(z_values_df_2.Date)
z_values_df_2 = z_values_df_2.set_index(['Date'], append=True)
z_values_df_2 = z_values_df_2.sort_index()
print(z_values_df_2.shape)

# Side-by-side: z-score plus the 'return_t_1' column of each file, aligned on (Ticker, Date).
# The two files differ in row count (see the printed shapes), so unmatched rows get NaN.
z_score_df = pd.concat([z_values_df_1[['z_value', 'return_t_1']], z_values_df_2[['return_t_1']]], axis=1)
z_score_df.columns = ['z_score', 'return_mc_to_mo', 'return_mo_to_mc']
z_score_df.to_csv('./Data/z_score_event_df.csv', encoding='utf-8')

(2318782, 15)
(2316109, 15)


Merge news events with z-scores

In [None]:
# Read without index_col, so 'Ticker' and 'Date' are ordinary columns here.
z_score_df = pd.read_csv('./Data/z_score_event_df.csv', encoding='utf-8')
z_score_df.Date = pd.to_datetime(z_score_df.Date)

# Number the distinct dates 0, 1, 2, ... so that "next date" becomes "index + 1".
alldates = np.unique(z_score_df.Date)
alldates = pd.DataFrame(data={'Date':alldates, 'Date_Index': np.arange(0, len(alldates))})

# Lookup table: z-score per (Date_Index, Ticker), to be attached as the next date's z-score.
temp1 = pd.merge(left=z_score_df, right=alldates, on='Date', how='left')
temp1 = temp1.rename(columns={'z_score':'z_score_tp1'})
temp1 = temp1[['Ticker', 'Date_Index', 'z_score_tp1']]
temp1 = temp1.set_index(['Date_Index', 'Ticker'])

# For every row, fetch the same ticker's z-score at Date_Index + 1 (NaN if there is none).
dataset = pd.merge(left=z_score_df, right=alldates, on='Date', how='left')
dataset['Date_Index_Merge'] = dataset.Date_Index+1
# The return columns are only renamed, not shifted.
dataset = dataset.rename(columns={'return_mc_to_mo':'return_mc_to_mo_tp1', 'return_mo_to_mc':'return_mo_to_mc_tp1'})
dataset = pd.merge(left=dataset, right=temp1, left_on=['Date_Index_Merge', 'Ticker'], right_index=True, how='left')
dataset = dataset.drop(['Date_Index', 'Date_Index_Merge'], axis=1)

dataset.to_csv('./Data/z_score_event_df_sentiment_v2.csv', encoding='utf-8')

Daytime News

In [None]:
# Full (not de-duplicated) news file with parsed timestamp/date columns.
news_events = pd.read_csv("./Data/news_event_df.csv", encoding='utf-8')
news_events.Timestamp_ET = pd.to_datetime(news_events.Timestamp_ET)
news_events.TradingDate = pd.to_datetime(news_events.TradingDate)
news_events.Date = pd.to_datetime(news_events.Date)

# Daytime news: time of day between 09:30 and 16:00, both bounds inclusive.
daytime_news = news_events.loc[((news_events.Timestamp_ET.dt.time >= time(9,30)) & (news_events.Timestamp_ET.dt.time <= time(16,0)))==True]
daytime_news = daytime_news.loc[:, ['Ticker', 'Date', 'Sentiment']].reset_index(drop=True)

print(f"Number of daytime news events: {daytime_news.shape[0]}")
#daytime_news.to_csv('./Data/news_event_df_daytime.csv', encoding='utf-8')

S&P 500

In [None]:
# Save a relabelled copy of the S&P 500 return frame.
# `sp500_r` is not defined in this section of the notebook; it must already exist
# in the kernel and have exactly six columns for the relabelling to succeed.
col_names    = ['Total Return', 'Price Index', 'Open', 'Total Return Open', 'TR OpenClose', 'TR CloseOpen']
sp500_r_save = sp500_r.copy()   # copy so that the column names of sp500_r itself stay unchanged
sp500_r_save.columns = col_names
sp500_r_save.to_csv("./Data/SP500_OpenCloseReturns.csv", encoding='utf-8')   

**Calculate news freshness**   
Consider news as stale if very similar news articles were published in the previous open-to-close or close-to-open period. Don't consider news as stale if the similar articles are published in the same period -> intensity score = 1 if one article is published in one session, 2 if two similar articles are published in the same session, etc.

In [2]:
import torch
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from transformers import RobertaTokenizerFast, RobertaModel
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
torch.cuda.empty_cache()

# Input locations. `data_path` is assigned again here although the first cell of
# the notebook already defines a variable of that name.
data_path       = "C:/Users/Stefa/Documents/Uni/Projektassistenz/Python/Data/"
path_data_files = "F:/RTRS_News_Data/OLD_Files/filtered_news_data_for_transformer/train_validation_data/"
# Folder of the fine-tuned model checkpoint that is loaded as `model` below.
extended_model_path2 = data_path+"Classification/DagoBERT/SCE_Loss_minw_25_lr_5e5_6ep_bs32_wd_1e2_a0_5_b3/"

cuda:0


In [3]:
# Helper Functions
def calc_similarity(text, MAX_LEN, batch_size):
    """
    Embed every text with the module-level `model` and return the pairwise
    cosine similarities of the embeddings.

    Uses the module-level `tokenizer`, `model` and `device`.

    Parameters
    ----------
    text : list of str
        Texts to compare.
    MAX_LEN : int
        Every text is padded / truncated to this many tokens.
    batch_size : int
        Number of texts passed through the model at once.

    Returns
    -------
    numpy.ndarray
        Square matrix of shape (len(text), len(text)); entry [i, j] is the cosine
        similarity between the embeddings of text i and text j. An empty input
        yields an empty (0, 0) array.
    """
    if len(text) == 0:
        return np.empty((0, 0))

    encode = tokenizer.batch_encode_plus(
        text,                                # Sentences to encode.
        add_special_tokens = True,           # Add the tokenizer's special tokens.
        max_length = MAX_LEN,                # Pad & truncate all sentences.
        truncation=True,
        padding='max_length',
        return_attention_mask = True,        # Construct attn. masks.
        return_tensors = 'pt',               # Return PyTorch tensors.
    )

    # The encoded tensors stay on the CPU; only the current batch is moved to the
    # device inside the loop.
    test_dataset = TensorDataset(encode['input_ids'], encode['attention_mask'])
    test_sampler = SequentialSampler(test_dataset)      # keep the input order
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    # Keep only the second model output (index 1, the pooled output) of every
    # batch and move it to the CPU immediately, so that the remaining outputs of
    # the batch can be freed before the next batch is processed.
    # NOTE(review): the checkpoint-loading warning reports weights that were not
    # initialised from the checkpoint - confirm that the pooling layer is among
    # the trained weights, otherwise these embeddings pass through random weights.
    pooled_batches = []
    with torch.no_grad():
        for input_ids, attention_mask in test_dataloader:
            output = model(input_ids.to(device), token_type_ids=None, attention_mask=attention_mask.to(device))
            pooled_batches.append(output[1].cpu().numpy())

    sentence_embeddings = np.concatenate(pooled_batches, axis=0)

    return cosine_similarity(sentence_embeddings)


def jaccard_similarity(query, document):
    """
    Jaccard similarity of two token collections: the number of distinct tokens
    shared by both divided by the number of distinct tokens in either.

    Parameters
    ----------
    query, document : iterable of hashable items (e.g. lists of words)

    Returns
    -------
    float
        Value in [0, 1]. If both inputs are empty the ratio is undefined; 0.0 is
        returned so that two empty texts are never treated as duplicates.
    """
    query_tokens, document_tokens = set(query), set(document)
    union = query_tokens | document_tokens
    if not union:
        return 0.0
    return len(query_tokens & document_tokens)/len(union)


# Load the DagoBERT tokenizer and the encoder weights of the fine-tuned
# checkpoint, and place the encoder on the selected device.
# NOTE(review): loading prints a warning that some RobertaModel weights were not
# initialised from the checkpoint - confirm this does not affect the pooled
# output that calc_similarity relies on.
tokenizer = RobertaTokenizerFast.from_pretrained(data_path+"DagoBERT/tokenizer_v1/tokenizer_1996-2017")
model     = RobertaModel.from_pretrained(extended_model_path2+"DagoBERT_1996-2017_news_input_seq_250/").to(device)

Some weights of the model checkpoint at C:/Users/Stefa/Documents/Uni/Projektassistenz/Python/Data/Classification/DagoBERT/SCE_Loss_minw_25_lr_5e5_6ep_bs32_wd_1e2_a0_5_b3/DagoBERT_1996-2017_news_input_seq_250/ were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at C:/Users/Stefa/Documents/Uni/Projektassistenz/Python/Data/Classification/DagoBERT/SCE_Loss_minw_

In [4]:
# Load the news events together with the classifier outputs.
news_event_df = pd.read_csv(
    extended_model_path+'train_valid_data_inkl_pred_fresh_w2v_topics_'+str(2002)+'-'+str(2021)+'.csv',
    encoding='utf-8-sig',
    index_col=0,
)

# Drop the time-zone information of the timestamps and parse the dates.
news_event_df['Timestamp_ET'] = pd.DatetimeIndex(news_event_df['Timestamp_ET']).tz_localize(None)
news_event_df['Date'] = pd.to_datetime(news_event_df['Date'])

# Chronological order; the stored Sentiment column is replaced by the difference
# between the positive and the negative value of each row.
news_event_df = (
    news_event_df
    .sort_values(by='Timestamp_ET', ascending=True)
    .drop(columns=['Sentiment'])
)
news_event_df['Sentiment'] = news_event_df['positive'] - news_event_df['negative']

# Consecutive integer index 0..n-1 in chronological order.
news_event_df = news_event_df.reset_index(drop=True)

In [5]:
# Preview the first two rows to check the loaded columns.
news_event_df.head(2)

Unnamed: 0,Ticker,Date,Timestamp_ET,News,freshness,topic_1,topic_2,topic_3,topic_4,neutral,negative,positive,Prediction,Sentiment
0,SUN,2002-01-03,2002-01-03 00:09:28.021,suncorp raises mln in share plan brisbane jan ...,fresh,0.0,0.189655,0.0,0.0,0.916798,0.024531,0.058671,0,0.03414
1,SIAL,2002-01-03,2002-01-03 02:00:25.322,sia rises on nov loads bargain hunting singapo...,fresh,0.0,0.172414,0.0,0.0,0.976509,0.010941,0.01255,0,0.001609


In [None]:
# Text Similarity
cosine_threshhold  = 0.85    # Combine the cosine similarity of the sentence embeddings
jaccard_threshhold = 0.95    # with the Jaccard similarity to separate fresh from stale news
lookback = 3                 # use news published in the past 3 days to compare for similarity

news_event_df['freshness2'] = 'fresh'
dates = news_event_df['Date'].unique()
print('Iterations:', len(dates))
lookback_period = {}
time_idx = news_event_df['Timestamp_ET'].to_frame().copy()


# The first `lookback` dates have no complete look-back window and are skipped.
for i in tqdm(range(lookback, len(dates))):
    # Session boundaries: 09:30 = open, 16:00 = close.
    open_today     = pd.to_datetime(dates[i]).replace(hour=9, minute=30, second=0)
    close_today    = pd.to_datetime(dates[i]).replace(hour=16, minute=0, second=0)
    close_prev     = pd.to_datetime(dates[i-1]).replace(hour=16, minute=0, second=0)
    open_lookback  = pd.to_datetime(dates[i-lookback]).replace(hour=9, minute=30, second=0)
    close_lookback = pd.to_datetime(dates[i-lookback]).replace(hour=16, minute=0, second=0)
    stamps = time_idx.Timestamp_ET

    # Articles that may serve as earlier, similar news for each session.
    lookback_period['daytime']   = (stamps > close_lookback) & (stamps <= close_today)
    lookback_period['overnight'] = (stamps > open_lookback)  & (stamps <= close_prev)

    # Articles of the current session that get classified.
    session_articles = {
        'daytime':   (stamps >= open_today) & (stamps <= close_today),
        'overnight': (stamps >  close_prev) & (stamps <  open_today),
    }

    for period in lookback_period.keys():
        lookback_idx  = time_idx.loc[lookback_period[period]].index
        new_added_idx = time_idx.loc[session_articles[period]].index

        # Nothing to classify in this session: skip the (expensive) embedding step,
        # which would also fail on an empty batch.
        if len(new_added_idx) == 0:
            continue

        batch_idx = sorted(set(lookback_idx).union(new_added_idx))
        batch_pos = {index: pos for pos, index in enumerate(batch_idx)}    # row label -> row/column of the cosine matrix
        cosine_distances = calc_similarity(list(news_event_df.loc[batch_idx].News.values), MAX_LEN=250, batch_size=16)

        for idx in new_added_idx:
            document = news_event_df.loc[idx, 'News'].split(' ')

            if period == 'daytime':
                # All earlier articles of the window, including the directly
                # preceding one (the end of np.arange is exclusive). Relies on the
                # consecutive, chronologically sorted integer index of news_event_df.
                query = np.arange(lookback_idx[0], idx)
            else:
                query = np.array(lookback_idx)

            jaccard_sim = {query_idx: jaccard_similarity(news_event_df.loc[query_idx, 'News'].split(' '), document)
                           for query_idx in query}
            similar_index = [query_idx for query_idx, sim in jaccard_sim.items() if sim > jaccard_threshhold]
            if not similar_index:
                continue

            # Cosine similarity between the current article and each Jaccard-similar
            # article. Taking the row maximum of the full matrix instead would always
            # yield 1.0, because every row contains the article's self-similarity.
            jaccard_array = np.array([jaccard_sim[query_idx] for query_idx in similar_index])
            cosine_array  = cosine_distances[[batch_pos[query_idx] for query_idx in similar_index], batch_pos[idx]]
            similarity    = jaccard_array*cosine_array

            if similarity.max() > jaccard_threshhold*cosine_threshhold:
                news_event_df.loc[idx, 'freshness2'] = 'stale'

#news_event_df.to_csv('./Data/news_event_df_freshness_v2.csv', encoding='utf-8-sig', index=False)  

In [104]:
#news_event_df.loc[news_event_df.freshness2=='stale']

In [2]:
import math
from gensim.models.word2vec import Word2Vec
from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
from importlib.machinery import SourceFileLoader
from sklearn.metrics.pairwise import cosine_similarity
# Load the project's utilities.py from its absolute path and bind it as `Textual_Factors`.
# NOTE(review): `load_module()` is deprecated in recent Python versions - confirm
# the kernel's Python version still supports it.
Textual_Factors = SourceFileLoader("utilities.py", "C:/Users/Stefa/Documents/Uni/Projektassistenz/Paper III GUIDED TOPIC CLUSTERING/Textual_Factors/utilities.py").load_module()

np.set_printoptions(precision=3, suppress=True)
# Shortcut to the similarity function of the utilities module, configured with 'cos_similarity'.
cos_similarity  = Textual_Factors.SimilarityMeasure(sim_measure='cos_similarity').calc_similarity   

In [3]:
# Load word2vec
root_dir = 'C:/Users/Stefa/Documents/Uni/Projektassistenz/Paper III GUIDED TOPIC CLUSTERING/'
w2v_path = root_dir+"Textual_Factors/models/"

pca_dim    = 63
embeddings = KeyedVectors.load(w2v_path+'w2v_cbow_64_neg_10_window_18_60_epochs_bigrams_1996_2018.word2vec')     # CBOW
vocab      = set(embeddings.wv.key_to_index.keys())
vocab_list   = list(vocab)
vocab_series = pd.Series(vocab_list)

# PCA: project the word vectors onto `pca_dim` components and map every word to
# its projected vector.
pca = PCA(n_components=pca_dim)
principalComponents = pca.fit_transform(embeddings.wv[vocab_list])
pca_embds = dict(zip(vocab_list, principalComponents))

# Add polarity dimension: one extra coordinate per word, scaled by five times the
# word's largest absolute component for words with a polarity entry and zero
# otherwise. The extended vector is passed through Textual_Factors.Unitvec.
word_polarities = pd.read_csv(root_dir+'Textual_Factors/data/word_polarities.csv', index_col=0)
word_polarities.columns = ['polarity']
polarity_words = set(word_polarities.index)
for w in vocab_list:
    if w in polarity_words:
        polarity_dim = 5*max(abs(pca_embds[w]))*word_polarities.loc[w]
    else:
        polarity_dim = 0
    pca_embds[w] = Textual_Factors.Unitvec(np.append(pca_embds[w], polarity_dim))

embeddings_dict = Textual_Factors.Get_PCA_Embds(pca_embds)

In [367]:
def load_blacklist(path='C:/Users/Stefa/Documents/Uni/Projektassistenz/DEKA/Data/blacklist.txt'):
    """
    The blacklist is a list of regular words that are included in company names.
    Those words are excluded for calculating the article relevance

    Parameters
    ----------
    path : str, optional
        Text file with one blacklist word per line. Defaults to the project's
        blacklist file.

    Returns
    -------
    set of str
        The lower-cased lines of the file with the newline character removed.
    """
    with open(path, 'r') as f:
        return {row.lower().replace('\n', '') for row in f}


def clean_cmpy_names(tickers, names):
    """
    Map every ticker to the usable words among the first two words of its
    company name.

    A word is kept if, after lower-casing and replacing non-letters by blanks, it
    has at least three characters, is neither a drop word nor on the blacklist
    and is contained in the module-level `vocab`.

    Parameters
    ----------
    tickers : iterable of hashable
        Ticker symbols.
    names : iterable of str
        Company names in the same order as `tickers`.

    Returns
    -------
    dict
        ticker -> list of kept words (empty if no word qualifies or the name is
        not a string, e.g. a missing value).
    """
    drop_words = {'dead', 'delist', 'merger', 'liquidation'}
    blacklist = load_blacklist()
    cmpy = {}

    # Pair the two inputs by position; indexing `names[i]` would be label-based on
    # a pandas Series whose index is not 0..n-1.
    for ticker, name in zip(tickers, names):
        cmpy[ticker] = []
        if not isinstance(name, str):
            continue
        for w in name.split()[:2]:
            w = re.sub(r"[^A-Za-z]", " ", w.lower()).strip()
            if len(w) >= 3 and w not in drop_words and w not in blacklist and w in vocab:
                cmpy[ticker].append(w)
    return cmpy


# S&P 500 constituents with their tickers and company names.
fin_data = "C:/Users/Stefa/Documents/Uni/Projektassistenz/Financial Data/"
constituents = pd.read_csv(fin_data+'Datastream/Data/SP500_Constituents_06_2022.csv', encoding='utf-8', index_col=0)

# Usable name words per ticker, then the sorted distinct words over all
# companies and their embeddings.
all_cmpy   = clean_cmpy_names(constituents['BestTicker'], constituents['NAME'])
cmpy_words = np.concatenate(list(all_cmpy.values()))
cmpy_words = list(np.unique(cmpy_words))
cmpy_embds = embeddings_dict[cmpy_words]

In [369]:
# Toy example: two short texts that mention the same firm.
document = 'microsoft reports surging profits'
query = 'profits of microsoft and intel surge'

# Alternative inputs used while experimenting:
#query    = 'so you say the car is one that is black'
#query = 'the firms profits surge'
#document = news_event_df.News[19]
#query    = news_event_df.News[10]
#query    = "update newmont set to win normandy as anglogold holds fire newmont sweetens normandy bid again update newmont sweetens normandy bid again new york normandy mining says recommends revisednewmont mining offer by darren schuettler and sophie hares johannesburg sydney jan newmont mining looked poised to win takeover battle for australia normandy mining with sweetened bid on thursday as "+text0

# Embeddings of the in-vocabulary words of each text, in word order.
doc_vecs = embeddings_dict[[token for token in document.split(' ') if token in vocab]]
q_vecs   = embeddings_dict[[token for token in query.split(' ') if token in vocab]]

# Number of embedded words per text.
doc_len = doc_vecs.shape[0]
q_len   = q_vecs.shape[0]

In [372]:
def doc_similaritiy(doc_vecs, q_vecs, simCrit=0.8):
    """
    Compare two texts given as sequences of word embeddings.

    Uses the module-level `cmpy_embds` (embeddings of company-name words).

    Parameters
    ----------
    doc_vecs, q_vecs : 2-D arrays with one row per word
        Word embeddings of the two texts; the one with fewer rows is treated as
        the short sequence (ties: `doc_vecs`).
    simCrit : float, default 0.8
        Two words count as matching if their cosine similarity exceeds this value.

    Returns
    -------
    S : numpy.ndarray
        Cosine similarities, shape (len(short sequence), len(long sequence)).
    doc_in_q_ratio : float
        Share of words of the short sequence that match at least one word of the
        long sequence.
    longest_seq_ratio : float
        Largest number of matches found along a single diagonal of S (i.e. for
        one fixed shift of the short sequence against the long one), relative to
        the length of the short sequence.
    same_firms : int
        1 if at least one company-name word occurs in both texts, otherwise 0.

    If one of the texts has no words, (empty S, 0.0, 0.0, 0) is returned.
    """
    if q_vecs.shape[0] >= doc_vecs.shape[0]:
        long_seq, short_seq = q_vecs, doc_vecs
    else:
        long_seq, short_seq = doc_vecs, q_vecs
    short_len = short_seq.shape[0]

    # Without words there is nothing to compare and the ratios below would divide
    # by zero.
    if short_len == 0:
        return np.zeros((0, long_seq.shape[0])), 0.0, 0.0, 0

    S = cosine_similarity(short_seq, long_seq)
    # Matches per non-negative diagonal offset of S.
    diags = [(S.diagonal(i) > simCrit).sum() for i in range(0, S.shape[1])]

    doc_in_q_ratio    = (S.max(axis=1) > simCrit).sum()/short_len
    longest_seq_ratio = max(diags)/short_len
    # Alternative measures considered earlier: max(diags) relative to the longer
    # text, and the length ratio short_len/long_len.

    # Find company mentions in the texts: a company word counts as mentioned if it
    # is (almost) identical to one of the text's words.
    a = (cosine_similarity(cmpy_embds, long_seq)  > 0.99).sum(axis=1) >= 1
    b = (cosine_similarity(cmpy_embds, short_seq) > 0.99).sum(axis=1) >= 1
    same_firms = int((a & b).sum() >= 1)

    return S, doc_in_q_ratio, longest_seq_ratio, same_firms


# Try the measure on the toy example from the previous cell.
# NOTE(review): a later cell redefines doc_similaritiy to return a single value;
# once that cell has run, this four-way unpacking no longer matches the function.
S, doc_in_q_ratio, longest_seq_ratio, same_firms = doc_similaritiy(doc_vecs, q_vecs, simCrit=0.7)

print(doc_in_q_ratio, longest_seq_ratio, same_firms)

# Harmonic mean of the two ratios (the small constant avoids a division by zero),
# multiplied by same_firms so that the score is 0 when same_firms is 0.
similarity = (same_firms*2)/((1/(doc_in_q_ratio+0.000001))+(1/(longest_seq_ratio+0.000001)))
print(similarity)

0.75 0.25 1
0.37500124999949996


In [None]:
batch_length = 30 #days
lookback     = 3  #days, use news published in the past 3 days to compare for similarity

dates = news_event_df['Date'].unique()
# Initialise as float: the loop below stores fractional similarity scores in this
# column, and writing floats into an integer column forces a dtype change.
news_event_df['staleness'] = 0.0

print('Iterations:', len(dates))
V = {}    # row label -> word embeddings of the article



def doc_similaritiy(doc_vecs, q_vecs, simCrit=0.8):
    """
    Similarity score of two texts given as sequences of word embeddings.

    Uses the module-level `cmpy_embds` (embeddings of company-name words).

    Parameters
    ----------
    doc_vecs, q_vecs : 2-D arrays with one row per word
        Word embeddings of the two texts; the one with fewer rows is treated as
        the short sequence (ties: `doc_vecs`).
    simCrit : float, default 0.8
        Two words count as matching if their cosine similarity exceeds this value.

    Returns
    -------
    float
        Harmonic mean of
        (a) the share of words of the short sequence that match a word of the
            long sequence and
        (b) the largest number of matches along a single diagonal of the
            similarity matrix relative to the length of the short sequence,
        set to 0 unless at least one company-name word occurs in both texts.
        0.0 is returned if one of the texts has no words.
    """
    if q_vecs.shape[0] >= doc_vecs.shape[0]:
        long_seq, short_seq = q_vecs, doc_vecs
    else:
        long_seq, short_seq = doc_vecs, q_vecs
    short_len = short_seq.shape[0]

    # Without words there is nothing to compare and the ratios below would divide
    # by zero.
    if short_len == 0:
        return 0.0

    S = cosine_similarity(short_seq, long_seq)
    # Matches per non-negative diagonal offset of S.
    diags = [(S.diagonal(i) > simCrit).sum() for i in range(0, S.shape[1])]

    doc_in_q_ratio    = (S.max(axis=1) > simCrit).sum()/short_len
    longest_seq_ratio = max(diags)/short_len

    # Find company mentions in the texts: a company word counts as mentioned if it
    # is (almost) identical to one of the text's words.
    a = (cosine_similarity(cmpy_embds, long_seq)  > 0.99).sum(axis=1) >= 1
    b = (cosine_similarity(cmpy_embds, short_seq) > 0.99).sum(axis=1) >= 1
    same_firms = int((a & b).sum() >= 1)

    # The small constant avoids a division by zero when a ratio is 0.
    similarity = (same_firms*2)/((1/(doc_in_q_ratio+0.000001))+(1/(longest_seq_ratio+0.000001)))

    return similarity


# Articles are embedded batch-wise. Every batch additionally loads the `lookback`
# dates before its first date, so the look-back windows of the first dates of a
# batch are complete (transition between batches).
# NOTE(review): `dates[:10]` restricts the run to the first batch - presumably a
# debugging limit; confirm before a full run.
for t in range(0, len(dates[:10]), batch_length):
    batch_stop   = min(t+batch_length, len(dates))     # position after the last date of the batch
    window_first = max(t-lookback, 0)                  # first date that has to be loaded

    # From midnight of the first loaded date to the close (16:00) of the last
    # batch date, so that the news of the last date is part of the batch.
    X = news_event_df.loc[(news_event_df.Timestamp_ET >= pd.to_datetime(dates[window_first])) &
                          (news_event_df.Timestamp_ET <= pd.to_datetime(dates[batch_stop-1]).replace(hour=16, minute=0, second=0)),
                          ['Timestamp_ET', 'News']]
    lookback_period = {}
    for i in X.index:
        V[i] = embeddings_dict[[w for w in X.loc[i, 'News'].split(' ') if w in vocab]]


    # `i` is the position in `dates`; the first `lookback` dates of the sample
    # have no complete look-back window and are skipped.
    for i in tqdm(range(max(t, lookback), batch_stop)):
        lookback_period['daytime']   = ((X.Timestamp_ET >  pd.to_datetime(dates[i-lookback]).replace(hour=16, minute=0, second=0)) &
                                        (X.Timestamp_ET <= pd.to_datetime(dates[i]).replace(hour=16, minute=0, second=0)))

        lookback_period['overnight'] = ((X.Timestamp_ET >  pd.to_datetime(dates[i-lookback]).replace(hour=9, minute=30, second=0)) &
                                        (X.Timestamp_ET <= pd.to_datetime(dates[i-1]).replace(hour=16, minute=0, second=0)))


        for period in lookback_period.keys():
            lookback_idx = X.loc[lookback_period[period]].index

            if period == 'daytime':
                new_added_idx = X.loc[((X.Timestamp_ET >= pd.to_datetime(dates[i]).replace(hour=9, minute=30, second=0)) &
                                       (X.Timestamp_ET <= pd.to_datetime(dates[i]).replace(hour=16, minute=0, second=0)))].index
            else:
                new_added_idx = X.loc[((X.Timestamp_ET > pd.to_datetime(dates[i-1]).replace(hour=16, minute=0, second=0)) &
                                       (X.Timestamp_ET < pd.to_datetime(dates[i]).replace(hour=9, minute=30, second=0)))].index

            for idx in new_added_idx:
                # An article without in-vocabulary words cannot be compared; its
                # staleness keeps the initial value.
                if V[idx].shape[0] == 0:
                    continue

                if period == 'daytime':
                    # All earlier articles of the window, including the directly
                    # preceding one (the end of np.arange is exclusive). Relies on
                    # the consecutive, chronologically sorted integer index.
                    query = np.arange(lookback_idx[0], idx)
                else:
                    query = np.array(lookback_idx)

                doc_similarity = {query_idx: doc_similaritiy(V[idx], V[query_idx])
                                  for query_idx in query if V[query_idx].shape[0] > 0}

                # No comparable earlier article -> staleness 0.
                news_event_df.loc[idx, 'staleness'] = max(doc_similarity.values(), default=0)


                                                          
#news_event_df.to_csv('./Data/news_event_df_freshness_v2.csv', encoding='utf-8-sig', index=False)  