# **0**. Imports

In [None]:
# Environment switch. With ON_COLAB=True the shared Google Drive folder is
# mounted and used as the project root; otherwise the current working directory
# is the project root and the results/model/data folders are created on demand.
ON_COLAB=False
if ON_COLAB:
    import sys
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    BASEDIR='/content/drive/My Drive/MentalHealthShared/'
    PYTHONDIR=BASEDIR+'src'
    RESULTSDIR=BASEDIR+'results/'
    MODELSDIR=BASEDIR+'model/'
    DATADIR=BASEDIR+'data/'
    # The project modules are imported by bare name below, so their folder has
    # to be on sys.path before the imports run (fresh kernels start without it).
    if PYTHONDIR not in sys.path:
        sys.path.append(PYTHONDIR)
    import models
    import pytorchtools
    from utils import createTensorDataset
    from training_functions import load_df, compute_bin_weights, save_stats_tensors, load_stats_tensors, get_znorm_params, get_subreddit_range, split_indices, get_subreddit_weights
else:
    import os
    BASEDIR = os.getcwd() + "/"
    dirs = ["results","model","data"]
    for dirc in dirs:
        # exist_ok makes the cell safe to re-run.
        os.makedirs(os.path.join(BASEDIR,dirc), exist_ok=True)
    PYTHONDIR=BASEDIR+'src/'
    RESULTSDIR=BASEDIR+'results/'
    MODELSDIR=BASEDIR+'model/'
    DATADIR=BASEDIR+'data/'
    from src import models
    from src import pytorchtools
    from src.utils import createTensorDataset
    from src.training_functions import load_df, compute_bin_weights, save_stats_tensors, load_stats_tensors, get_znorm_params, get_subreddit_range, split_indices, get_subreddit_weights

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if device.type == 'cuda':
    print("Device successfully set to cuda")
else:
    print("WARNING! DEVICE IS NOT SET TO CUDA")

print(torch.__version__)

In [None]:
import copy
import importlib
import itertools
import os
import pdb
import pickle
import pprint
import random
import sys
import time

sys.path.append(PYTHONDIR)

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from scipy import stats
from torch.utils.data import DataLoader, Subset
from tqdm.notebook import tqdm


# **1** Build the source data

* Build the source data in order to predict the EmT based on the sequence of comments in a thread

In [None]:
# DEFINE SUBREDDITS
SUBREDDITS  = ['Anxiety','bipolar','depression','SuicideWatch']
# Short panel labels, keyed by the lower-cased subreddit name.
subreddit2title = {'depression':'DEP','suicidewatch':'SUI','anxiety':'ANX','bipolar':'BIP'}

EXTENSION = '.pkl'
STRATIFIED = True  # If true, will apply a stratified K-fold cross-validation to the dataset.
ZNORMALIZE = False # If true, will apply a z-normalization to the dataset
KEEP_TEXT = True # Only True if using Section 6 for Case Study

FILTERED=True # If true will use filtered seq_len for the threads
BIN_WIDTH=0.2 # Controls the width of the bins used to calculate the weighted L1 loss
MIN_VALUE = -1 # Controls the minimum output value(set to -1)


INCLUDE_TARGET = 0
INCLUDE_THREAD_COMMENTS = 0

# The dataset-building cells only create `dataset` when this flag is truthy,
# and later cells index `dataset` unconditionally, so it has to be defined here.
USE_GRU = True


MAX_BRANCH_LEN = 16   # not including authors' last comment
MAX_THREAD_LEN = 64   # not including authors' last comment

In [None]:
# Load one frame per configured subreddit and stack them; `keys` adds the
# subreddit name as the outer index level. Iterating over SUBREDDITS itself
# keeps the loaded files and the concatenation keys in agreement.
suffix='_distilbert_filtered_posts' + EXTENSION
post_df = pd.concat(
    (load_df(DATADIR+subreddit+suffix, MAX_THREAD_LEN) for subreddit in SUBREDDITS),
    keys=SUBREDDITS)

In [None]:

# TODO: it seems that we are keeping everything if KEEP_TEXT. is this really necessary?
# Without the text columns only the fields used for modelling are kept.
if not KEEP_TEXT:
    modelling_columns = ['created_utc', 'seq_len','score', 'features', 'filtered_seqlen','valid_branches']
    post_df = post_df[modelling_columns]

# new strategy to construct observations: follow branches of every discussion tree

if FILTERED:
    # Keep only threads that have a filtered length, cast it to int and let it
    # take over the `seq_len` column name.
    post_df = (
        post_df
        .dropna(subset=['filtered_seqlen'])
        .astype({'filtered_seqlen': int})
        .drop(columns='seq_len')
        .rename(columns={'filtered_seqlen':'seq_len'})
    )
else:
    post_df = post_df.drop(columns='filtered_seqlen')

truncated_fraction = (post_df.seq_len>(MAX_THREAD_LEN+1)).mean()
print(f'Fraction of threads that had to be truncated: {truncated_fraction}')


In [None]:
# TODO: define constants upfront
# Normalisation statistics: computed from the data when ZNORMALIZE is set,
# otherwise an identity transform (zeros for the mean, ones for the std).
if ZNORMALIZE:
    prefix = ''
    src_m, src_s = get_znorm_params(post_df)
else:
    prefix = 'unnorm_'
    shape = [1,post_df.iloc[0].features.shape[-1]]
    src_m = torch.zeros(shape)
    src_s = torch.ones(shape)

save_stats_tensors(src_m,src_s,f'{BASEDIR}data/{prefix}')

# The second-to-last feature column is read as the score statistics.
score_m, score_s = float(src_m[0,-2]), float(src_s[0,-2])

print(f'Average score in dataset is {score_m}')

subreddit2range = get_subreddit_range()
print(subreddit2range[SUBREDDITS[0]])

# TODO: define constants upfront
suffix = "random_strat" if STRATIFIED else "random"

# TODO: place indexing in a new function and explain what it does
# get indices: split in memory when all four subreddits are loaded, otherwise
# read the stored split of the first configured subreddit.
if len(SUBREDDITS) == 4:
    train_inds, valid_inds, test_inds  = split_indices(post_df)
else:
    subreddit = SUBREDDITS[0]
    # NOTE(review): pickle.load executes arbitrary code; only use trusted split files.
    with open(f'{DATADIR}{subreddit}_{suffix}_splits.pkl','rb') as infile:
        splits = pickle.load(infile)
    # Index labels are (subreddit, loc) pairs.
    train_locs, valid_locs, test_locs = (
        [(subreddit,loc) for loc in part]
        for part in (splits[0], splits[1], splits[2])
    )

    train_inds = post_df.index.get_indexer_for(train_locs)
    valid_inds = post_df.index.get_indexer_for(valid_locs)
    test_inds  = post_df.index.get_indexer_for(test_locs)



# compute weights for Weighted L1 Loss
subreddit2weights = get_subreddit_weights(post_df, BIN_WIDTH,MIN_VALUE)
print(subreddit2weights)


In [None]:
%%time

# TODO: place the dataset generation in a function


print('Creating src')

print('Creating y')
# Target of each thread: the score stored at index seq_len-1.
y = torch.Tensor(post_df.apply(lambda p: p.score[p.seq_len-1], axis=1).values)

print('Creating src_len_series')
src_len_series = post_df.seq_len-1
max_length=MAX_THREAD_LEN
# Inputs: the first seq_len-1 feature rows of each thread (at most
# MAX_THREAD_LEN), padded to a common length.
src = nn.utils.rnn.pad_sequence(
    [p.features[:min(MAX_THREAD_LEN,p.seq_len-1),:] for index, p in post_df.iterrows()],
    batch_first=True)
print(f"GRU src tensor size: {src.size()}")

print(f'y tensor size: {y.size()}')

if INCLUDE_TARGET:
    tgt = nn.utils.rnn.pad_sequence(
        [(p.features[b[-2]]-src_m)/src_s for index, p in post_df.iterrows() for b in p.valid_branches],
        batch_first=True)
else:
    tgt = None

# The steps below run for both INCLUDE_TARGET settings; nesting them under the
# `else` above would leave `dataset` undefined whenever a target is requested.

# clean up memory
if (not KEEP_TEXT):
    del post_df

print('Creating dataset')
if USE_GRU:
    dataset = createTensorDataset(src, src_len_series, y, max_length=max_length) # all threads, lim 63 comments
    del src

del src_len_series, tgt, y

In [None]:
# Inspect the first dataset item: the last dimension of its first element is
# taken as the embedding width.
batch = dataset[0]
EMBEDDING_DIM = batch[0].shape[-1]
print(f"embedding dimension for GRU: {EMBEDDING_DIM}")


# **2**. Analyzing the effect of etiquette words on the Vader Score

## Plotting functions


In [None]:
def calculate_KDF(fluctuations_df, x_col=None, y_col=None, grid_size=40):
    """Evaluate a 2-D Gaussian kernel density estimate on a regular grid.

    Parameters
    ----------
    fluctuations_df : DataFrame holding the "before" and "after" score columns.
    x_col, y_col : optional column names. When omitted, the plain names
        ('score_before' / 'score_after') are tried first and then the
        merge-suffixed names ('score_before_x' / 'score_after_y'), so frames
        produced by `pd.merge` are accepted as well.
    grid_size : number of grid points along each axis (default 40).

    Returns
    -------
    ndarray of shape (grid_size, grid_size); entry [i, j] is the density at the
    i-th grid value of the x column and the j-th grid value of the y column.

    Raises
    ------
    KeyError if no suitable column is present.
    """
    def _pick(explicit, candidates):
        # An explicit name wins; otherwise take the first candidate present.
        if explicit is not None:
            return explicit
        for name in candidates:
            if name in fluctuations_df.columns:
                return name
        raise KeyError(f"none of the columns {candidates} found in frame")

    x_col = _pick(x_col, ('score_before', 'score_before_x'))
    y_col = _pick(y_col, ('score_after', 'score_after_y'))

    x = fluctuations_df[x_col].to_numpy(dtype=float)
    y = fluctuations_df[y_col].to_numpy(dtype=float)

    # A complex step makes np.mgrid produce `grid_size` points, ends included.
    step = complex(0, grid_size)
    X, Y = np.mgrid[x.min():x.max():step, y.min():y.max():step]
    positions = np.vstack([X.ravel(), Y.ravel()])
    kernel = stats.gaussian_kde(np.vstack([x, y]))
    return np.reshape(kernel(positions).T, X.shape)

In [None]:
def plot_score_dist(scores,title,n_bins=20,log_y=False):
    """Draw one histogram panel per score series, side by side with a shared y axis.

    scores : sequence of score collections, one per panel.
    title  : sequence of labels; title[i] is inserted into the title of panel i.
    n_bins : passed to `Axes.hist` as `bins`.
    log_y  : when truthy, every panel gets a logarithmic y axis.
    """
    n_panels = len(scores)
    fig, axs = plt.subplots(1,n_panels, tight_layout=True,figsize=(5 * n_panels,3),sharey=True)

    # With a single panel plt.subplots returns a bare Axes rather than an array.
    if type(axs) != np.ndarray:
        axs = [axs]

    for ind in range(len(axs)):
        ax = axs[ind]
        ax.hist(scores[ind], bins=n_bins)
        ax.set_title(f"Emotional Tone Histogram ({title[ind]})")
        ax.set_xlabel(r"$\Delta EmT$")
        # Only the left-most panel carries the shared y label.
        if ind == 0:
            ax.set_ylabel("Number of posts")
        ax.axvline(color='k',linestyle='dashed')
        if log_y:
            ax.set_yscale("log")




In [None]:
def plot_fluctuations(fluctuations_df,Z,explanation='w.r.t comments and posts that deviate'):
    """Show a precomputed density `Z` as an image of score-before vs score-after.

    fluctuations_df : frame with 'score_before_x' and 'score_after_y' columns;
                      their min/max give the image extent and axis limits.
    Z               : 2-D density grid.
    explanation     : text placed in parentheses at the end of the title.
    """
    before = fluctuations_df['score_before_x'].to_numpy(dtype=float)
    after = fluctuations_df['score_after_y'].to_numpy(dtype=float)
    x_range = [before.min(), before.max()]
    y_range = [after.min(), after.max()]

    fig, ax = plt.subplots(1,1,figsize=(5,5))
    # rot90 puts the first grid axis on the horizontal image axis.
    ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
            extent=x_range + y_range)
    ax.set_xlim(x_range)
    ax.set_ylim(y_range)

    # Reference lines: the two zero axes and the diagonal y = x.
    ax.plot([0,0],[-1,1],'dimgray',linestyle='dotted')
    ax.plot([-1,1],[0,0],'dimgray',linestyle='dotted')
    ax.plot([-1,1],[-1,1],'dimgray',label='y = x',linestyle='dashed')
    ax.legend(loc='upper left', shadow=True, fontsize='medium')

    ax.set_xlabel("EmT before removing Etiquette words")
    ax.set_ylabel("EmT after removing Etiquette words")
    ax.set_title(f"Distribution of EmT before vs after ({explanation})")
    plt.show()

## Data Analysis

In [None]:
def get_fluct_df(post_df,scores,suffix,save_files=True):
    """Build per-item and per-post score tables for one side of the comparison.

    Parameters
    ----------
    post_df : frame with one thread per row; reads `score`, `comments`,
        `seq_len`, and the second element of the row label as the post id.
    scores : kept for backward compatibility; the tables are now sized by the
        rows actually produced rather than by len(scores).
    suffix : "_before" stores every score in the *before* column (the *after*
        column is 0); any other value stores it in the *after* column.
    save_files : when true, both tables are pickled under DATADIR using `suffix`
        in the file name.

    Returns
    -------
    (score_fluctuations_df, post_only_df): the first has one row for the post
    and one per kept comment of every thread (columns id, score_before,
    score_after); the second has one row per post (columns id,
    post_score_before, post_score_after).
    """
    is_before = suffix == "_before"

    def _row(item_id, score):
        # Posts and comments must land in the same column so a later merge of
        # the "_before" and "_after" tables pairs like with like.
        return [item_id, score, 0] if is_before else [item_id, 0, score]

    # Collect plain records and build each frame once instead of writing rows
    # into a preallocated frame one at a time.
    score_rows = []
    post_rows = []
    for pos in tqdm(range(post_df.shape[0])):
        thread = post_df.iloc[pos]
        post_row = _row(thread.name[1], thread.score[0])
        score_rows.append(post_row)
        post_rows.append(list(post_row))
        # Comment k (1-based) is paired with score[k]; only the first
        # seq_len-1 comments are kept.
        for comment_idx, comment in enumerate(thread.comments[:(thread.seq_len - 1)], start=1):
            score_rows.append(_row(comment, thread.score[comment_idx]))

    score_fluctuations_df = pd.DataFrame(score_rows, columns=['id','score_before','score_after'])
    post_only_df = pd.DataFrame(post_rows, columns=['id','post_score_before','post_score_after'])

    if save_files:
        post_only_df.to_pickle(DATADIR + f"post_only_fluct{suffix}.pkl")
        score_fluctuations_df.to_pickle(DATADIR + f"score_fluct{suffix}.pkl")
    return score_fluctuations_df,post_only_df

In [None]:
# When a flag is True the matching score list is rebuilt from post_df and
# pickled; when False it is loaded from the existing pickle.
CREATE_NO_THX_SCORES = False
CREATE_THX_SCORES = False

In [None]:
# Flat list of the first seq_len scores of every thread, cached as a pickle.
# Files are opened in `with` blocks so the handles are always closed.
if CREATE_NO_THX_SCORES:
    no_thx_scores = []
    for post in tqdm(range(post_df.shape[0])):
        thread = post_df.iloc[post]
        no_thx_scores.extend(thread.score[:thread.seq_len])
    with open(DATADIR  + 'no_thx_scores.pkl','wb') as outfile:
        pickle.dump(no_thx_scores,outfile)
else:
    # NOTE(review): pickle.load executes arbitrary code; only load trusted files.
    with open(DATADIR  + 'no_thx_scores.pkl','rb') as infile:
        no_thx_scores = pickle.load(infile)


In [None]:
# Flat list of the first seq_len scores of every thread, cached as a pickle.
# Files are opened in `with` blocks so the handles are always closed.
# NOTE(review): the list is written under DATADIR + "no_thx_2017/" but read
# from DATADIR directly — presumably DATADIR is switched between runs; confirm.
if CREATE_THX_SCORES:
    thx_scores = []
    for post in range(post_df.shape[0]):
        thread = post_df.iloc[post]
        thx_scores.extend(thread.score[:thread.seq_len])
    with open(DATADIR + "no_thx_2017/" 'thx_scores.pkl','wb') as outfile:
        pickle.dump(thx_scores,outfile)
else:
    # NOTE(review): pickle.load executes arbitrary code; only load trusted files.
    with open(DATADIR  + 'thx_scores.pkl','rb') as infile:
        thx_scores = pickle.load(infile)

In [None]:
# NOTE(review): one score series is passed but two titles are given, so only
# title[0] ('filtered') is used and it labels `thx_scores` — confirm the label.
plot_score_dist([thx_scores],title=['filtered','unfiltered'],n_bins=20)

In [None]:
# POSTS + COMMENTS
# STEP_1 builds (or loads) the per-side score tables; STEP_2 merges them into
# the before/after tables (or loads the merged result). Either way STEP_2 ends
# with `score_fluct_df` and `post_only_df` bound, which the cells below read.
STEP_1 = False
STEP_2 = True
if STEP_1:
    CREATE_FLUCTUATIONS_DF = False
    # The data folder in use decides which side of the comparison this run is.
    if DATADIR == BASEDIR+'data/':
        suffix = "_before"
        scores = thx_scores
    else:
        suffix = "_after"
        scores = no_thx_scores

    if CREATE_FLUCTUATIONS_DF:
        score_df,post_only_df = get_fluct_df(post_df,scores,suffix)
    else:
        # These files are written with DataFrame.to_pickle, so read them back
        # with pd.read_pickle (which also closes the file).
        score_df_after = pd.read_pickle(DATADIR  + "score_fluct_after.pkl")
        score_df_before = pd.read_pickle(DATADIR  + "score_fluct_before.pkl")
        post_only_df_after = pd.read_pickle(DATADIR  + "post_only_fluct_after.pkl")
        post_only_df_before = pd.read_pickle(DATADIR  + "post_only_fluct_before.pkl")
if STEP_2:
    CONCATENATE_DFS = False
    if CONCATENATE_DFS:
        # Merge on id, then drop the placeholder column of each side: what is
        # left is the "before" value from the left frame (_x) and the "after"
        # value from the right frame (_y).
        final_score_df = pd.merge(score_df_before,score_df_after,how='inner',on=['id'])
        final_score_df.drop(['score_after_x'],axis=1,inplace=True)
        final_score_df.drop(['score_before_y'],axis=1,inplace=True)

        final_post_only_df = pd.merge(post_only_df_before,post_only_df_after,how='inner',on=['id'])
        final_post_only_df.drop(['post_score_after_x'],axis=1,inplace=True)
        final_post_only_df.drop(['post_score_before_y'],axis=1,inplace=True)

        final_score_df.to_pickle(DATADIR + "final_score_fluct_df.pkl")
        final_post_only_df.to_pickle(DATADIR + "final_post_fluct_df.pkl")

        # Bind the names the analysis cells use, exactly as the load branch does.
        score_fluct_df = final_score_df
        post_only_df = final_post_only_df
    else:
        post_only_df = pd.read_pickle(DATADIR + "final_post_fluct_df.pkl")
        score_fluct_df = pd.read_pickle(DATADIR + "final_score_fluct_df.pkl")

In [None]:
# Sanity check: walk the combined table and compare every post row with the
# next row of the post-only table; mismatching pairs are printed.
post_counter = 0
for i in tqdm(range(len(score_fluct_df))):
    item_row = score_fluct_df.iloc[i]
    if item_row.id[0] == 'd': # 'd' indicates that the author is a commenter
        continue
    post_row = post_only_df.iloc[post_counter]
    scores_match = (
        (post_row.post_score_before_x == item_row.score_before_x)
        and (post_row.post_score_after_y == item_row.score_after_y)
    )
    if not scores_match:
        print(post_row)
        print(item_row)
    post_counter += 1


In [None]:
# Per-row change in score (after minus before), for all items and for posts only.
diff_df = score_fluct_df['score_after_y'].sub(score_fluct_df['score_before_x'])
post_diff_df = post_only_df['post_score_after_y'].sub(post_only_df['post_score_before_x'])

In [None]:
# Summary of the score change over all posts and comments.
diff_df = score_fluct_df['score_after_y'] - score_fluct_df['score_before_x']
n_items = diff_df.shape[0]
has_changed = diff_df != 0

print(f"Average deviation from original score w.r.t. to all posts/comments: {diff_df.sum()/n_items}")
print(f"Percentage of posts/comments that deviated from the original score: {(sum(has_changed)/n_items)*100}%")

# Rows whose score changed, selected by index label.
posts_that_deviate = diff_df[diff_df.index[has_changed]]
n_deviating = posts_that_deviate.shape[0]
print(f"Average deviation from original score w.r.t. to posts/comments that deviate: {posts_that_deviate.sum()/n_deviating}")

In [None]:
# Histogram of the per-item score change (after minus before).
plot_score_dist([diff_df.to_list()],title=['fluctuations'],n_bins=20)

In [None]:
# Density grids: one over every row, one over only the rows whose score changed.
# The changed rows are picked positionally from the index of posts_that_deviate.
deviating_items_df = score_fluct_df.iloc[posts_that_deviate.index]
z_values_total = calculate_KDF(score_fluct_df)
z_values_deviate = calculate_KDF(deviating_items_df)

In [None]:
# Same plot twice: all rows, then only the rows whose score changed.
for frame, density, label in (
    (score_fluct_df, z_values_total, 'all comments + posts'),
    (score_fluct_df.iloc[posts_that_deviate.index], z_values_deviate, r'posts + comments w/ $\Delta EmT \neq 0$'),
):
    plot_fluctuations(frame,density,explanation=label)

In [None]:
# Summary of the score change over posts only.
post_diff_df = post_only_df['post_score_after_y'] - post_only_df['post_score_before_x']
n_posts = post_diff_df.shape[0]
post_has_changed = post_diff_df != 0

print(f"Average deviation from original score w.r.t. to all posts: {post_diff_df.sum()/n_posts}")
print(f"Percentage of posts that deviated from the original score: {(sum(post_has_changed)/n_posts)*100}%")

# Posts whose score changed, selected by index label.
posts_only_that_deviate = post_diff_df[post_diff_df.index[post_has_changed]]
n_changed_posts = posts_only_that_deviate.shape[0]
print(f"Average deviation from original score w.r.t. to posts that deviate: {posts_only_that_deviate.sum()/n_changed_posts}")

In [None]:
# Give the post-only table the column names of the combined table, then build
# the density grids for all posts and for the posts whose score changed.
post_only_df.columns = ['id','score_before_x','score_after_y']
deviating_posts_df = post_only_df.iloc[posts_only_that_deviate.index]
post_z_values_total = calculate_KDF(post_only_df)
post_z_values_deviate = calculate_KDF(deviating_posts_df)

In [None]:
# Histogram of the per-post score change (after minus before).
plot_score_dist([post_diff_df.to_list()],n_bins=20,title=[r'fluctuations'])

In [None]:
# Same plot twice: all posts, then only the posts whose score changed.
for frame, density, label in (
    (post_only_df, post_z_values_total, 'all posts'),
    (post_only_df.iloc[posts_only_that_deviate.index], post_z_values_deviate, r'posts w/ $\Delta EmT \neq 0$'),
):
    plot_fluctuations(frame,density,explanation=label)

In [None]:
def plot_fluctuations(fluctuations_df,Z,df_x='score_before',df_y='score_after',title=None,xlabel=None,ylabel=None,explanation='w.r.t comments and posts that deviate'):
    """Show a precomputed density `Z` as an image over two columns of a frame.

    Parameters
    ----------
    fluctuations_df : frame containing the columns `df_x` and `df_y`; their
        min/max give the image extent and axis limits.
    Z : 2-D density grid.
    df_x, df_y : column names for the horizontal / vertical axis. They default
        to 'score_before' / 'score_after' so a call passing only
        (fluctuations_df, Z) keeps working.
    title : figure title; no title is set when omitted.
    xlabel, ylabel : axis labels; default to the column names.
    explanation : accepted for backward compatibility; not used.
    """
    x = fluctuations_df[df_x].to_numpy(dtype=float)
    y = fluctuations_df[df_y].to_numpy(dtype=float)
    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    fig, ax = plt.subplots(1,1,figsize=(5,5))
    # rot90 puts the first grid axis on the horizontal image axis.
    ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
            extent=[xmin, xmax, ymin, ymax])
    ax.set_xlim([xmin, xmax])
    ax.set_ylim([ymin, ymax])

    # Reference lines through zero on both axes.
    ax.plot([0,0],[-1,1],'dimgray',linestyle='dotted')
    ax.plot([-1,1],[0,0],'dimgray',linestyle='dotted')

    ax.set_xlabel(df_x if xlabel is None else xlabel)
    ax.set_ylabel(df_y if ylabel is None else ylabel)
    if title is not None:
        ax.set_title(title)
    plt.show()

In [None]:
# Empty frame with one row per thread and the two score columns.
# NOTE: this rebinds `score_fluct_df`, replacing the merged frame loaded earlier.
score_fluct_df = pd.DataFrame(index=list(range(post_df.shape[0])),columns=['score_before','score_after'])

In [None]:
# For every thread store the first score and the score at index seq_len-1.
# Each value is written with a single .loc[row, column] call; chained indexing
# (`.iloc[row][column] = ...`) assigns into a temporary row object and is not
# guaranteed to reach the frame.
for post in range(post_df.shape[0]):
    thread = post_df.iloc[post]
    score_fluct_df.loc[post, 'score_before'] = thread.score[0]
    score_fluct_df.loc[post, 'score_after'] = thread.score[thread.seq_len - 1]


In [None]:
# Density grid over the first score vs the score at index seq_len-1 of each thread.
z_values_fluct = calculate_KDF(score_fluct_df)

In [None]:
# The plot_fluctuations definition in scope here takes the two column names and
# the labels as arguments, so they are passed explicitly.
plot_fluctuations(score_fluct_df,z_values_fluct,'score_before','score_after',
                  title='score_before vs score_after',
                  xlabel='score_before',ylabel='score_after')

# **3** Confidence Interval Experiments

In [None]:
# Dataset tag used in result file names: 'all' for the four-subreddit run, the
# subreddit name for a single-subreddit run.
if len(SUBREDDITS) == 4:
    SRC_DATASET='all'
if len(SUBREDDITS) == 1:
    SRC_DATASET=SUBREDDITS[0]

# File-name prefix of the split strategy. The predictions file read in the next
# cell needs it, so it is defined here (same rule as in the robustness section).
PREFIX = 'random_strat' if STRATIFIED else 'random'

## Normalizing MEAN and XGB w.r.t. real scores

In [None]:
# Load the stored predictions for this split strategy / dataset; the first CSV
# column is used as the index.
tmp_df = pd.read_csv(f'{RESULTSDIR}{PREFIX}_{SRC_DATASET}_predictions.csv',index_col=0)
tmp_df.head()


In [None]:
# Mean absolute error of each baseline against the true final score, before rescaling.
true_score = tmp_df['final score']
l1_loss_unnormalized_xgb = tmp_df['xgb-mse'].sub(true_score).abs().mean()
l1_loss_unnormalized_mean = tmp_df['mean'].sub(true_score).abs().mean()
print(f"XGB UNNORMALIZED L1 LOSS: {l1_loss_unnormalized_xgb}")
print(f"MEAN UNNORMALIZED L1 LOSS: {l1_loss_unnormalized_mean}")

In [None]:
# Mean and (sample) standard deviation of the true scores and of each baseline.
mu_true, std_true = tmp_df['final score'].mean(), tmp_df['final score'].std()
mu_pred_xgb, std_pred_xgb = tmp_df['xgb-mse'].mean(), tmp_df['xgb-mse'].std()
mu_pred_mean, std_pred_mean = tmp_df['mean'].mean(), tmp_df['mean'].std()

In [None]:
# Standardise each baseline with its own mean/std, then rescale it to the mean
# and std of the true scores. Overwrites the columns in place, so run once.
for column, mu_pred, std_pred in (
    ('xgb-mse', mu_pred_xgb, std_pred_xgb),
    ('mean', mu_pred_mean, std_pred_mean),
):
    tmp_df[column] = (tmp_df[column] - mu_pred)/std_pred * std_true + mu_true

In [None]:
# Mean absolute error of each baseline against the true final score, after rescaling.
true_score = tmp_df['final score']
l1_loss_normalized_xgb = tmp_df['xgb-mse'].sub(true_score).abs().mean()
l1_loss_normalized_mean = tmp_df['mean'].sub(true_score).abs().mean()
print(f"XGB NORMALIZED L1 LOSS: {l1_loss_normalized_xgb}")
print(f"MEAN NORMALIZED L1 LOSS: {l1_loss_normalized_mean}")

In [None]:
def calculate_KDF(fluctuations_df,df_y):
    """Evaluate a 2-D Gaussian KDE of 'final score' against column `df_y`.

    The density is evaluated on a 40x40 grid spanning the min/max of both
    columns. Returns an array of shape (40, 40) whose first axis follows
    'final score' and whose second axis follows `df_y`.
    """
    true_scores = fluctuations_df['final score'].to_numpy(dtype=float)
    predictions = fluctuations_df[df_y].to_numpy(dtype=float)

    # A complex step makes np.mgrid produce 40 points per axis, ends included.
    grid_true, grid_pred = np.mgrid[
        true_scores.min():true_scores.max():40j,
        predictions.min():predictions.max():40j,
    ]
    grid_points = np.vstack([grid_true.ravel(), grid_pred.ravel()])

    density = stats.gaussian_kde(np.vstack([true_scores, predictions]))
    return density(grid_points).T.reshape(grid_true.shape)

In [None]:
def plot_dist(fluctuations_df,subreddits,axs_list):
    """Grid of KDE images: true final score (horizontal) vs prediction (vertical).

    Parameters
    ----------
    fluctuations_df : frame with a 'final score' column and one column per
        entry of `axs_list`; rows of a subreddit are selected with
        `subreddit2range[subreddit]`.
    subreddits : one grid row per entry; the row label comes from
        `subreddit2title` (looked up by lower-cased name).
    axs_list : prediction column names, one grid column per entry.

    The density of every panel is computed with `calculate_KDF(subset, column)`,
    whose first grid axis is 'final score'; the image extent is taken from the
    same subset in the same order, so axis values line up with the density.
    """
    n_rows, n_cols = len(subreddits), len(axs_list)
    # squeeze=False keeps a 2-D axes array even for a single row or column.
    fig, axsl = plt.subplots(n_rows,n_cols,figsize=(4,8),sharey=True,tight_layout=True,squeeze=False)
    fig.text(0.6 , -0.01, r'EmT($c_n$) (true value)', ha='center', fontsize='medium')
    fig.text(-0.01, 0.5 , 'prediction', va='center', rotation='vertical', fontsize='medium')
    for ax, col in zip(axsl[0], axs_list):
        ax.set_title(col.upper(), size='large')
    for ax, row in zip(axsl[:,0], subreddits):
        ax.set_ylabel(subreddit2title[row.lower()], rotation=0, size='large', horizontalalignment='right')

    image = None
    for row_idx, subreddit in enumerate(subreddits):
        # Use the frame that was passed in, restricted to this subreddit.
        subset = fluctuations_df.loc[subreddit2range[subreddit]]
        x = subset['final score'].to_numpy(dtype=float)
        for col_idx, pred_col in enumerate(axs_list):
            ax = axsl[row_idx, col_idx]
            y = subset[pred_col].to_numpy(dtype=float)
            Z = calculate_KDF(subset,pred_col)
            xmin, xmax = x.min(), x.max()
            ymin, ymax = y.min(), y.max()
            # rot90 puts the first grid axis (true score) on the horizontal axis.
            image = ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
                    extent=[xmin, xmax, ymin, ymax],aspect='auto')
            ax.set_xlim([xmin, xmax])
            ax.set_ylim([ymin, ymax])
            ax.axvline(color='dimgray',linestyle='dashed')
            ax.axhline(color='dimgray',linestyle='dashed')

    # The colour bar is attached to the last panel's image.
    cb = plt.colorbar(image, cax = fig.add_axes([0.3, 1.01, .6, 0.01]), orientation='horizontal')
    cb.ax.set_title('density', size='large')
    plt.show()

In [None]:
# NOTE(review): the cells above read and rescale the column 'xgb-mse', while this
# call asks for 'xgb' — confirm the predictions file really has an 'xgb' column.
plot_dist(tmp_df,SUBREDDITS,['mean','xgb'])

## Robustness tests

### Generating 5 different splits

In [None]:
def reset_seeds_test(SEED):
  """Seed every random number generator used in the notebook.

  Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with
  SEED, and puts cuDNN in deterministic mode with auto-tuning disabled so
  repeated runs select the same kernels.
  """
  random.seed(SEED)
  np.random.seed(SEED)
  torch.manual_seed(SEED)
  # Safe without a GPU: the call is ignored when CUDA is unavailable.
  torch.cuda.manual_seed_all(SEED)
  torch.backends.cudnn.deterministic = True
  # Benchmark mode picks kernels by timing them, which can vary between runs.
  torch.backends.cudnn.benchmark = False

In [None]:
def split_indices_test(post_df,seed):
    """Split the row positions of `post_df` into train/validation/test lists.

    With STRATIFIED set, the score at index seq_len-1 of each thread is binned
    (bin width BIN_WIDTH, starting at MIN_VALUE) and two stratified splits carve
    off a validation fold and a test fold of int(0.1 * n) rows each. Otherwise
    each row is assigned at random with probabilities 0.8 / 0.1 / 0.1.
    The generators are re-seeded with `seed` right before the random step.

    NOTE(review): `train_test_split` is not imported in the visible part of the
    notebook — confirm it is in scope.

    Returns (train_inds, valid_inds, test_inds) and prints the three sizes.
    """
    nthreads = len(post_df)     # number of threads

    if not STRATIFIED:
        # divide randomly: one multinomial draw per thread picks its set
        reset_seeds_test(seed)
        assigned_set = np.random.multinomial(1,[.8,.1,.1],nthreads)
        train_inds, valid_inds, test_inds = (
            list(np.argwhere(assigned_set[:,column]).ravel()) for column in range(3)
        )
    else:
        y = post_df.apply(lambda p: p.score[p.seq_len-1], axis=1).values
        bins = np.floor((y - MIN_VALUE)/BIN_WIDTH).astype(int)

        reset_seeds_test(seed)
        fold_size = int(0.1*len(y))
        all_inds = np.arange(nthreads)
        # First carve off validation, then test from what is left.
        remaining_inds,valid_inds,_,_ = train_test_split(all_inds,y,test_size=fold_size,stratify=bins)
        train_inds,test_inds,_,_ = train_test_split(remaining_inds,y[remaining_inds],test_size=fold_size,stratify=bins[remaining_inds])

        train_inds, valid_inds, test_inds = (
            np.sort(part).tolist() for part in (train_inds, valid_inds, test_inds)
        )

    print(f"Number of training examples: {len(train_inds)}")
    print(f"Number of validation examples: {len(valid_inds)}")
    print(f"Number of testing examples: {len(test_inds)}")

    return train_inds, valid_inds, test_inds

In [None]:
# One train/validation/test split is drawn per seed; the test loss of each
# split is appended to test_errors.
seeds = [124,125,126,127,128]
test_errors = []

In [None]:

# TODO: all the constants should be defined or computed in the first cell
suffix='_distilbert_filtered_posts.pkl'
# Reload every configured subreddit; `keys` adds the subreddit name as the
# outer index level.
subreddit_frames = (load_df(DATADIR+subreddit+suffix, MAX_THREAD_LEN)
                    for subreddit in SUBREDDITS)
post_df = pd.concat(subreddit_frames,keys=SUBREDDITS)

# TODO: it seems that we are keeping everything if KEEP_TEXT. is this really necessary?
# Without the text columns only the fields used for modelling are kept.
if not KEEP_TEXT:
    modelling_columns = ['created_utc', 'seq_len','score', 'features', 'filtered_seqlen','valid_branches']
    post_df = post_df[modelling_columns]

# new strategy to construct observations: follow branches of every discussion tree

if FILTERED:
    # Keep only threads that have a filtered length, cast it to int and let it
    # take over the `seq_len` column name.
    post_df = (
        post_df
        .dropna(subset=['filtered_seqlen'])
        .astype({'filtered_seqlen': int})
        .drop(columns='seq_len')
        .rename(columns={'filtered_seqlen':'seq_len'})
    )
else:
    post_df = post_df.drop(columns='filtered_seqlen')

truncated_fraction = (post_df.seq_len>(MAX_THREAD_LEN+1)).mean()
print(f'Fraction of threads that had to be truncated: {truncated_fraction}')


In [None]:
#PART 1
print("PART1")
# Identity normalisation statistics (zeros for the mean, ones for the std).
prefix = 'unnorm_'
shape = [1,post_df.iloc[0].features.shape[-1]]
src_m, src_s = torch.zeros(shape), torch.ones(shape)
# The second-to-last feature column is read as the score statistics.
score_m, score_s = float(src_m[0,-2]), float(src_s[0,-2])

print(f'Average score in dataset is {score_m}')

subreddit2range = get_subreddit_range()
print(subreddit2range[SUBREDDITS[0]])

suffix = "random_strat" if STRATIFIED else "random"

# compute weights for Weighted L1 Loss
subreddit2weights = get_subreddit_weights(post_df, BIN_WIDTH,MIN_VALUE)
print(subreddit2weights)

#PART 2
print("PART2")

print('Creating src')

print('Creating y')
# Target of each thread: the score stored at index seq_len-1.
y = torch.Tensor(post_df.apply(lambda p: p.score[p.seq_len-1], axis=1).values)

print('Creating src_len_series')
src_len_series = post_df.seq_len-1
max_length=MAX_THREAD_LEN

# Inputs: the first seq_len-1 feature rows of each thread (at most
# MAX_THREAD_LEN), normalised with src_m / src_s and padded to a common length.
normalised_threads = []
for index, p in post_df.iterrows():
    kept_rows = min(MAX_THREAD_LEN,p.seq_len-1)
    normalised_threads.append((p.features[:kept_rows,:]-src_m)/src_s)
src = nn.utils.rnn.pad_sequence(normalised_threads, batch_first=True)
del normalised_threads
print(f"GRU src tensor size: {src.size()}")

print(f'y tensor size: {y.size()}')
tgt = None

print('Creating dataset')
if USE_GRU:
    dataset = createTensorDataset(src, src_len_series, y, max_length=max_length) # all threads, lim 63 comments
    del src

del src_len_series, tgt, y

In [None]:
# Normalise SUBREDDITS to a list. A bare string is wrapped, because list() on a
# string would split it into single characters.
if isinstance(SUBREDDITS, str):
    SUBREDDITS = [SUBREDDITS]
elif type(SUBREDDITS) != list:
    SUBREDDITS = list(SUBREDDITS)
N_EPOCHS=20 # maximum number of epochs to train the model
PATIENCE=3 # constant that controls the Early Stopping mechanism of the grid search for the GRU model
SRC_DATASET='all'
PREFIX ='random'
if STRATIFIED:
    PREFIX += '_strat'
results_filename=f'{RESULTSDIR}{PREFIX}_{SRC_DATASET}_b{int(2./BIN_WIDTH):02}_f{FILTERED}_n{N_EPOCHS}_p{PATIENCE}_mGRUv1.pkl'
print(f'Results file is {results_filename}')

# Grid-search results; every iteration below selects its model from this frame,
# so it has to be loaded before the loop.
results_df = pd.read_pickle(results_filename)

for curr_seed in seeds:
    # A fresh split for every seed.
    train_inds, valid_inds, test_inds  = split_indices_test(post_df,curr_seed)

    #PART 3
    print("PART3")
    train_loader = DataLoader(Subset(dataset,train_inds), batch_size=32, shuffle=True, num_workers=1)
    valid_loader = DataLoader(Subset(dataset,valid_inds), batch_size=len(valid_inds), shuffle=False, num_workers=1)
    test_loader  = DataLoader(Subset(dataset, test_inds), batch_size=len(test_inds), shuffle=False, num_workers=1)

    #PART 4
    print("PART4")
    # load best model

    RETRAIN = False
    USE_LOSS_BEST = False
    USE_BEST_VALIDATION = True
    criterion = WeightedL1Loss(subreddit2weights['all'], BIN_WIDTH, MIN_VALUE)
    #criterion = nn.MSELoss()
    criterion_name = lambda x: x.__class__.__name__.split('.')[-1]

    # Pick the row with the smallest loss. argmin() returns a position, so the
    # row is fetched with iloc (loc would interpret the number as a label).
    loss_column = criterion_name(criterion)+'_best' if USE_LOSS_BEST else criterion_name(criterion)
    best_result = results_df.iloc[results_df[loss_column].argmin()]

    # uncomment to load specific model instead
    # best_result = results_df.iloc[0]
    if RETRAIN:
        # NOTE(review): reset_seeds is not defined in the visible part of this
        # notebook — confirm it is in scope before enabling RETRAIN.
        reset_seeds()
        model = models.GRUSentiment(best_result.params)
        _, valid_loss, _ = train_over_nepochs( model, train_loader, valid_loader,
                                            criterion=criterion, device=device,
                                            patience=PATIENCE, n_epochs=N_EPOCHS)

        if USE_BEST_VALIDATION:
            model.load_state_dict(torch.load('checkpoint.pt', map_location=lambda storage, loc: storage))

    else:
        # Reuse the weights stored with the grid-search result.
        model = models.GRUSentiment(best_result.params)
        if USE_BEST_VALIDATION:
            model.load_state_dict(best_result.para)
        else:
            model.load_state_dict(best_result.para_best)

    model.to(device)

    test_loss, outputs = evaluate(model, iter(test_loader), criterion=criterion, device=device, return_predictions=True)
    model_yhat = outputs[0][0].cpu().numpy() # extract data from outputs
    ilocs = outputs[0][2].cpu().numpy()
    model_series = pd.Series(model_yhat.ravel(),index=ilocs)
    del outputs

    # print model params
    print(best_result.params)
    print(f'Test loss: {test_loss:.3f}')
    test_errors.append(test_loss)

In [None]:
# Mean and standard deviation of the test loss across the splits.
print("Results:")
split_error_mean = np.mean(test_errors)
print(f"Test Errors mean: {split_error_mean}")
split_error_std = np.std(test_errors)
print(f"Test Errors std: {split_error_std}")

### Randomly initializing weights

In [None]:
# One training run is made per seed on a fixed split; the test loss of each
# run is appended to test_errors2.
seeds = [124,125,126,127,128]
test_errors2 = []

In [None]:

# One fixed split (seed 1234) shared by every run; only the seed used for
# weight initialisation and training changes inside the loop.
train_inds, valid_inds, test_inds  = split_indices_test(post_df,1234)
#PART 3
print("PART3")
train_loader = DataLoader(Subset(dataset,train_inds), batch_size=32, shuffle=True, num_workers=1)
valid_loader = DataLoader(Subset(dataset,valid_inds), batch_size=len(valid_inds), shuffle=False, num_workers=1)
test_loader  = DataLoader(Subset(dataset, test_inds), batch_size=len(test_inds), shuffle=False, num_workers=1)

# Loop-invariant setup: the loss, the grid-search results and the selected
# hyper-parameters are the same for every seed, so they are prepared once.
criterion = WeightedL1Loss(subreddit2weights['all'], BIN_WIDTH, MIN_VALUE)
#criterion = nn.MSELoss()
criterion_name = lambda x: x.__class__.__name__.split('.')[-1]
results_df = pd.read_pickle(results_filename)
# argmin() returns a position, so the row is fetched with iloc.
best_result = results_df.iloc[results_df[criterion_name(criterion)].argmin()]

for curr_seed in seeds:
    #PART 4
    print("PART4")
    # Seed right before the model is built so its initial weights depend on curr_seed.
    reset_seeds_test(curr_seed)
    model = models.GRUSentiment(best_result.params)
    _, valid_loss, _ = train_over_nepochs( model, train_loader, valid_loader,
                                        criterion=criterion, device=device,
                                        patience=PATIENCE, n_epochs=N_EPOCHS)

    # Evaluate the weights stored in checkpoint.pt.
    # NOTE(review): presumably written by train_over_nepochs at the best
    # validation loss — confirm.
    model.load_state_dict(torch.load('checkpoint.pt', map_location=lambda storage, loc: storage))
    model.to(device)
    test_loss, outputs = evaluate(model, iter(test_loader), criterion=criterion, device=device, return_predictions=True)
    model_yhat = outputs[0][0].cpu().numpy() # extract data from outputs
    ilocs = outputs[0][2].cpu().numpy()
    model_series = pd.Series(model_yhat.ravel(),index=ilocs)
    del outputs
    # print model params
    print(best_result.params)
    print(f'Test loss: {test_loss:.3f}')
    test_errors2.append(test_loss)

In [None]:
# Mean and standard deviation of the test loss across the initialisations.
print("Results:")
init_error_mean = np.mean(test_errors2)
print(f"Test Errors mean: {init_error_mean}")
init_error_std = np.std(test_errors2)
print(f"Test Errors std: {init_error_std}")