# Libraries

In [99]:
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from itertools import islice
from tqdm import tqdm
tqdm.pandas()

# Read Data

## Lexicon

In [430]:
vad_lexicon = pd.read_csv('./data/vad/NRC-VAD-Lexicon-v2.1-edited.txt', sep='\t', header=0, keep_default_na=False, na_values=[])

In [431]:
# filter vad_lexicon to only include rows where there are duplicate 'term' values
vad_lexicon_dup = vad_lexicon[vad_lexicon['term'].isin(vad_lexicon['term'][vad_lexicon['term'].duplicated()])]
vad_lexicon_dup


Unnamed: 0,term,valence,arousal,dominance


### 5-gram lexicons

In [432]:
# filter only 5-grams lexicons
five_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 5].copy()

### 4-gram lexicons

In [433]:
# filter only 4-grams lexicons
four_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 4].copy()

### 3-gram lexicons

In [434]:
# filter only 3-grams lexicons
three_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 3].copy()

### 2-gram lexicons

In [435]:
# filter only 2-grams lexicons
two_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 2].copy()

### 1-gram lexicons

In [436]:
# filter only 1-gram lexicons
one_gram_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 1].copy()

## Mental Health Severity Data

### Unbounded

In [360]:
mental_health_severity_table = pd.read_csv('./data/BeyondBlue/commented_post_authors_edited.csv', header=0)

In [361]:
mental_health_severity_table

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,1,1765
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,0,161
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,0,359
...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,0,0
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,0,2
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,0,30
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,1,23


### Bounded (14 days)

In [362]:
mental_health_severity_table_max14days = pd.read_csv('./data/BeyondBlue/commented_post_authors_max14days.csv', header=0)

In [363]:
mental_health_severity_table_max14days.head()

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,0,9
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,0,1
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,0,2


# Main

## Check the Maximum N-gram in 'term'

In [13]:
# check the maximum n-gram in 'term'
max(vad_lexicon['term'].apply(lambda x: len(x.split(' '))))

5

## `vad()`

In [528]:
def vad(
        text, 
        five_grams_lexicon=five_grams_lexicon, 
        four_grams_lexicon=four_grams_lexicon, 
        three_grams_lexicon=three_grams_lexicon, 
        two_grams_lexicon=two_grams_lexicon, 
        one_gram_lexicon=one_gram_lexicon,
        polar_subset=False):
    """
    Calculate Valence, Arousal, and Dominance (VAD) scores for a given text using the NRC VAD Lexicon.

    Parameters:
    text (str): The input text to analyze.
    lexicon (pd.DataFrame): The VAD lexicon DataFrame with columns 'term', 'valence', 'arousal', 'dominance'.

    Returns:
    dict: A dictionary with average 'valence', 'arousal', and 'dominance' scores.
    """
    # Preprocess the text: lowercase and split into words
    words = word_tokenize(text.lower())
    
    # Initialize lists to store VAD scores
    valence_scores = []
    arousal_scores = []
    dominance_scores = []

    # initialize list to store matched n-grams and set to store their positions
    matched_ngrams = []
    matched_ngrams_arousal = []
    matched_ngrams_dominance = []
    matched_ngrams_valence = []
    matched_positions = set()
    matched_positions_arousal = set()
    matched_positions_dominance = set()
    matched_positions_valence = set()

    ## 5-grams
    # check for 5-grams of text in lexicon
    five_grams = list(ngrams(words, 5))
    five_grams = [' '.join(gram) for gram in five_grams]

    # record the positions of words that are part of matched 5-grams
    five_grams_positions = list(ngrams(range(len(words)), 5))
    five_grams_positions = [set(pos) for pos in five_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        five_grams_lexicon_arousal = five_grams_lexicon[(five_grams_lexicon['arousal']<=-0.333) | (five_grams_lexicon['arousal']>=0.333)].copy()
        five_grams_lexicon_dominance = five_grams_lexicon[(five_grams_lexicon['dominance']<=-0.333) | (five_grams_lexicon['dominance']>=0.333)].copy()
        five_grams_lexicon_valence = five_grams_lexicon[(five_grams_lexicon['valence']<=-0.333) | (five_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 5-gram when polar_subset is True
        matched_positions_arousal_5gram = set()
        matched_positions_dominance_5gram = set()
        matched_positions_valence_5gram = set()
    else:
        matched_positions_5gram = set()

    # check for matches in 5-grams
    for gram, pos in zip(five_grams, five_grams_positions):
        if polar_subset:
            match_arousal = five_grams_lexicon_arousal[five_grams_lexicon_arousal['term'] == gram]
            match_dominance = five_grams_lexicon_dominance[five_grams_lexicon_dominance['term'] == gram]
            match_valence = five_grams_lexicon_valence[five_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_5gram.update(pos)
            if not match_arousal.empty:
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_5gram.update(pos)
            if not match_dominance.empty:
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_5gram.update(pos)
        else:
            match = five_grams_lexicon[five_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_5gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 5-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_5gram)
        matched_positions_arousal.update(matched_positions_arousal_5gram)
        matched_positions_dominance.update(matched_positions_dominance_5gram)
    else:
        matched_positions.update(matched_positions_5gram)

    ## 4-grams
    # check for 4-grams of text in lexicon
    four_grams = list(ngrams(words, 4))
    four_grams = [' '.join(gram) for gram in four_grams]

    # record the positions of words that are part of matched 4-grams
    four_grams_positions = list(ngrams(range(len(words)), 4))
    four_grams_positions = [set(pos) for pos in four_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        four_grams_lexicon_arousal = four_grams_lexicon[(four_grams_lexicon['arousal']<=-0.333) | (four_grams_lexicon['arousal']>=0.333)].copy()
        four_grams_lexicon_dominance = four_grams_lexicon[(four_grams_lexicon['dominance']<=-0.333) | (four_grams_lexicon['dominance']>=0.333)].copy()
        four_grams_lexicon_valence = four_grams_lexicon[(four_grams_lexicon['valence']<=-0.333) | (four_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 4-gram when polar_subset is True
        matched_positions_arousal_4gram = set()
        matched_positions_dominance_4gram = set()
        matched_positions_valence_4gram = set()
    else:
        matched_positions_4gram = set()

    # check for matches in 4-grams
    for gram, pos in zip(four_grams, four_grams_positions):
        if polar_subset:
            match_arousal = four_grams_lexicon_arousal[four_grams_lexicon_arousal['term'] == gram]
            match_dominance = four_grams_lexicon_dominance[four_grams_lexicon_dominance['term'] == gram]
            match_valence = four_grams_lexicon_valence[four_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_4gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_4gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_4gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 4-gram is part of a matched n-gram
            match = four_grams_lexicon[four_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_4gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 4-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_4gram)
        matched_positions_arousal.update(matched_positions_arousal_4gram)
        matched_positions_dominance.update(matched_positions_dominance_4gram)
    else:
        matched_positions.update(matched_positions_4gram)
    
    ## 3-grams
    # check for 3-grams of text in lexicon
    three_grams = list(ngrams(words, 3))
    three_grams = [' '.join(gram) for gram in three_grams]

    # record the positions of words that are part of matched 3-grams
    three_grams_positions = list(ngrams(range(len(words)), 3))
    three_grams_positions = [set(pos) for pos in three_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        three_grams_lexicon_arousal = three_grams_lexicon[(three_grams_lexicon['arousal']<=-0.333) | (three_grams_lexicon['arousal']>=0.333)].copy()
        three_grams_lexicon_dominance = three_grams_lexicon[(three_grams_lexicon['dominance']<=-0.333) | (three_grams_lexicon['dominance']>=0.333)].copy()
        three_grams_lexicon_valence = three_grams_lexicon[(three_grams_lexicon['valence']<=-0.333) | (three_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 3-gram when polar_subset is True
        matched_positions_arousal_3gram = set()
        matched_positions_dominance_3gram = set()
        matched_positions_valence_3gram = set()
    else:
        matched_positions_3gram = set()

    # check for matches in 3-grams
    for gram, pos in zip(three_grams, three_grams_positions):
        if polar_subset:
            match_arousal = three_grams_lexicon_arousal[three_grams_lexicon_arousal['term'] == gram]
            match_dominance = three_grams_lexicon_dominance[three_grams_lexicon_dominance['term'] == gram]
            match_valence = three_grams_lexicon_valence[three_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_3gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_3gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_3gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 3-gram is part of a matched n-gram
            match = three_grams_lexicon[three_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_3gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 3-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_3gram)
        matched_positions_arousal.update(matched_positions_arousal_3gram)
        matched_positions_dominance.update(matched_positions_dominance_3gram)
    else:
        matched_positions.update(matched_positions_3gram)
    
    ## 2-grams
    # check for 2-grams of text in lexicon, excluding words already part of matched 3-grams
    two_grams = list(ngrams(words, 2))
    two_grams = [' '.join(gram) for gram in two_grams]

    # record the positions of words that are part of matched 2-grams
    two_grams_positions = list(ngrams(range(len(words)), 2))
    two_grams_positions = [set(pos) for pos in two_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        two_grams_lexicon_arousal = two_grams_lexicon[(two_grams_lexicon['arousal']<=-0.333) | (two_grams_lexicon['arousal']>=0.333)].copy()
        two_grams_lexicon_dominance = two_grams_lexicon[(two_grams_lexicon['dominance']<=-0.333) | (two_grams_lexicon['dominance']>=0.333)].copy()
        two_grams_lexicon_valence = two_grams_lexicon[(two_grams_lexicon['valence']<=-0.333) | (two_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 2-gram when polar_subset is True
        matched_positions_arousal_2gram = set()
        matched_positions_dominance_2gram = set()
        matched_positions_valence_2gram = set()
    else:
        matched_positions_2gram = set()

    # check for matches in 2-grams
    for gram, pos in zip(two_grams, two_grams_positions):        
        if polar_subset:
            match_arousal = two_grams_lexicon_arousal[two_grams_lexicon_arousal['term'] == gram]
            match_dominance = two_grams_lexicon_dominance[two_grams_lexicon_dominance['term'] == gram]
            match_valence = two_grams_lexicon_valence[two_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_2gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_2gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_2gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 2-gram is part of a matched n-gram
            match = two_grams_lexicon[two_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_2gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 2-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_2gram)
        matched_positions_arousal.update(matched_positions_arousal_2gram)
        matched_positions_dominance.update(matched_positions_dominance_2gram)
    else:
        matched_positions.update(matched_positions_2gram)

    ## 1-grams
    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        one_gram_lexicon_arousal = one_gram_lexicon[(one_gram_lexicon['arousal']<=-0.333) | (one_gram_lexicon['arousal']>=0.333)].copy()
        one_gram_lexicon_dominance = one_gram_lexicon[(one_gram_lexicon['dominance']<=-0.333) | (one_gram_lexicon['dominance']>=0.333)].copy()
        one_gram_lexicon_valence = one_gram_lexicon[(one_gram_lexicon['valence']<=-0.333) | (one_gram_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 1-gram when polar_subset is True
        matched_positions_arousal_1gram = set()
        matched_positions_dominance_1gram = set()
        matched_positions_valence_1gram = set()
    else:
        matched_positions_1gram = set()

    # check for unigrams of text in lexicon, excluding words already part of matched n-grams
    for i, word in enumerate(words):        
        if polar_subset:
            match_arousal = one_gram_lexicon_arousal[one_gram_lexicon_arousal['term'] == word]
            match_dominance = one_gram_lexicon_dominance[one_gram_lexicon_dominance['term'] == word]
            match_valence = one_gram_lexicon_valence[one_gram_lexicon_valence['term'] == word]
            if not match_valence.empty:
                if i in matched_positions_valence:
                    continue  # Skip if the word is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(word)
                matched_positions_valence.add(i)
            if not match_arousal.empty:
                if i in matched_positions_arousal:
                    continue  # Skip if the word is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(word)
                matched_positions_arousal.add(i)
            if not match_dominance.empty:
                if i in matched_positions_dominance:
                    continue  # Skip if the word is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(word)
                matched_positions_dominance.add(i)
        else:
            if i in matched_positions:
                continue  # Skip if the word is part of a matched n-gram
            match = one_gram_lexicon[one_gram_lexicon['term'] == word]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(word)
                matched_positions.add(i)
                
    # update the main matched positions sets with the temporary sets for this 1-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_1gram)
        matched_positions_arousal.update(matched_positions_arousal_1gram)
        matched_positions_dominance.update(matched_positions_dominance_1gram)
    else:
        matched_positions.update(matched_positions_1gram)

    # Calculate average scores, return None if no scores found
    avg_valence = sum(valence_scores) / len(valence_scores) if valence_scores else None
    avg_arousal = sum(arousal_scores) / len(arousal_scores) if arousal_scores else None
    avg_dominance = sum(dominance_scores) / len(dominance_scores) if dominance_scores else None
    
    return {
        'valence': avg_valence,
        'arousal': avg_arousal,
        'dominance': avg_dominance
    }
    # return matched_ngrams_valence, valence_scores

## `vad2()`

a more efficient vad()

In [525]:
POLAR_THRESH = 0.333  # abs(value) >= threshold is considered polar

def _build_index(df):
    """
    Convert a lexicon DataFrame to a dict: term -> (valence, arousal, dominance).
    Assumes columns: ['term','valence','arousal','dominance'].
    """
    # normalize once to lowercase to match tokenization step
    # if your 'term' is already lowercase, this is still harmless
    return {str(t).lower(): (str(t).lower(), float(v), float(a), float(d))
            for t, v, a, d in zip(df['term'], df['valence'], df['arousal'], df['dominance'])}

def vad2(
    text,
    five_grams_lexicon=five_grams_lexicon,
    four_grams_lexicon=four_grams_lexicon,
    three_grams_lexicon=three_grams_lexicon,
    two_grams_lexicon=two_grams_lexicon,
    one_gram_lexicon=one_gram_lexicon,
    polar_subset=False,
    tokenizer=word_tokenize,
):
    """
    Compute average VAD with longest-match wins across 5→1 grams.
    If polar_subset=True, only include a dimension's score when that dimension is polar for the term.
    """
    # 1) Tokenize once
    words = tokenizer(text.lower())

    # 2) Build fast lookup dicts once (O(n) upfront, O(1) per lookup after)
    idx = {
        5: _build_index(five_grams_lexicon),
        4: _build_index(four_grams_lexicon),
        3: _build_index(three_grams_lexicon),
        2: _build_index(two_grams_lexicon),
        1: _build_index(one_gram_lexicon),
    }

    valence_scores, arousal_scores, dominance_scores = [], [], []
    matched_ngrams, matched_ngrams_valence, matched_ngrams_arousal, matched_ngrams_dominance = [], [], [], []
    if polar_subset:
        matched_positions_valence = set()
        matched_positions_arousal = set()
        matched_positions_dominance = set()
    else:
        matched_positions = set()

    # 3) Sweep n-grams from longest to shortest
    N = len(words)
    for n in (5, 4, 3, 2, 1):
        if N < n:
            continue
        lex = idx[n]
        if polar_subset:
            n_gram_matched_positions_valence = set()
            n_gram_matched_positions_arousal = set()
            n_gram_matched_positions_dominance = set()
        else:
            n_gram_matched_positions = set()
        # slide a window of length n
        for start in range(0, N - n + 1):
            span = range(start, start + n)
            # skip if any token already consumed by a longer match
            if (not polar_subset) and matched_positions.intersection(span):
                continue
            term = ' '.join(words[start:start + n])
            triplet = lex.get(term)
            if triplet is not None:
                t, v, a, d = triplet
                if not polar_subset:
                    valence_scores.append(v)
                    arousal_scores.append(a)
                    dominance_scores.append(d)
                    matched_ngrams.append(t)
                    n_gram_matched_positions.update(span)
                else:
                    if abs(v) >= POLAR_THRESH:
                        if matched_positions_valence.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        valence_scores.append(v)
                        matched_ngrams_valence.append(t)
                        n_gram_matched_positions_valence.update(span)
                    if abs(a) >= POLAR_THRESH:
                        if matched_positions_arousal.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        arousal_scores.append(a)
                        matched_ngrams_arousal.append(t)
                        n_gram_matched_positions_arousal.update(span)
                    if abs(d) >= POLAR_THRESH:
                        if matched_positions_dominance.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        dominance_scores.append(d)
                        matched_ngrams_dominance.append(t)
                        n_gram_matched_positions_dominance.update(span)
        # update matched positions
        if polar_subset:
            matched_positions_valence.update(n_gram_matched_positions_valence)
            matched_positions_arousal.update(n_gram_matched_positions_arousal)
            matched_positions_dominance.update(n_gram_matched_positions_dominance)
        else:
            matched_positions.update(n_gram_matched_positions)

    # 4) Averages (None if empty)
    avg_valence   = (sum(valence_scores) / len(valence_scores)) if valence_scores else None
    avg_arousal   = (sum(arousal_scores) / len(arousal_scores)) if arousal_scores else None
    avg_dominance = (sum(dominance_scores) / len(dominance_scores)) if dominance_scores else None

    return {
        'valence': avg_valence,
        'arousal': avg_arousal,
        'dominance': avg_dominance,
    }
    # return matched_ngrams_valence, valence_scores

## Add VAD Scores and Diffs to `commented_post_authors_edited` (`mental_health_severity_table_1`)

This adds the VAD scores and the difference between post and comment VAD to `commented_post_authors_edited.csv` data. This will then be saved to `mentalh_healt_severity_table_1.csv`

In [524]:
mental_health_severity_table_1 = mental_health_severity_table.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_1[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_1['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x)))
mental_health_severity_table_1[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_1['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x)))
# add VAD scores diff between post and comment
mental_health_severity_table_1['Valence_Diff'] = mental_health_severity_table_1['Comment_Valence'] - mental_health_severity_table_1['Post_Valence']
mental_health_severity_table_1['Arousal_Diff'] = mental_health_severity_table_1['Comment_Arousal'] - mental_health_severity_table_1['Post_Arousal']
mental_health_severity_table_1['Dominance_Diff'] = mental_health_severity_table_1['Comment_Dominance'] - mental_health_severity_table_1['Post_Dominance']
mental_health_severity_table_1.head()

  0%|          | 3/8811 [00:00<18:47,  7.81it/s]

Checking 2-gram: 'parental anxiety' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'anxiety .' at positions [1, 2]
Found triplet: None
Checking 2-gram: '. hi' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'hi everyone' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'everyone ,' at positions [4, 5]
Found triplet: None
Checking 2-gram: ', i' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'i am' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'am not' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'not quite' at positions [8, 9]
Found triplet: ('not quite', -0.656, -0.3, -0.516)
Checking 2-gram: 'quite sure' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'sure how' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'how to' at positions [11, 12]
Found triplet: ('how to', 0.134, -0.052, -0.04)
Checking 2-gram: 'to put' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'put this' at positions [13, 14]
Fo

  0%|          | 4/8811 [00:00<20:21,  7.21it/s]

Checking 2-gram: 'new relationship' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'relationship anxiety' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'anxiety .' at positions [2, 3]
Found triplet: None
Checking 2-gram: '. dear' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'dear adjust' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'adjust need' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'need a' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'a connection' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'connection to' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'to feel' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'feel the' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'the support' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'support .' at positions [12, 13]
Found triplet: None
Checking 2-gram: '. thanks' at positions [13, 14]
Found triplet: None
Chec

  0%|          | 8/8811 [00:00<14:12, 10.33it/s]

Found triplet: None
Checking 2-gram: 'best as' at positions [23, 24]
Found triplet: None
Checking 2-gram: 'as i' at positions [24, 25]
Found triplet: None
Checking 2-gram: 'i could' at positions [25, 26]
Found triplet: None
Checking 2-gram: 'could .' at positions [26, 27]
Found triplet: None
Checking 2-gram: '. i' at positions [27, 28]
Found triplet: None
Checking 2-gram: 'i was' at positions [28, 29]
Found triplet: None
Checking 2-gram: 'was diagnosed' at positions [29, 30]
Found triplet: None
Checking 2-gram: 'diagnosed with' at positions [30, 31]
Found triplet: None
Checking 2-gram: 'with gad' at positions [31, 32]
Found triplet: None
Checking 2-gram: 'gad and' at positions [32, 33]
Found triplet: None
Checking 2-gram: 'and its' at positions [33, 34]
Found triplet: None
Checking 2-gram: 'its ,' at positions [34, 35]
Found triplet: None
Checking 2-gram: ', and' at positions [35, 36]
Found triplet: None
Checking 2-gram: 'and i' at positions [36, 37]
Found triplet: None
Checking 2-gram

  0%|          | 12/8811 [00:01<12:50, 11.43it/s]

Checking 2-gram: 'anxiety and' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'and old' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'old thoughts' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'thoughts about' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'about past' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'past .' at positions [5, 6]
Found triplet: None
Checking 2-gram: '. hi' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'hi all' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'all and' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'and sorry' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'sorry in' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'in advance' at positions [11, 12]
Found triplet: ('in advance', 0.354, -0.258, 0.13)
Checking 2-gram: 'advance for' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'for my' at positions [13, 14]
Found triplet: None

  0%|          | 14/8811 [00:01<12:49, 11.44it/s]

Checking 2-gram: 'health anxiety' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'anxiety about' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'about teeth' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'teeth .' at positions [3, 4]
Found triplet: None
Checking 2-gram: '. hi' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'hi ,' at positions [5, 6]
Found triplet: None
Checking 2-gram: ', i' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'i have' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'have always' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'always had' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'had anxiety' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'anxiety but' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'but in' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'in the' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'the last' at pos

  0%|          | 18/8811 [00:01<12:09, 12.05it/s]

Checking 2-gram: 'intrusive thought' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'thought .' at positions [1, 2]
Found triplet: None
Checking 2-gram: '. i' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'i am' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'am in' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'in my' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'my mid' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'mid 20s' at positions [7, 8]
Found triplet: None
Checking 2-gram: '20s and' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'and i' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'i only' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'only just' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'just started' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'started having' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'having hurtful' at posit

  0%|          | 22/8811 [00:02<12:37, 11.60it/s]

Checking 2-gram: 'serious fatigue' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'fatigue .' at positions [1, 2]
Found triplet: None
Checking 2-gram: '. hi' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'hi all' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'all ,' at positions [4, 5]
Found triplet: None
Checking 2-gram: ', i' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'i am' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'am trying' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'trying to' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'to get' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'get this' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'this down' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'down although' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'although even' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'even moving' at posi

  0%|          | 24/8811 [00:02<12:47, 11.45it/s]

Checking 2-gram: 'has anyone' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'anyone been' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'been successful' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'successful with' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'with a' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'a return' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'return to' at positions [6, 7]
Found triplet: ('return to', -0.334, -0.046, -0.354)
Checking 2-gram: 'to work' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'work a' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'a claim' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'claim for' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'for anxiety' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'anxiety caused' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'caused by' at positions [13, 14]
F

  0%|          | 28/8811 [00:02<12:24, 11.79it/s]

Checking 2-gram: 'how can' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'can i' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'i let' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'let my' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'my mum' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'mum move' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'move out' at positions [6, 7]
Found triplet: ('move out', -0.5, 0.172, -0.13)
Checking 2-gram: 'out of' at positions [7, 8]
Found triplet: ('out of', -0.314, -0.192, -0.334)
Checking 2-gram: 'of my' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'my house' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'house ?' at positions [10, 11]
Found triplet: None
Checking 2-gram: '? .' at positions [11, 12]
Found triplet: None
Checking 2-gram: '. because' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'because i' at positions [13, 14]
Found triplet: None
C

  0%|          | 32/8811 [00:02<13:40, 10.70it/s]

Checking 2-gram: 'i do' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'do not' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'not feel' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'feel like' at positions [3, 4]
Found triplet: ('feel like', 0.2, -0.22, 0.038)
Checking 2-gram: 'like me' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'me anymore' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'anymore .' at positions [6, 7]
Found triplet: None
Checking 2-gram: '. hey' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'hey i' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'i am' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'am dee' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'dee ,' at positions [11, 12]
Found triplet: None
Checking 2-gram: ', i' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'i actually' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'actually am' a

  0%|          | 34/8811 [00:03<13:29, 10.84it/s]

Checking 2-gram: 'extremely overwhelmed' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'overwhelmed and' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'and under' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'under pressure-wanting' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'pressure-wanting to' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'to be' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'be left' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'left in' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'in peace' at positions [8, 9]
Found triplet: ('in peace', 0.536, -0.412, 0.286)
Checking 2-gram: 'peace .' at positions [9, 10]
Found triplet: None
Checking 2-gram: '. hi' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'hi everyone' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'everyone i' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'i am' at positions [

  0%|          | 38/8811 [00:03<14:06, 10.36it/s]

Checking 2-gram: 'my story' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'story and' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'and no' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'no help' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'help .' at positions [4, 5]
Found triplet: None
Checking 2-gram: '. and' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'and hated' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'hated for' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'for it' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'it ,' at positions [9, 10]
Found triplet: None
Checking 2-gram: ', this' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'this is' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'is my' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'my story' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'story .' at positions [14, 15]
Found t

  0%|          | 42/8811 [00:03<13:02, 11.21it/s]

Checking 2-gram: 'anxiety .' at positions [0, 1]
Found triplet: None
Checking 2-gram: '. hello' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'hello i' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'i struggle' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'struggle with' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'with very' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'very bad' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'bad anxietyand' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'anxietyand panic' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'panic attacks' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'attacks i' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'i finally' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'finally have' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'have a' at positions [13, 14]
Found triplet: None
Checking 

  0%|          | 44/8811 [00:04<12:50, 11.38it/s]

Checking 2-gram: 'just need' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'need to' at positions [1, 2]
Found triplet: ('need to', 0.162, -0.046, -0.138)
Checking 2-gram: 'to put' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'put it' at positions [3, 4]
Found triplet: ('put it', -0.096, -0.18, -0.156)
Checking 2-gram: 'it out' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'out there' at positions [5, 6]
Found triplet: ('out there', -0.366, -0.09, -0.428)
Checking 2-gram: 'there .' at positions [6, 7]
Found triplet: None
Checking 2-gram: '. hi' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'hi ,' at positions [8, 9]
Found triplet: None
Checking 2-gram: ', sorry' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'sorry i' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'i have' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'have never' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'never really' at

  1%|          | 48/8811 [00:04<12:58, 11.25it/s]

Checking 2-gram: 'scared again' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'again and' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'and do' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'do not' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'not know' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'know where' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'where i' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'i am' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'am headed' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'headed .' at positions [9, 10]
Found triplet: None
Checking 2-gram: '. hi' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'hi and' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'and best' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'best wishes' at positions [13, 14]
Found triplet: ('best wishes', 0.976, 0.082, 0.482)
Checking 2-

  1%|          | 50/8811 [00:04<13:11, 11.06it/s]

Checking 2-gram: 'work related' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'related stress' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'stress working' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'working in' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'in defense' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'defense office😳😰' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'office😳😰 .' at positions [6, 7]
Found triplet: None
Checking 2-gram: '. hi' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'hi ,' at positions [8, 9]
Found triplet: None
Checking 2-gram: ', i' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'i have' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'have not' at positions [11, 12]
Found triplet: ('have not', -0.6, -0.242, -0.556)
Checking 2-gram: 'not been' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'been on' at positions [13, 14]
Found triplet: 

  1%|          | 54/8811 [00:04<13:14, 11.03it/s]

Checking 2-gram: 'feeling overwhelmed' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'overwhelmed .' at positions [1, 2]
Found triplet: None
Checking 2-gram: '. i' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'i am' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'am feeling' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'feeling very' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'very depressed' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'depressed and' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'and spent' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'spent last' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'last 2' at positions [10, 11]
Found triplet: None
Checking 2-gram: '2 days' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'days in' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'in bed' at positions [13, 14]
Found triplet: ('in bed', 0.138, -0.456

  1%|          | 56/8811 [00:05<13:45, 10.61it/s]

Checking 2-gram: 'new job' at positions [0, 1]
Found triplet: ('new job', 0.51, 0.286, 0.458)
Checking 2-gram: 'job opportunity' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'opportunity anxiety' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'anxiety .' at positions [3, 4]
Found triplet: None
Checking 2-gram: '. i' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'i work' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'work at' at positions [6, 7]
Found triplet: ('work at', 0.29, 0.066, 0.194)
Checking 2-gram: 'at a' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'a school' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'school (' at positions [9, 10]
Found triplet: None
Checking 2-gram: '( not' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'not a' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'a teacher' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'teacher )' at positions [13, 14]
Fou

  1%|          | 60/8811 [00:05<12:40, 11.50it/s]

Checking 2-gram: 'new year' at positions [0, 1]
Found triplet: ('new year', 0.532, -0.246, 0.438)
Checking 2-gram: 'year not' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'not so' at positions [2, 3]
Found triplet: ('not so', -0.518, -0.334, -0.484)
Checking 2-gram: 'so good' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'good .' at positions [4, 5]
Found triplet: None
Checking 2-gram: '. hi' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'hi ,' at positions [6, 7]
Found triplet: None
Checking 2-gram: ', i' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'i have' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'have chronic' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'chronic anxiety' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'anxiety and' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'and my' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'my triggers' at positions [13, 14]
Foun

  1%|          | 62/8811 [00:05<13:10, 11.07it/s]

Checking 2-gram: 'help ...' at positions [0, 1]
Found triplet: None
Checking 2-gram: '... internal' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'internal tremors/buzzing' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'tremors/buzzing .' at positions [3, 4]
Found triplet: None
Checking 2-gram: '. hey' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'hey all' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'all ...' at positions [6, 7]
Found triplet: None
Checking 2-gram: '... i' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'i have' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'have been' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'been getting' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'getting this' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'this really' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'really odd' at positions [13, 14]
Found triplet: None
Checki

  1%|          | 66/8811 [00:05<12:06, 12.05it/s]

Checking 2-gram: 'anxiety+comparing oneself' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'oneself .' at positions [1, 2]
Found triplet: None
Checking 2-gram: '. hi' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'hi all' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'all ,' at positions [4, 5]
Found triplet: None
Checking 2-gram: ', i' at positions [5, 6]
Found triplet: None
Checking 2-gram: 'i just' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'just wanted' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'wanted another' at positions [8, 9]
Found triplet: None
Checking 2-gram: 'another person' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'person 's' at positions [10, 11]
Found triplet: None
Checking 2-gram: ''s opinion' at positions [11, 12]
Found triplet: None
Checking 2-gram: 'opinion on' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'on how' at positions [13, 14]
Found triplet: None
Checking 2-gram: 'h

  1%|          | 70/8811 [00:06<11:56, 12.19it/s]

Checking 2-gram: 'struggling to' at positions [0, 1]
Found triplet: None
Checking 2-gram: 'to cope' at positions [1, 2]
Found triplet: None
Checking 2-gram: 'cope in' at positions [2, 3]
Found triplet: None
Checking 2-gram: 'in a' at positions [3, 4]
Found triplet: None
Checking 2-gram: 'a high' at positions [4, 5]
Found triplet: None
Checking 2-gram: 'high pressure' at positions [5, 6]
Found triplet: ('high pressure', -0.516, 0.358, 0.266)
Checking 2-gram: 'pressure work' at positions [6, 7]
Found triplet: None
Checking 2-gram: 'work environment' at positions [7, 8]
Found triplet: None
Checking 2-gram: 'environment .' at positions [8, 9]
Found triplet: None
Checking 2-gram: '. hi' at positions [9, 10]
Found triplet: None
Checking 2-gram: 'hi guys' at positions [10, 11]
Found triplet: None
Checking 2-gram: 'guys ,' at positions [11, 12]
Found triplet: None
Checking 2-gram: ', has' at positions [12, 13]
Found triplet: None
Checking 2-gram: 'has anyone' at positions [13, 14]
Found triple

  1%|          | 70/8811 [00:06<13:10, 11.05it/s]


KeyboardInterrupt: 

In [None]:
mental_health_severity_table_1.to_csv('./data/BeyondBlue/mental_health_severity_table_1.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_max14days` (`mental_health_severity_table_2`)

In [None]:
mental_health_severity_table_2 = mental_health_severity_table_max14days.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_2[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_2['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x)))
mental_health_severity_table_2[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_2['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x)))
# add VAD scores diff between post and comment
mental_health_severity_table_2['Valence_Diff'] = mental_health_severity_table_2['Comment_Valence'] - mental_health_severity_table_2['Post_Valence']
mental_health_severity_table_2['Arousal_Diff'] = mental_health_severity_table_2['Comment_Arousal'] - mental_health_severity_table_2['Post_Arousal']
mental_health_severity_table_2['Dominance_Diff'] = mental_health_severity_table_2['Comment_Dominance'] - mental_health_severity_table_2['Post_Dominance']
mental_health_severity_table_2.head()

100%|██████████| 8387/8387 [3:17:19<00:00,  1.41s/it]   
100%|██████████| 8387/8387 [1:12:24<00:00,  1.93it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.1145,-0.0034,0.08986,0.241185,-0.050889,0.179852,0.126685,-0.047489,0.089992
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,...,9,0.08281,0.002881,-0.062214,0.207382,-0.059235,0.034176,0.124573,-0.062116,0.096391
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,...,1,0.149161,-0.049231,0.044842,0.129939,-0.065273,-0.015879,-0.019222,-0.016042,-0.060721
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.117859,0.005094,0.033594,0.104344,-0.036,0.051844,-0.013516,-0.041094,0.01825
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,...,2,0.080487,-0.005872,0.044974,0.008595,-0.036451,-0.031768,-0.071892,-0.03058,-0.076742


In [None]:
mental_health_severity_table_2.to_csv('./data/BeyondBlue/mental_health_severity_table_2.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_edited` with only polar subsets of VAD lexicon (`mental_health_severity_table_3`)

In [None]:
mental_health_severity_table_3 = mental_health_severity_table.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_3[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_3['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
mental_health_severity_table_3[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_3['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
mental_health_severity_table_3['Valence_Diff'] = mental_health_severity_table_3['Comment_Valence'] - mental_health_severity_table_3['Post_Valence']
mental_health_severity_table_3['Arousal_Diff'] = mental_health_severity_table_3['Comment_Arousal'] - mental_health_severity_table_3['Post_Arousal']
mental_health_severity_table_3['Dominance_Diff'] = mental_health_severity_table_3['Comment_Dominance'] - mental_health_severity_table_3['Post_Dominance']
mental_health_severity_table_3

  0%|          | 0/8811 [00:00<?, ?it/s]

100%|██████████| 8811/8811 [25:56<00:00,  5.66it/s]  
100%|██████████| 8811/8811 [20:11<00:00,  7.27it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.215000,0.094875,0.457417,0.516000,-0.101833,0.542400,0.301000,-0.196708,0.084983
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,...,1765,0.230077,0.131000,-0.100500,0.057167,0.798000,0.076714,-0.172910,0.667000,0.177214
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,...,161,0.304902,-0.058094,0.111380,0.791889,-0.248667,0.580000,0.486987,-0.190573,0.468620
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.298565,0.101250,0.218429,0.272900,-0.075500,0.191333,-0.025665,-0.176750,-0.027095
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,...,359,0.092662,-0.063250,-0.024265,0.656857,-0.441333,0.512000,0.564195,-0.378083,0.536265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,...,0,0.163661,0.086716,0.118117,0.441727,-0.118000,0.046400,0.278066,-0.204716,-0.071717
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,...,2,0.179409,0.084000,0.061593,0.419286,0.051333,0.280800,0.239877,-0.032667,0.219207
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,...,30,0.222210,-0.178824,-0.040791,0.225152,0.017000,0.258750,0.002942,0.195824,0.299541
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,...,23,0.053207,0.130439,0.061636,0.251277,-0.063269,0.105086,0.198071,-0.193708,0.043450


In [None]:
mental_health_severity_table_3.to_csv('./data/BeyondBlue/mental_health_severity_table_3.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_max14days` with only polar subsets of VAD lexicon (`mental_health_severity_table_4`)

In [None]:
mental_health_severity_table_4 = mental_health_severity_table_max14days.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_4[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_4['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
mental_health_severity_table_4[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_4['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
mental_health_severity_table_4['Valence_Diff'] = mental_health_severity_table_4['Comment_Valence'] - mental_health_severity_table_4['Post_Valence']
mental_health_severity_table_4['Arousal_Diff'] = mental_health_severity_table_4['Comment_Arousal'] - mental_health_severity_table_4['Post_Arousal']
mental_health_severity_table_4['Dominance_Diff'] = mental_health_severity_table_4['Comment_Dominance'] - mental_health_severity_table_4['Post_Dominance']
mental_health_severity_table_4.head()

# Testings

## Generate VAD Scores and Diffs

In [549]:
sample_mental_health_severity_table_1 = mental_health_severity_table.sample(50, random_state=42).copy()
# add VAD scores to mental_health_severity_table
sample_mental_health_severity_table_1[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = sample_mental_health_severity_table_1['Author_Post'].apply(lambda x: pd.Series(vad(x, polar_subset=True)))  
sample_mental_health_severity_table_1[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = sample_mental_health_severity_table_1['Author_Last_Comment'].apply(lambda x: pd.Series(vad(x, polar_subset=True)))
# add VAD scores diff between post and comment
# sample_mental_health_severity_table['Valence_Diff'] = sample_mental_health_severity_table['Comment_Valence'] - sample_mental_health_severity_table['Post_Valence']
# sample_mental_health_severity_table['Arousal_Diff'] = sample_mental_health_severity_table['Comment_Arousal'] - sample_mental_health_severity_table['Post_Arousal']
# sample_mental_health_severity_table['Dominance_Diff'] = sample_mental_health_severity_table['Comment_Dominance'] - sample_mental_health_severity_table['Post_Dominance']
sample_mental_health_severity_table_1

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,Depr-389,Richard,Teenage silence. Looking for advice for dealin...,Thankyou for your support and the fresh perspe...,0,0,0,0,0,0,1,7,0.421652,-0.186611,0.250824,0.514,-0.132545,0.198143
6433,Depr-5359,Dalro45,Something wrong with my head. I know that I su...,lot I just re-read what I wrote and it makes n...,0,0,0,0,0,0,0,0,0.341195,-0.112896,0.070968,0.619667,-0.6185,-0.0475
6358,Depr-5225,Arbutus,Struggling. Here I go-my first post. I have be...,"Thank you all for your kind replies, and I apo...",0,0,2,0,1,0,0,5,0.252878,-0.168132,0.121038,0.213008,-0.114395,0.075488
221,Anxi-515,Harriolo,Overwhelmed with tasks. Does anyone else spend...,Thanks for your response. Nice to see that som...,0,0,0,1,1,0,0,4,0.364224,-0.069143,0.151333,0.605833,-0.010571,0.215833
8225,PTSD-2040,OUT_OF_FIGHT,New to Beyond Blue. I am a 58yr old woman with...,"Hi Neil, Thank you so much for your reply, I b...",0,0,0,0,1,0,0,1,0.127354,-0.127839,0.043524,0.161,-0.197038,0.080339
3934,Depr-287,Supermum,Something needs to change. It is being a long ...,Thankyou to both of you for your replies and f...,0,0,0,0,1,0,2,4,0.136747,0.061923,0.11602,0.229152,-0.068206,0.224659
7171,PTSD-39,Rowen13,Unrequited friendships or low self-esteem?. Ho...,"Hi rx, I think the people we are drawn to, and...",0,0,0,0,4,0,21,26,0.339813,-0.090071,0.254957,0.311455,-0.094875,0.137371
5741,Depr-4187,OU812,"New Girlfriend depression, shuts down. Hi guys...",Thanks guys for the nice comments. I just feel...,0,1,1,0,0,0,0,0,0.299256,-0.035162,0.062818,0.3795,0.029167,0.282182
1374,Anxi-2907,Unsureonlife,"Quit job due to mental health-centreing. Hi, I...","Hi smallwolf, I recently had an and diagnosis ...",0,0,1,1,0,0,0,0,0.453286,0.071143,0.018846,0.366578,-0.060682,0.14025
7754,PTSD-1193,Guest5643,I Am a little on edge today. Hi I Am a little ...,Hi monkeymagic and islandwiz When I said swat ...,0,0,0,1,1,0,1,0,0.120179,-0.02331,-0.121783,0.385143,0.263375,0.235


In [551]:
sample_mental_health_severity_table_2 = mental_health_severity_table.sample(50, random_state=42).copy()
# add VAD scores to mental_health_severity_table
sample_mental_health_severity_table_2[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = sample_mental_health_severity_table_2['Author_Post'].apply(lambda x: pd.Series(vad2(x, polar_subset=True)))  
sample_mental_health_severity_table_2[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = sample_mental_health_severity_table_2['Author_Last_Comment'].apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
# sample_mental_health_severity_table['Valence_Diff'] = sample_mental_health_severity_table['Comment_Valence'] - sample_mental_health_severity_table['Post_Valence']
# sample_mental_health_severity_table['Arousal_Diff'] = sample_mental_health_severity_table['Comment_Arousal'] - sample_mental_health_severity_table['Post_Arousal']
# sample_mental_health_severity_table['Dominance_Diff'] = sample_mental_health_severity_table['Comment_Dominance'] - sample_mental_health_severity_table['Post_Dominance']
sample_mental_health_severity_table_2

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,Depr-389,Richard,Teenage silence. Looking for advice for dealin...,Thankyou for your support and the fresh perspe...,0,0,0,0,0,0,1,7,0.421652,-0.186611,0.250824,0.514,-0.132545,0.198143
6433,Depr-5359,Dalro45,Something wrong with my head. I know that I su...,lot I just re-read what I wrote and it makes n...,0,0,0,0,0,0,0,0,0.341195,-0.112896,0.070968,0.619667,-0.6185,-0.0475
6358,Depr-5225,Arbutus,Struggling. Here I go-my first post. I have be...,"Thank you all for your kind replies, and I apo...",0,0,2,0,1,0,0,5,0.252878,-0.168132,0.121038,0.213008,-0.114395,0.075488
221,Anxi-515,Harriolo,Overwhelmed with tasks. Does anyone else spend...,Thanks for your response. Nice to see that som...,0,0,0,1,1,0,0,4,0.364224,-0.069143,0.151333,0.605833,-0.010571,0.215833
8225,PTSD-2040,OUT_OF_FIGHT,New to Beyond Blue. I am a 58yr old woman with...,"Hi Neil, Thank you so much for your reply, I b...",0,0,0,0,1,0,0,1,0.127354,-0.127839,0.043524,0.161,-0.197038,0.080339
3934,Depr-287,Supermum,Something needs to change. It is being a long ...,Thankyou to both of you for your replies and f...,0,0,0,0,1,0,2,4,0.136747,0.061923,0.11602,0.229152,-0.068206,0.224659
7171,PTSD-39,Rowen13,Unrequited friendships or low self-esteem?. Ho...,"Hi rx, I think the people we are drawn to, and...",0,0,0,0,4,0,21,26,0.339813,-0.090071,0.254957,0.311455,-0.094875,0.137371
5741,Depr-4187,OU812,"New Girlfriend depression, shuts down. Hi guys...",Thanks guys for the nice comments. I just feel...,0,1,1,0,0,0,0,0,0.299256,-0.035162,0.062818,0.3795,0.029167,0.282182
1374,Anxi-2907,Unsureonlife,"Quit job due to mental health-centreing. Hi, I...","Hi smallwolf, I recently had an and diagnosis ...",0,0,1,1,0,0,0,0,0.453286,0.071143,0.018846,0.366578,-0.060682,0.14025
7754,PTSD-1193,Guest5643,I Am a little on edge today. Hi I Am a little ...,Hi monkeymagic and islandwiz When I said swat ...,0,0,0,1,1,0,1,0,0.120179,-0.02331,-0.121783,0.385143,0.263375,0.235


In [553]:
float((sample_mental_health_severity_table_1.iloc[:,12:] == sample_mental_health_severity_table_2.iloc[:,12:]).values.sum()/(len(sample_mental_health_severity_table_1.iloc[:,12:].columns)*len(sample_mental_health_severity_table_1)))

0.99

In [554]:
sample_mental_health_severity_table_1.iloc[:,12:] == sample_mental_health_severity_table_2.iloc[:,12:]

Unnamed: 0,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,True,True,True,True,True,True
6433,True,True,True,True,True,True
6358,True,True,True,True,True,True
221,True,True,True,True,True,True
8225,True,True,True,True,True,True
3934,True,True,True,True,True,True
7171,True,True,True,True,True,True
5741,True,True,True,True,True,True
1374,True,True,True,True,True,True
7754,True,True,True,True,True,True


## Debugging difference between `vad()` and `vad2()`

In [566]:
sample_text = mental_health_severity_table.iloc[5067]['Author_Last_Comment']
sample_text

'Thank you for your response, I really just needed someone to respond and let me know that I am heard, so thank you.'

In [569]:
scores_1 = vad(sample_text, polar_subset=False)
print(scores_1)

{'valence': 0.2265, 'arousal': -0.1199375, 'dominance': 0.1289375}


In [570]:
scores_2 = vad2(sample_text, polar_subset=False)
print(scores_2)

{'valence': 0.2265, 'arousal': -0.1199375, 'dominance': 0.1289375}


In [518]:
len(ngram_2)

113

In [519]:
ngram_1 == ngram_2

True

## Polar Subsets

In [183]:
arousal_polar_from_vad = vad_lexicon[(vad_lexicon['dominance']<=-0.333) | (vad_lexicon['dominance']>=0.333)].copy()
arousal_polar_from_vad

Unnamed: 0,term,valence,arousal,dominance
10,a goner,-0.230,-0.148,-0.354
11,a good,0.666,-0.164,0.388
15,a little,0.072,-0.380,-0.358
27,a sec,-0.310,-0.234,-0.366
29,a single,-0.104,-0.126,-0.394
...,...,...,...,...
54786,zookeeper,0.296,0.625,0.519
54789,zoologist,0.778,0.458,0.667
54795,zoophobia,-0.833,-0.667,-0.630
54797,zorro,0.625,0.667,0.792


In [173]:
arousal_polar = pd.read_csv('./data/vad/valence-polar-NRC-VAD-Lexicon-v2.1.txt', delimiter='\t', header=0)
pd.unique(arousal_polar['valence'])

array([ 1.   ,  0.996,  0.994,  0.991,  0.99 ,  0.986,  0.985,  0.981,
        0.979,  0.976,  0.971,  0.968,  0.966,  0.963,  0.962,  0.96 ,
        0.958,  0.952,  0.949,  0.948,  0.945,  0.944,  0.942,  0.94 ,
        0.939,  0.938,  0.936,  0.934,  0.933,  0.932,  0.928,  0.927,
        0.926,  0.922,  0.92 ,  0.918,  0.917,  0.916,  0.912,  0.91 ,
        0.908,  0.906,  0.905,  0.904,  0.902,  0.9  ,  0.899,  0.898,
        0.896,  0.894,  0.893,  0.892,  0.891,  0.89 ,  0.889,  0.888,
        0.886,  0.884,  0.88 ,  0.878,  0.876,  0.875,  0.874,  0.873,
        0.872,  0.87 ,  0.868,  0.867,  0.866,  0.864,  0.862,  0.86 ,
        0.858,  0.857,  0.854,  0.852,  0.85 ,  0.846,  0.844,  0.84 ,
        0.838,  0.836,  0.834,  0.833,  0.832,  0.83 ,  0.828,  0.826,
        0.824,  0.822,  0.82 ,  0.818,  0.816,  0.815,  0.814,  0.812,
        0.81 ,  0.809,  0.808,  0.806,  0.804,  0.802,  0.8  ,  0.798,
        0.796,  0.794,  0.792,  0.79 ,  0.788,  0.786,  0.784,  0.78 ,
      

In [167]:
set(arousal_polar['term']) - set(arousal_polar_from_vad['term'])

{'addressbook',
 'alarm clock',
 'babycarriage',
 'biggestfear',
 'cantbreathe',
 'doubleclick',
 'dryclean',
 'farmhouse',
 'fishinghook',
 'fishingpole',
 'fishtank',
 'freakingout',
 'fuckedoff',
 'fucksake',
 'fuckyeah',
 'growthefuckup',
 'hairdress',
 'hand grenade',
 'hangup',
 'happynewyear',
 'happyvalentinesday',
 'hateeveryone',
 'hateyou',
 'horsetrading',
 'ironingboard',
 'jet ski',
 'jump rope',
 'lazyday',
 'lifevest',
 'littlethings',
 'lovebite',
 'lovemylife',
 'luckygirl',
 'mailserver',
 'mindmap',
 'mixedemotions',
 nan,
 'newsweek',
 'nocomplaints',
 'nopoint',
 'nothappy',
 'nowork',
 'oldfashioned',
 'oneday',
 'palmtree',
 'peaceofmind',
 'pencilsharpener',
 'pieceofshit',
 'roadsweeper',
 'rock band',
 'rollingpin',
 'safetypin',
 'slot machine',
 'sohappy',
 'soslow',
 'spinningwheel',
 'stressfree',
 'stupidpeople',
 'subheader',
 'sweetwater',
 'taperecorder',
 'tearsofjoy',
 'thankyou',
 'thelittlethings',
 'tiredofit',
 'toomuchtodo',
 'traindriver',
 'u

## N-gram positions

In [177]:
name_grams_positions = list(ngrams(range(len(word_tokenize("How are you today?"))), 2))
name_grams_positions = [set(pos) for pos in name_grams_positions]
name_grams_positions

[{0, 1}, {1, 2}, {2, 3}, {3, 4}]