# Libraries

In [1]:
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from itertools import islice
from tqdm import tqdm
tqdm.pandas()

# Read Data

## Lexicon

In [2]:
vad_lexicon = pd.read_csv('./data/vad/NRC-VAD-Lexicon-v2.1-edited.txt', sep='\t', header=0, keep_default_na=False, na_values=[])

In [3]:
# filter vad_lexicon to only include rows where there are duplicate 'term' values
vad_lexicon_dup = vad_lexicon[vad_lexicon['term'].isin(vad_lexicon['term'][vad_lexicon['term'].duplicated()])]
vad_lexicon_dup


Unnamed: 0,term,valence,arousal,dominance


### 5-gram lexicons

In [4]:
# filter only 5-grams lexicons
five_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 5].copy()

### 4-gram lexicons

In [5]:
# filter only 4-grams lexicons
four_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 4].copy()

### 3-gram lexicons

In [6]:
# filter only 3-grams lexicons
three_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 3].copy()

### 2-gram lexicons

In [7]:
# filter only 2-grams lexicons
two_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 2].copy()

### 1-gram lexicons

In [8]:
# filter only 1-gram lexicons
one_gram_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 1].copy()

## Mental Health Severity Data

### Unbounded

In [9]:
mental_health_severity_table = pd.read_csv('./data/BeyondBlue/commented_post_authors_edited.csv', header=0)

In [10]:
mental_health_severity_table

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,1,1765
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,0,161
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,0,359
...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,0,0
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,0,2
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,0,30
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,1,23


### Bounded (14 days)

In [11]:
mental_health_severity_table_max14days = pd.read_csv('./data/BeyondBlue/commented_post_authors_max14days.csv', header=0)

In [12]:
mental_health_severity_table_max14days.head()

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,0,9
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,0,1
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,0,2


# Main

## Check the Maximum N-gram in 'term'

In [14]:
# check the maximum n-gram in 'term'
max(vad_lexicon['term'].apply(lambda x: len(x.split(' '))))

5

## `vad()`

In [16]:
def vad(
        text, 
        five_grams_lexicon=five_grams_lexicon, 
        four_grams_lexicon=four_grams_lexicon, 
        three_grams_lexicon=three_grams_lexicon, 
        two_grams_lexicon=two_grams_lexicon, 
        one_gram_lexicon=one_gram_lexicon,
        polar_subset=False):
    """
    Calculate Valence, Arousal, and Dominance (VAD) scores for a given text using the NRC VAD Lexicon.

    Parameters:
    text (str): The input text to analyze.
    lexicon (pd.DataFrame): The VAD lexicon DataFrame with columns 'term', 'valence', 'arousal', 'dominance'.

    Returns:
    dict: A dictionary with average 'valence', 'arousal', and 'dominance' scores.
    """
    # Preprocess the text: lowercase and split into words
    words = word_tokenize(text.lower())
    
    # Initialize lists to store VAD scores
    valence_scores = []
    arousal_scores = []
    dominance_scores = []

    # initialize list to store matched n-grams and set to store their positions
    matched_ngrams = []
    matched_ngrams_arousal = []
    matched_ngrams_dominance = []
    matched_ngrams_valence = []
    matched_positions = set()
    matched_positions_arousal = set()
    matched_positions_dominance = set()
    matched_positions_valence = set()

    ## 5-grams
    # check for 5-grams of text in lexicon
    five_grams = list(ngrams(words, 5))
    five_grams = [' '.join(gram) for gram in five_grams]

    # record the positions of words that are part of matched 5-grams
    five_grams_positions = list(ngrams(range(len(words)), 5))
    five_grams_positions = [set(pos) for pos in five_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        five_grams_lexicon_arousal = five_grams_lexicon[(five_grams_lexicon['arousal']<=-0.333) | (five_grams_lexicon['arousal']>=0.333)].copy()
        five_grams_lexicon_dominance = five_grams_lexicon[(five_grams_lexicon['dominance']<=-0.333) | (five_grams_lexicon['dominance']>=0.333)].copy()
        five_grams_lexicon_valence = five_grams_lexicon[(five_grams_lexicon['valence']<=-0.333) | (five_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 5-gram when polar_subset is True
        matched_positions_arousal_5gram = set()
        matched_positions_dominance_5gram = set()
        matched_positions_valence_5gram = set()
    else:
        matched_positions_5gram = set()

    # check for matches in 5-grams
    for gram, pos in zip(five_grams, five_grams_positions):
        if polar_subset:
            match_arousal = five_grams_lexicon_arousal[five_grams_lexicon_arousal['term'] == gram]
            match_dominance = five_grams_lexicon_dominance[five_grams_lexicon_dominance['term'] == gram]
            match_valence = five_grams_lexicon_valence[five_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_5gram.update(pos)
            if not match_arousal.empty:
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_5gram.update(pos)
            if not match_dominance.empty:
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_5gram.update(pos)
        else:
            match = five_grams_lexicon[five_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_5gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 5-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_5gram)
        matched_positions_arousal.update(matched_positions_arousal_5gram)
        matched_positions_dominance.update(matched_positions_dominance_5gram)
    else:
        matched_positions.update(matched_positions_5gram)

    ## 4-grams
    # check for 4-grams of text in lexicon
    four_grams = list(ngrams(words, 4))
    four_grams = [' '.join(gram) for gram in four_grams]

    # record the positions of words that are part of matched 4-grams
    four_grams_positions = list(ngrams(range(len(words)), 4))
    four_grams_positions = [set(pos) for pos in four_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        four_grams_lexicon_arousal = four_grams_lexicon[(four_grams_lexicon['arousal']<=-0.333) | (four_grams_lexicon['arousal']>=0.333)].copy()
        four_grams_lexicon_dominance = four_grams_lexicon[(four_grams_lexicon['dominance']<=-0.333) | (four_grams_lexicon['dominance']>=0.333)].copy()
        four_grams_lexicon_valence = four_grams_lexicon[(four_grams_lexicon['valence']<=-0.333) | (four_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 4-gram when polar_subset is True
        matched_positions_arousal_4gram = set()
        matched_positions_dominance_4gram = set()
        matched_positions_valence_4gram = set()
    else:
        matched_positions_4gram = set()

    # check for matches in 4-grams
    for gram, pos in zip(four_grams, four_grams_positions):
        if polar_subset:
            match_arousal = four_grams_lexicon_arousal[four_grams_lexicon_arousal['term'] == gram]
            match_dominance = four_grams_lexicon_dominance[four_grams_lexicon_dominance['term'] == gram]
            match_valence = four_grams_lexicon_valence[four_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_4gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_4gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 4-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_4gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 4-gram is part of a matched n-gram
            match = four_grams_lexicon[four_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_4gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 4-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_4gram)
        matched_positions_arousal.update(matched_positions_arousal_4gram)
        matched_positions_dominance.update(matched_positions_dominance_4gram)
    else:
        matched_positions.update(matched_positions_4gram)
    
    ## 3-grams
    # check for 3-grams of text in lexicon
    three_grams = list(ngrams(words, 3))
    three_grams = [' '.join(gram) for gram in three_grams]

    # record the positions of words that are part of matched 3-grams
    three_grams_positions = list(ngrams(range(len(words)), 3))
    three_grams_positions = [set(pos) for pos in three_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        three_grams_lexicon_arousal = three_grams_lexicon[(three_grams_lexicon['arousal']<=-0.333) | (three_grams_lexicon['arousal']>=0.333)].copy()
        three_grams_lexicon_dominance = three_grams_lexicon[(three_grams_lexicon['dominance']<=-0.333) | (three_grams_lexicon['dominance']>=0.333)].copy()
        three_grams_lexicon_valence = three_grams_lexicon[(three_grams_lexicon['valence']<=-0.333) | (three_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 3-gram when polar_subset is True
        matched_positions_arousal_3gram = set()
        matched_positions_dominance_3gram = set()
        matched_positions_valence_3gram = set()
    else:
        matched_positions_3gram = set()

    # check for matches in 3-grams
    for gram, pos in zip(three_grams, three_grams_positions):
        if polar_subset:
            match_arousal = three_grams_lexicon_arousal[three_grams_lexicon_arousal['term'] == gram]
            match_dominance = three_grams_lexicon_dominance[three_grams_lexicon_dominance['term'] == gram]
            match_valence = three_grams_lexicon_valence[three_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_3gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_3gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 3-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_3gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 3-gram is part of a matched n-gram
            match = three_grams_lexicon[three_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_3gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 3-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_3gram)
        matched_positions_arousal.update(matched_positions_arousal_3gram)
        matched_positions_dominance.update(matched_positions_dominance_3gram)
    else:
        matched_positions.update(matched_positions_3gram)
    
    ## 2-grams
    # check for 2-grams of text in lexicon, excluding words already part of matched 3-grams
    two_grams = list(ngrams(words, 2))
    two_grams = [' '.join(gram) for gram in two_grams]

    # record the positions of words that are part of matched 2-grams
    two_grams_positions = list(ngrams(range(len(words)), 2))
    two_grams_positions = [set(pos) for pos in two_grams_positions]

    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        two_grams_lexicon_arousal = two_grams_lexicon[(two_grams_lexicon['arousal']<=-0.333) | (two_grams_lexicon['arousal']>=0.333)].copy()
        two_grams_lexicon_dominance = two_grams_lexicon[(two_grams_lexicon['dominance']<=-0.333) | (two_grams_lexicon['dominance']>=0.333)].copy()
        two_grams_lexicon_valence = two_grams_lexicon[(two_grams_lexicon['valence']<=-0.333) | (two_grams_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 2-gram when polar_subset is True
        matched_positions_arousal_2gram = set()
        matched_positions_dominance_2gram = set()
        matched_positions_valence_2gram = set()
    else:
        matched_positions_2gram = set()

    # check for matches in 2-grams
    for gram, pos in zip(two_grams, two_grams_positions):        
        if polar_subset:
            match_arousal = two_grams_lexicon_arousal[two_grams_lexicon_arousal['term'] == gram]
            match_dominance = two_grams_lexicon_dominance[two_grams_lexicon_dominance['term'] == gram]
            match_valence = two_grams_lexicon_valence[two_grams_lexicon_valence['term'] == gram]
            if not match_valence.empty:
                if pos in matched_positions_valence:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(gram)
                matched_positions_valence_2gram.update(pos)
            if not match_arousal.empty:
                if pos in matched_positions_arousal:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(gram)
                matched_positions_arousal_2gram.update(pos)
            if not match_dominance.empty:
                if pos in matched_positions_dominance:
                    continue  # Skip if any word in the 2-gram is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(gram)
                matched_positions_dominance_2gram.update(pos)
        else:
            if not matched_positions.isdisjoint(pos):
                continue  # Skip if any word in the 2-gram is part of a matched n-gram
            match = two_grams_lexicon[two_grams_lexicon['term'] == gram]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(gram)
                matched_positions_2gram.update(pos)
    # update the main matched positions sets with the temporary sets for this 2-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_2gram)
        matched_positions_arousal.update(matched_positions_arousal_2gram)
        matched_positions_dominance.update(matched_positions_dominance_2gram)
    else:
        matched_positions.update(matched_positions_2gram)

    ## 1-grams
    if polar_subset:
        # filter lexicon to only polar subsets if polar_subset is True
        one_gram_lexicon_arousal = one_gram_lexicon[(one_gram_lexicon['arousal']<=-0.333) | (one_gram_lexicon['arousal']>=0.333)].copy()
        one_gram_lexicon_dominance = one_gram_lexicon[(one_gram_lexicon['dominance']<=-0.333) | (one_gram_lexicon['dominance']>=0.333)].copy()
        one_gram_lexicon_valence = one_gram_lexicon[(one_gram_lexicon['valence']<=-0.333) | (one_gram_lexicon['valence']>=0.333)].copy()
        # temporary sets to track matched positions for each dimension for this 1-gram when polar_subset is True
        matched_positions_arousal_1gram = set()
        matched_positions_dominance_1gram = set()
        matched_positions_valence_1gram = set()
    else:
        matched_positions_1gram = set()

    # check for unigrams of text in lexicon, excluding words already part of matched n-grams
    for i, word in enumerate(words):        
        if polar_subset:
            match_arousal = one_gram_lexicon_arousal[one_gram_lexicon_arousal['term'] == word]
            match_dominance = one_gram_lexicon_dominance[one_gram_lexicon_dominance['term'] == word]
            match_valence = one_gram_lexicon_valence[one_gram_lexicon_valence['term'] == word]
            if not match_valence.empty:
                if i in matched_positions_valence:
                    continue  # Skip if the word is part of a matched n-gram
                valence_scores.append(float(match_valence['valence'].values[0]))
                matched_ngrams_valence.append(word)
                matched_positions_valence.add(i)
            if not match_arousal.empty:
                if i in matched_positions_arousal:
                    continue  # Skip if the word is part of a matched n-gram
                arousal_scores.append(float(match_arousal['arousal'].values[0]))
                matched_ngrams_arousal.append(word)
                matched_positions_arousal.add(i)
            if not match_dominance.empty:
                if i in matched_positions_dominance:
                    continue  # Skip if the word is part of a matched n-gram
                dominance_scores.append(float(match_dominance['dominance'].values[0]))
                matched_ngrams_dominance.append(word)
                matched_positions_dominance.add(i)
        else:
            if i in matched_positions:
                continue  # Skip if the word is part of a matched n-gram
            match = one_gram_lexicon[one_gram_lexicon['term'] == word]
            if not match.empty:
                valence_scores.append(float(match['valence'].values[0]))
                arousal_scores.append(float(match['arousal'].values[0]))
                dominance_scores.append(float(match['dominance'].values[0]))
                matched_ngrams.append(word)
                matched_positions.add(i)
                
    # update the main matched positions sets with the temporary sets for this 1-gram when polar_subset is True
    if polar_subset:
        matched_positions_valence.update(matched_positions_valence_1gram)
        matched_positions_arousal.update(matched_positions_arousal_1gram)
        matched_positions_dominance.update(matched_positions_dominance_1gram)
    else:
        matched_positions.update(matched_positions_1gram)

    # Calculate average scores, return None if no scores found
    avg_valence = sum(valence_scores) / len(valence_scores) if valence_scores else None
    avg_arousal = sum(arousal_scores) / len(arousal_scores) if arousal_scores else None
    avg_dominance = sum(dominance_scores) / len(dominance_scores) if dominance_scores else None
    
    return {
        'valence': avg_valence,
        'arousal': avg_arousal,
        'dominance': avg_dominance
    }
    # return matched_ngrams_valence, valence_scores

## `vad2()`

a more efficient vad()

In [19]:
POLAR_THRESH = 0.333  # abs(value) >= threshold is considered polar

def _build_index(df):
    """
    Convert a lexicon DataFrame to a dict: term -> (valence, arousal, dominance).
    Assumes columns: ['term','valence','arousal','dominance'].
    """
    # normalize once to lowercase to match tokenization step
    # if your 'term' is already lowercase, this is still harmless
    return {str(t).lower(): (str(t).lower(), float(v), float(a), float(d))
            for t, v, a, d in zip(df['term'], df['valence'], df['arousal'], df['dominance'])}

def vad2(
    text,
    five_grams_lexicon=five_grams_lexicon,
    four_grams_lexicon=four_grams_lexicon,
    three_grams_lexicon=three_grams_lexicon,
    two_grams_lexicon=two_grams_lexicon,
    one_gram_lexicon=one_gram_lexicon,
    polar_subset=False,
    tokenizer=word_tokenize,
):
    """
    Compute average VAD with longest-match wins across 5→1 grams.
    If polar_subset=True, only include a dimension's score when that dimension is polar for the term.
    """
    # 1) Tokenize once
    words = tokenizer(text.lower())

    # 2) Build fast lookup dicts once (O(n) upfront, O(1) per lookup after)
    idx = {
        5: _build_index(five_grams_lexicon),
        4: _build_index(four_grams_lexicon),
        3: _build_index(three_grams_lexicon),
        2: _build_index(two_grams_lexicon),
        1: _build_index(one_gram_lexicon),
    }

    valence_scores, arousal_scores, dominance_scores = [], [], []
    matched_ngrams, matched_ngrams_valence, matched_ngrams_arousal, matched_ngrams_dominance = [], [], [], []
    if polar_subset:
        matched_positions_valence = set()
        matched_positions_arousal = set()
        matched_positions_dominance = set()
    else:
        matched_positions = set()

    # 3) Sweep n-grams from longest to shortest
    N = len(words)
    for n in (5, 4, 3, 2, 1):
        if N < n:
            continue
        lex = idx[n]
        if polar_subset:
            n_gram_matched_positions_valence = set()
            n_gram_matched_positions_arousal = set()
            n_gram_matched_positions_dominance = set()
        else:
            n_gram_matched_positions = set()
        # slide a window of length n
        for start in range(0, N - n + 1):
            span = range(start, start + n)
            # skip if any token already consumed by a longer match
            if (not polar_subset) and matched_positions.intersection(span):
                continue
            term = ' '.join(words[start:start + n])
            triplet = lex.get(term)
            if triplet is not None:
                t, v, a, d = triplet
                if not polar_subset:
                    valence_scores.append(v)
                    arousal_scores.append(a)
                    dominance_scores.append(d)
                    matched_ngrams.append(t)
                    n_gram_matched_positions.update(span)
                else:
                    if abs(v) >= POLAR_THRESH:
                        if matched_positions_valence.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        valence_scores.append(v)
                        matched_ngrams_valence.append(t)
                        n_gram_matched_positions_valence.update(span)
                    if abs(a) >= POLAR_THRESH:
                        if matched_positions_arousal.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        arousal_scores.append(a)
                        matched_ngrams_arousal.append(t)
                        n_gram_matched_positions_arousal.update(span)
                    if abs(d) >= POLAR_THRESH:
                        if matched_positions_dominance.intersection(span):
                            continue  # Skip if any word in the n-gram is part of a matched n-gram
                        dominance_scores.append(d)
                        matched_ngrams_dominance.append(t)
                        n_gram_matched_positions_dominance.update(span)
        # update matched positions
        if polar_subset:
            matched_positions_valence.update(n_gram_matched_positions_valence)
            matched_positions_arousal.update(n_gram_matched_positions_arousal)
            matched_positions_dominance.update(n_gram_matched_positions_dominance)
        else:
            matched_positions.update(n_gram_matched_positions)

    # 4) Averages (None if empty)
    avg_valence   = (sum(valence_scores) / len(valence_scores)) if valence_scores else None
    avg_arousal   = (sum(arousal_scores) / len(arousal_scores)) if arousal_scores else None
    avg_dominance = (sum(dominance_scores) / len(dominance_scores)) if dominance_scores else None

    return {
        'valence': avg_valence,
        'arousal': avg_arousal,
        'dominance': avg_dominance,
    }
    # return matched_ngrams_valence, valence_scores

## Add VAD Scores and Diffs to `commented_post_authors_edited` (`mental_health_severity_table_1`)

This adds the VAD scores and the difference between post and comment VAD to `commented_post_authors_edited.csv` data. This will then be saved to `mentalh_healt_severity_table_1.csv`

In [20]:
mental_health_severity_table_1 = mental_health_severity_table.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_1[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_1['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x)))
mental_health_severity_table_1[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_1['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x)))
# add VAD scores diff between post and comment
mental_health_severity_table_1['Valence_Diff'] = mental_health_severity_table_1['Comment_Valence'] - mental_health_severity_table_1['Post_Valence']
mental_health_severity_table_1['Arousal_Diff'] = mental_health_severity_table_1['Comment_Arousal'] - mental_health_severity_table_1['Post_Arousal']
mental_health_severity_table_1['Dominance_Diff'] = mental_health_severity_table_1['Comment_Dominance'] - mental_health_severity_table_1['Post_Dominance']
mental_health_severity_table_1.head()

100%|██████████| 8811/8811 [31:49<00:00,  4.61it/s]  
100%|██████████| 8811/8811 [23:22<00:00,  6.28it/s] 


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.1145,-0.0034,0.08986,0.241185,-0.050889,0.179852,0.126685,-0.047489,0.089992
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,...,1765,0.08281,0.002881,-0.062214,0.039143,0.060333,0.016333,-0.043667,0.057452,0.078548
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,...,161,0.149356,-0.048927,0.044945,0.461118,-0.055529,0.184,0.311762,-0.006602,0.139055
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.117859,0.005094,0.033594,0.104344,-0.036,0.051844,-0.013516,-0.041094,0.01825
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,...,359,0.06303,-0.038095,-0.001982,0.3275,-0.11675,0.189188,0.26447,-0.078655,0.19117


In [21]:
mental_health_severity_table_1.to_csv('./data/BeyondBlue/mental_health_severity_table_1.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_max14days` (`mental_health_severity_table_2`)

In [22]:
mental_health_severity_table_2 = mental_health_severity_table_max14days.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_2[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_2['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x)))
mental_health_severity_table_2[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_2['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x)))
# add VAD scores diff between post and comment
mental_health_severity_table_2['Valence_Diff'] = mental_health_severity_table_2['Comment_Valence'] - mental_health_severity_table_2['Post_Valence']
mental_health_severity_table_2['Arousal_Diff'] = mental_health_severity_table_2['Comment_Arousal'] - mental_health_severity_table_2['Post_Arousal']
mental_health_severity_table_2['Dominance_Diff'] = mental_health_severity_table_2['Comment_Dominance'] - mental_health_severity_table_2['Post_Dominance']
mental_health_severity_table_2.head()

100%|██████████| 8387/8387 [13:37<00:00, 10.26it/s]
100%|██████████| 8387/8387 [08:12<00:00, 17.04it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.1145,-0.0034,0.08986,0.241185,-0.050889,0.179852,0.126685,-0.047489,0.089992
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,...,9,0.08281,0.002881,-0.062214,0.201676,-0.067118,0.033882,0.118867,-0.069999,0.096097
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,...,1,0.149356,-0.048927,0.044945,0.127242,-0.080727,-0.036091,-0.022113,-0.0318,-0.081036
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.117859,0.005094,0.033594,0.104344,-0.036,0.051844,-0.013516,-0.041094,0.01825
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,...,2,0.080487,-0.005872,0.044974,0.009374,-0.035979,-0.032937,-0.071113,-0.030107,-0.077911


In [23]:
mental_health_severity_table_2.to_csv('./data/BeyondBlue/mental_health_severity_table_2.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_edited` with only polar subsets of VAD lexicon (`mental_health_severity_table_3`)

In [24]:
mental_health_severity_table_3 = mental_health_severity_table.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_3[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_3['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
mental_health_severity_table_3[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_3['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
mental_health_severity_table_3['Valence_Diff'] = mental_health_severity_table_3['Comment_Valence'] - mental_health_severity_table_3['Post_Valence']
mental_health_severity_table_3['Arousal_Diff'] = mental_health_severity_table_3['Comment_Arousal'] - mental_health_severity_table_3['Post_Arousal']
mental_health_severity_table_3['Dominance_Diff'] = mental_health_severity_table_3['Comment_Dominance'] - mental_health_severity_table_3['Post_Dominance']
mental_health_severity_table_3

100%|██████████| 8811/8811 [08:35<00:00, 17.11it/s]
100%|██████████| 8811/8811 [08:34<00:00, 17.14it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.227458,-0.024700,0.473538,0.517231,0.033250,0.392889,0.289772,0.057950,-0.080650
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,...,1765,0.230077,0.091357,-0.100500,0.057167,0.798000,0.076714,-0.172910,0.706643,0.177214
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,...,161,0.307597,-0.081355,0.087162,0.779300,-0.248667,0.541600,0.471703,-0.167312,0.454438
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.306125,0.052682,0.231867,0.197000,-0.075500,0.191333,-0.109125,-0.128182,-0.040533
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,...,359,0.100787,-0.113075,0.000639,0.663714,-0.480667,0.502000,0.562928,-0.367592,0.501361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,...,0,0.187561,0.069371,0.133837,0.453923,-0.118000,-0.061333,0.266362,-0.187371,-0.195171
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,...,2,0.210208,0.085833,0.094030,0.426875,-0.016571,0.330000,0.216667,-0.102405,0.235970
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,...,30,0.244471,-0.162842,0.002255,0.288045,0.084300,0.240900,0.043574,0.247142,0.238645
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,...,23,0.065969,0.126730,0.061049,0.252422,-0.032586,0.100229,0.186453,-0.159315,0.039180


In [25]:
mental_health_severity_table_3.to_csv('./data/BeyondBlue/mental_health_severity_table_3.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_max14days` with only polar subsets of VAD lexicon (`mental_health_severity_table_4`)

In [26]:
mental_health_severity_table_4 = mental_health_severity_table_max14days.copy()
# add VAD scores to mental_health_severity_table
mental_health_severity_table_4[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table_4['Author_Post'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
mental_health_severity_table_4[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table_4['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
mental_health_severity_table_4['Valence_Diff'] = mental_health_severity_table_4['Comment_Valence'] - mental_health_severity_table_4['Post_Valence']
mental_health_severity_table_4['Arousal_Diff'] = mental_health_severity_table_4['Comment_Arousal'] - mental_health_severity_table_4['Post_Arousal']
mental_health_severity_table_4['Dominance_Diff'] = mental_health_severity_table_4['Comment_Dominance'] - mental_health_severity_table_4['Post_Dominance']
mental_health_severity_table_4.head()

100%|██████████| 8387/8387 [09:04<00:00, 15.40it/s]
100%|██████████| 8387/8387 [08:02<00:00, 17.40it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.227458,-0.0247,0.473538,0.517231,0.03325,0.392889,0.289772,0.05795,-0.08065
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,...,9,0.230077,0.091357,-0.1005,0.547909,-0.14,0.0146,0.317832,-0.231357,0.1151
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,...,1,0.307597,-0.081355,0.087162,0.4056,-0.133778,-0.101667,0.098003,-0.052423,-0.188829
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.306125,0.052682,0.231867,0.197,-0.0755,0.191333,-0.109125,-0.128182,-0.040533
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,...,2,0.18175,0.210286,0.114167,0.017891,0.036492,-0.079042,-0.163859,-0.173793,-0.193208


In [27]:
mental_health_severity_table_4.to_csv('./data/BeyondBlue/mental_health_severity_table_4.csv', index=False)

# Testings

## Generate VAD Scores and Diffs

In [549]:
sample_mental_health_severity_table_1 = mental_health_severity_table.sample(50, random_state=42).copy()
# add VAD scores to mental_health_severity_table
sample_mental_health_severity_table_1[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = sample_mental_health_severity_table_1['Author_Post'].apply(lambda x: pd.Series(vad(x, polar_subset=True)))  
sample_mental_health_severity_table_1[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = sample_mental_health_severity_table_1['Author_Last_Comment'].apply(lambda x: pd.Series(vad(x, polar_subset=True)))
# add VAD scores diff between post and comment
# sample_mental_health_severity_table['Valence_Diff'] = sample_mental_health_severity_table['Comment_Valence'] - sample_mental_health_severity_table['Post_Valence']
# sample_mental_health_severity_table['Arousal_Diff'] = sample_mental_health_severity_table['Comment_Arousal'] - sample_mental_health_severity_table['Post_Arousal']
# sample_mental_health_severity_table['Dominance_Diff'] = sample_mental_health_severity_table['Comment_Dominance'] - sample_mental_health_severity_table['Post_Dominance']
sample_mental_health_severity_table_1

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,Depr-389,Richard,Teenage silence. Looking for advice for dealin...,Thankyou for your support and the fresh perspe...,0,0,0,0,0,0,1,7,0.421652,-0.186611,0.250824,0.514,-0.132545,0.198143
6433,Depr-5359,Dalro45,Something wrong with my head. I know that I su...,lot I just re-read what I wrote and it makes n...,0,0,0,0,0,0,0,0,0.341195,-0.112896,0.070968,0.619667,-0.6185,-0.0475
6358,Depr-5225,Arbutus,Struggling. Here I go-my first post. I have be...,"Thank you all for your kind replies, and I apo...",0,0,2,0,1,0,0,5,0.252878,-0.168132,0.121038,0.213008,-0.114395,0.075488
221,Anxi-515,Harriolo,Overwhelmed with tasks. Does anyone else spend...,Thanks for your response. Nice to see that som...,0,0,0,1,1,0,0,4,0.364224,-0.069143,0.151333,0.605833,-0.010571,0.215833
8225,PTSD-2040,OUT_OF_FIGHT,New to Beyond Blue. I am a 58yr old woman with...,"Hi Neil, Thank you so much for your reply, I b...",0,0,0,0,1,0,0,1,0.127354,-0.127839,0.043524,0.161,-0.197038,0.080339
3934,Depr-287,Supermum,Something needs to change. It is being a long ...,Thankyou to both of you for your replies and f...,0,0,0,0,1,0,2,4,0.136747,0.061923,0.11602,0.229152,-0.068206,0.224659
7171,PTSD-39,Rowen13,Unrequited friendships or low self-esteem?. Ho...,"Hi rx, I think the people we are drawn to, and...",0,0,0,0,4,0,21,26,0.339813,-0.090071,0.254957,0.311455,-0.094875,0.137371
5741,Depr-4187,OU812,"New Girlfriend depression, shuts down. Hi guys...",Thanks guys for the nice comments. I just feel...,0,1,1,0,0,0,0,0,0.299256,-0.035162,0.062818,0.3795,0.029167,0.282182
1374,Anxi-2907,Unsureonlife,"Quit job due to mental health-centreing. Hi, I...","Hi smallwolf, I recently had an and diagnosis ...",0,0,1,1,0,0,0,0,0.453286,0.071143,0.018846,0.366578,-0.060682,0.14025
7754,PTSD-1193,Guest5643,I Am a little on edge today. Hi I Am a little ...,Hi monkeymagic and islandwiz When I said swat ...,0,0,0,1,1,0,1,0,0.120179,-0.02331,-0.121783,0.385143,0.263375,0.235


In [571]:
sample_mental_health_severity_table_2 = mental_health_severity_table.sample(50, random_state=42).copy()
# add VAD scores to mental_health_severity_table
sample_mental_health_severity_table_2[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = sample_mental_health_severity_table_2['Author_Post'].apply(lambda x: pd.Series(vad2(x, polar_subset=True)))  
sample_mental_health_severity_table_2[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = sample_mental_health_severity_table_2['Author_Last_Comment'].apply(lambda x: pd.Series(vad2(x, polar_subset=True)))
# add VAD scores diff between post and comment
# sample_mental_health_severity_table['Valence_Diff'] = sample_mental_health_severity_table['Comment_Valence'] - sample_mental_health_severity_table['Post_Valence']
# sample_mental_health_severity_table['Arousal_Diff'] = sample_mental_health_severity_table['Comment_Arousal'] - sample_mental_health_severity_table['Post_Arousal']
# sample_mental_health_severity_table['Dominance_Diff'] = sample_mental_health_severity_table['Comment_Dominance'] - sample_mental_health_severity_table['Post_Dominance']
sample_mental_health_severity_table_2.head()

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,Depr-389,Richard,Teenage silence. Looking for advice for dealin...,Thankyou for your support and the fresh perspe...,0,0,0,0,0,0,1,7,0.421652,-0.186611,0.250824,0.514,-0.132545,0.198143
6433,Depr-5359,Dalro45,Something wrong with my head. I know that I su...,lot I just re-read what I wrote and it makes n...,0,0,0,0,0,0,0,0,0.341195,-0.112896,0.070968,0.619667,-0.6185,-0.0475
6358,Depr-5225,Arbutus,Struggling. Here I go-my first post. I have be...,"Thank you all for your kind replies, and I apo...",0,0,2,0,1,0,0,5,0.252878,-0.168132,0.121038,0.213008,-0.114395,0.075488
221,Anxi-515,Harriolo,Overwhelmed with tasks. Does anyone else spend...,Thanks for your response. Nice to see that som...,0,0,0,1,1,0,0,4,0.364224,-0.069143,0.151333,0.605833,-0.010571,0.215833
8225,PTSD-2040,OUT_OF_FIGHT,New to Beyond Blue. I am a 58yr old woman with...,"Hi Neil, Thank you so much for your reply, I b...",0,0,0,0,1,0,0,1,0.127354,-0.127839,0.043524,0.161,-0.197038,0.080339


In [553]:
float((sample_mental_health_severity_table_1.iloc[:,12:] == sample_mental_health_severity_table_2.iloc[:,12:]).values.sum()/(len(sample_mental_health_severity_table_1.iloc[:,12:].columns)*len(sample_mental_health_severity_table_1)))

0.99

In [554]:
sample_mental_health_severity_table_1.iloc[:,12:] == sample_mental_health_severity_table_2.iloc[:,12:]

Unnamed: 0,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance
3969,True,True,True,True,True,True
6433,True,True,True,True,True,True
6358,True,True,True,True,True,True
221,True,True,True,True,True,True
8225,True,True,True,True,True,True
3934,True,True,True,True,True,True
7171,True,True,True,True,True,True
5741,True,True,True,True,True,True
1374,True,True,True,True,True,True
7754,True,True,True,True,True,True


## Debugging difference between `vad()` and `vad2()`

In [566]:
sample_text = mental_health_severity_table.iloc[5067]['Author_Last_Comment']
sample_text

'Thank you for your response, I really just needed someone to respond and let me know that I am heard, so thank you.'

In [569]:
scores_1 = vad(sample_text, polar_subset=False)
print(scores_1)

{'valence': 0.2265, 'arousal': -0.1199375, 'dominance': 0.1289375}


In [570]:
scores_2 = vad2(sample_text, polar_subset=False)
print(scores_2)

{'valence': 0.2265, 'arousal': -0.1199375, 'dominance': 0.1289375}


## Polar Subsets

In [183]:
arousal_polar_from_vad = vad_lexicon[(vad_lexicon['dominance']<=-0.333) | (vad_lexicon['dominance']>=0.333)].copy()
arousal_polar_from_vad

Unnamed: 0,term,valence,arousal,dominance
10,a goner,-0.230,-0.148,-0.354
11,a good,0.666,-0.164,0.388
15,a little,0.072,-0.380,-0.358
27,a sec,-0.310,-0.234,-0.366
29,a single,-0.104,-0.126,-0.394
...,...,...,...,...
54786,zookeeper,0.296,0.625,0.519
54789,zoologist,0.778,0.458,0.667
54795,zoophobia,-0.833,-0.667,-0.630
54797,zorro,0.625,0.667,0.792


In [173]:
arousal_polar = pd.read_csv('./data/vad/valence-polar-NRC-VAD-Lexicon-v2.1.txt', delimiter='\t', header=0)
pd.unique(arousal_polar['valence'])

array([ 1.   ,  0.996,  0.994,  0.991,  0.99 ,  0.986,  0.985,  0.981,
        0.979,  0.976,  0.971,  0.968,  0.966,  0.963,  0.962,  0.96 ,
        0.958,  0.952,  0.949,  0.948,  0.945,  0.944,  0.942,  0.94 ,
        0.939,  0.938,  0.936,  0.934,  0.933,  0.932,  0.928,  0.927,
        0.926,  0.922,  0.92 ,  0.918,  0.917,  0.916,  0.912,  0.91 ,
        0.908,  0.906,  0.905,  0.904,  0.902,  0.9  ,  0.899,  0.898,
        0.896,  0.894,  0.893,  0.892,  0.891,  0.89 ,  0.889,  0.888,
        0.886,  0.884,  0.88 ,  0.878,  0.876,  0.875,  0.874,  0.873,
        0.872,  0.87 ,  0.868,  0.867,  0.866,  0.864,  0.862,  0.86 ,
        0.858,  0.857,  0.854,  0.852,  0.85 ,  0.846,  0.844,  0.84 ,
        0.838,  0.836,  0.834,  0.833,  0.832,  0.83 ,  0.828,  0.826,
        0.824,  0.822,  0.82 ,  0.818,  0.816,  0.815,  0.814,  0.812,
        0.81 ,  0.809,  0.808,  0.806,  0.804,  0.802,  0.8  ,  0.798,
        0.796,  0.794,  0.792,  0.79 ,  0.788,  0.786,  0.784,  0.78 ,
      

In [167]:
set(arousal_polar['term']) - set(arousal_polar_from_vad['term'])

{'addressbook',
 'alarm clock',
 'babycarriage',
 'biggestfear',
 'cantbreathe',
 'doubleclick',
 'dryclean',
 'farmhouse',
 'fishinghook',
 'fishingpole',
 'fishtank',
 'freakingout',
 'fuckedoff',
 'fucksake',
 'fuckyeah',
 'growthefuckup',
 'hairdress',
 'hand grenade',
 'hangup',
 'happynewyear',
 'happyvalentinesday',
 'hateeveryone',
 'hateyou',
 'horsetrading',
 'ironingboard',
 'jet ski',
 'jump rope',
 'lazyday',
 'lifevest',
 'littlethings',
 'lovebite',
 'lovemylife',
 'luckygirl',
 'mailserver',
 'mindmap',
 'mixedemotions',
 nan,
 'newsweek',
 'nocomplaints',
 'nopoint',
 'nothappy',
 'nowork',
 'oldfashioned',
 'oneday',
 'palmtree',
 'peaceofmind',
 'pencilsharpener',
 'pieceofshit',
 'roadsweeper',
 'rock band',
 'rollingpin',
 'safetypin',
 'slot machine',
 'sohappy',
 'soslow',
 'spinningwheel',
 'stressfree',
 'stupidpeople',
 'subheader',
 'sweetwater',
 'taperecorder',
 'tearsofjoy',
 'thankyou',
 'thelittlethings',
 'tiredofit',
 'toomuchtodo',
 'traindriver',
 'u

## N-gram positions

In [177]:
name_grams_positions = list(ngrams(range(len(word_tokenize("How are you today?"))), 2))
name_grams_positions = [set(pos) for pos in name_grams_positions]
name_grams_positions

[{0, 1}, {1, 2}, {2, 3}, {3, 4}]