# Libraries

In [45]:
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Read Data

## Lexicon

In [58]:
# Load the NRC VAD lexicon: a tab-separated file whose first row holds the column names.
# keep_default_na=False together with an empty na_values list switches off pandas' NaN
# parsing, so every term is read as a literal string (a term spelled e.g. "null" or "nan"
# would otherwise be turned into a missing value).
vad_lexicon = pd.read_csv('./data/vad/NRC-VAD-Lexicon-v2.1.txt', sep='\t', header=0, keep_default_na=False, na_values=[])

In [59]:
# Preview the loaded lexicon.
vad_lexicon

Unnamed: 0,term,valence,arousal,dominance
0,a battery,0.134,-0.298,-0.096
1,a bit,-0.096,-0.264,-0.214
2,a bunch,0.088,-0.350,-0.068
3,a cappella,0.134,-0.116,-0.200
4,a couple,0.266,-0.110,0.090
...,...,...,...,...
54797,zorro,0.625,0.667,0.792
54798,zucchini,0.020,-0.358,-0.500
54799,zulu,0.000,0.000,0.000
54800,zygote,0.278,0.333,-0.167


### 5-gram lexicons

In [115]:
# Keep only the lexicon rows whose term is made of exactly five whitespace-separated words.
five_grams_lexicon = vad_lexicon.loc[
    vad_lexicon['term'].str.split().str.len().eq(5)
].copy()

### 4-gram lexicons

In [117]:
# Keep only the lexicon rows whose term is made of exactly four whitespace-separated words.
four_grams_lexicon = vad_lexicon.loc[
    vad_lexicon['term'].str.split().str.len().eq(4)
].copy()

### 3-gram lexicons

In [119]:
# Keep only the lexicon rows whose term is made of exactly three whitespace-separated words.
three_grams_lexicon = vad_lexicon.loc[
    vad_lexicon['term'].str.split().str.len().eq(3)
].copy()

### 2-gram lexicons

In [124]:
# Keep only the lexicon rows whose term is made of exactly two whitespace-separated words.
two_grams_lexicon = vad_lexicon.loc[
    vad_lexicon['term'].str.split().str.len().eq(2)
].copy()

### 1-gram lexicons

In [129]:
# Keep only the lexicon rows whose term is a single word (no internal whitespace).
one_gram_lexicon = vad_lexicon.loc[
    vad_lexicon['term'].str.split().str.len().eq(1)
].copy()

## Mental Health Severity Data

In [24]:
# Load the BeyondBlue post-author table; the first CSV row supplies the column names.
mental_health_severity_table = pd.read_csv('./data/BeyondBlue/commented_post_authors_edited.csv', header=0)

In [25]:
# Preview the loaded table.
mental_health_severity_table

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,1,1765
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,0,161
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,0,359
...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,0,0
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,0,2
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,0,30
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,1,23


# Main

## Check the Maximum N-gram in 'term'

In [62]:
# Largest number of words in any lexicon term. The terms are split on any run of
# whitespace (`.str.split()` with no separator), which is the same tokenisation the
# per-size lexicon filters use, so this check and those filters always agree.
# int() keeps the displayed result a plain Python integer.
int(vad_lexicon['term'].str.split().str.len().max())

5

## `vad()`

In [131]:
def _lookup_vad_scores(lexicon, candidates):
    """
    Collect the VAD scores of every candidate term that is present in a lexicon.

    Parameters:
    lexicon (pd.DataFrame): Lexicon with columns 'term', 'valence', 'arousal', 'dominance'.
    candidates (list of str): Terms to look up.

    Returns:
    dict: Maps each candidate found in the lexicon to a
        (valence, arousal, dominance) tuple of floats.
    """
    # One vectorised pass over the lexicon instead of one full scan per candidate.
    hits = lexicon[lexicon['term'].isin(candidates)]

    scores = {}
    for term, valence, arousal, dominance in zip(
            hits['term'], hits['valence'], hits['arousal'], hits['dominance']):
        # If a term appears on several lexicon rows, the first row wins.
        if term not in scores:
            scores[term] = (float(valence), float(arousal), float(dominance))
    return scores


def vad(
        text,
        five_grams_lexicon=five_grams_lexicon,
        four_grams_lexicon=four_grams_lexicon,
        three_grams_lexicon=three_grams_lexicon,
        two_grams_lexicon=two_grams_lexicon,
        one_gram_lexicon=one_gram_lexicon):
    """
    Collect Valence, Arousal, and Dominance (VAD) scores for the lexicon terms found in a text.

    The text is lowercased and tokenized, then matched against the lexicons from the
    longest n-grams (5 words) down to single words. A token that already belongs to a
    matched n-gram is never reused, so each token contributes to at most one match.
    Within one n-gram size, candidates are tried from left to right.

    Parameters:
    text (str): The input text to analyze.
    five_grams_lexicon (pd.DataFrame): Lexicon rows tried against 5-word n-grams.
    four_grams_lexicon (pd.DataFrame): Lexicon rows tried against 4-word n-grams.
    three_grams_lexicon (pd.DataFrame): Lexicon rows tried against 3-word n-grams.
    two_grams_lexicon (pd.DataFrame): Lexicon rows tried against 2-word n-grams.
    one_gram_lexicon (pd.DataFrame): Lexicon rows tried against single tokens.
        Every lexicon needs the columns 'term', 'valence', 'arousal', 'dominance'.

    Returns:
    dict: 'valence', 'arousal' and 'dominance' map to lists of floats with one entry
        per match; 'matched_ngrams' maps to the matched terms in the same order
        (all 5-gram matches first, then 4-grams, ..., then single words). All four
        lists are empty when nothing matches.
    """
    # Preprocess the text: lowercase and split into tokens
    words = word_tokenize(text.lower())

    # Per-match VAD scores, the matched terms, and the token positions already consumed
    valence_scores = []
    arousal_scores = []
    dominance_scores = []
    matched_ngrams = []
    matched_positions = set()

    # Longest n-grams first, so multi-word expressions take precedence over their parts
    lexicons_by_size = (
        (5, five_grams_lexicon),
        (4, four_grams_lexicon),
        (3, three_grams_lexicon),
        (2, two_grams_lexicon),
        (1, one_gram_lexicon),
    )

    for size, lexicon in lexicons_by_size:
        # Start index of every n-gram of this size; empty when the text is too short
        starts = range(len(words) - size + 1)
        grams = [' '.join(words[start:start + size]) for start in starts]
        if not grams:
            continue

        scores = _lookup_vad_scores(lexicon, grams)
        if not scores:
            continue

        for start, gram in zip(starts, grams):
            if gram not in scores:
                continue
            positions = range(start, start + size)
            # Skip if any token of this n-gram is already part of a matched n-gram.
            # This is applied at every size (including 5-grams) so that overlapping
            # matches of the same size are not both counted.
            if not matched_positions.isdisjoint(positions):
                continue
            valence, arousal, dominance = scores[gram]
            valence_scores.append(valence)
            arousal_scores.append(arousal)
            dominance_scores.append(dominance)
            matched_ngrams.append(gram)
            matched_positions.update(positions)

    return {
        'valence': valence_scores,
        'arousal': arousal_scores,
        'dominance': dominance_scores,
        'matched_ngrams': matched_ngrams
    }

In [64]:
# Take the first row's "Author_Post" value as a sample input for vad().
text = mental_health_severity_table["Author_Post"].iloc[0]
text

"Parental Anxiety. Hi Everyone, I am not quite sure how to put this in words so will just type and she what comes out. I feel so much anxiety about my teenager and the choices she will make in her life. I have to allow her to make some mistakes and learn and grow but the anxiety this gives me is quite literally strangling me. How do I stop over analyzing everything a thousand times over in my brain. It's crippling me."

In [109]:
# Number of tokens word_tokenize produces for the sample post.
len(word_tokenize(text))

90

In [133]:
# Lexicon terms matched in the sample post, listed from the longest n-grams to single words.
vad(text)["matched_ngrams"]

['not quite',
 'how to',
 'so much',
 'have to',
 'stop over',
 'thousand times',
 'parental',
 'anxiety',
 'hi',
 'everyone',
 'am',
 'sure',
 'put',
 'this',
 'words',
 'will',
 'just',
 'type',
 'and',
 'what',
 'out',
 'feel',
 'anxiety',
 'about',
 'teenager',
 'and',
 'will',
 'make',
 'life',
 'allow',
 'to',
 'make',
 'some',
 'and',
 'learn',
 'and',
 'grow',
 'but',
 'anxiety',
 'this',
 'is',
 'quite',
 'literally',
 'how',
 'do',
 'everything',
 'over',
 'brain',
 'it',
 'crippling']

In [108]:
# Number of matches in the sample post (vad() stores one valence score per matched term).
len(vad(text)["valence"])

50