# Libraries

In [23]:
import pandas as pd
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from tqdm import tqdm
tqdm.pandas()

# Read Data

## Lexicon

In [4]:
vad_lexicon = pd.read_csv('./data/vad/NRC-VAD-Lexicon-v2.1.txt', sep='\t', header=0, keep_default_na=False, na_values=[])

In [5]:
vad_lexicon

Unnamed: 0,term,valence,arousal,dominance
0,a battery,0.134,-0.298,-0.096
1,a bit,-0.096,-0.264,-0.214
2,a bunch,0.088,-0.350,-0.068
3,a cappella,0.134,-0.116,-0.200
4,a couple,0.266,-0.110,0.090
...,...,...,...,...
54797,zorro,0.625,0.667,0.792
54798,zucchini,0.020,-0.358,-0.500
54799,zulu,0.000,0.000,0.000
54800,zygote,0.278,0.333,-0.167


### 5-gram lexicons

In [6]:
# filter only 5-grams lexicons
five_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 5].copy()

### 4-gram lexicons

In [7]:
# filter only 4-grams lexicons
four_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 4].copy()

### 3-gram lexicons

In [8]:
# filter only 3-grams lexicons
three_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 3].copy()

### 2-gram lexicons

In [9]:
# filter only 2-grams lexicons
two_grams_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 2].copy()

### 1-gram lexicons

In [10]:
# filter only 1-gram lexicons
one_gram_lexicon = vad_lexicon[vad_lexicon['term'].str.split().str.len() == 1].copy()

## Mental Health Severity Data

### Unbounded

In [11]:
mental_health_severity_table = pd.read_csv('./data/BeyondBlue/commented_post_authors_edited.csv', header=0)

In [12]:
mental_health_severity_table

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,1,1765
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,0,161
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,0,359
...,...,...,...,...,...,...,...,...,...,...,...,...
8806,Suic-1308,Apple2468,Confused looking for support. Hello Since a ch...,Thanks for sharing those links with me. Readin...,0,0,1,0,0,1,0,0
8807,Suic-1309,jujusbizarrecircus,Intrusive thoughts. For the past two years I h...,"Hey Joseph, I am actually making a homebred ca...",0,0,0,2,1,1,0,2
8808,Suic-1310,Teegs_,I do not know how to keep living. This has bee...,"Thank you for your replies, I really appreciat...",0,0,0,0,2,1,0,30
8809,Suic-1313,lizzie50,13 Reasons Why. I am not sure if anyone has se...,Thanks for the reply Mary! The people I work w...,0,0,3,3,3,0,1,23


### Bounded (14 days)

In [32]:
mental_health_severity_table_max14days = pd.read_csv('./data/BeyondBlue/commented_post_authors_max14days.csv', header=0)

In [33]:
mental_health_severity_table_max14days.head()

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,Valued Contributor,Days_Between
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,0,0
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,0,9
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,0,1
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,1,8
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,0,2


# Main

## Check the Maximum N-gram in 'term'

In [13]:
# check the maximum n-gram in 'term'
max(vad_lexicon['term'].apply(lambda x: len(x.split(' '))))

5

## `vad()`

In [16]:
def vad(
        text, 
        five_grams_lexicon=five_grams_lexicon, 
        four_grams_lexicon=four_grams_lexicon, 
        three_grams_lexicon=three_grams_lexicon, 
        two_grams_lexicon=two_grams_lexicon, 
        one_gram_lexicon=one_gram_lexicon):
    """
    Calculate Valence, Arousal, and Dominance (VAD) scores for a given text using the NRC VAD Lexicon.

    Parameters:
    text (str): The input text to analyze.
    lexicon (pd.DataFrame): The VAD lexicon DataFrame with columns 'term', 'valence', 'arousal', 'dominance'.

    Returns:
    dict: A dictionary with average 'valence', 'arousal', and 'dominance' scores.
    """
    # Preprocess the text: lowercase and split into words
    words = word_tokenize(text.lower())
    
    # Initialize lists to store VAD scores
    valence_scores = []
    arousal_scores = []
    dominance_scores = []

    # initialize list to store matched n-grams and set to store their positions
    matched_ngrams = []
    matched_positions = set()

    # check for 5-grams of text in lexicon
    five_grams = list(ngrams(words, 5))
    five_grams = [' '.join(gram) for gram in five_grams]

    # record the positions of words that are part of matched 5-grams
    five_grams_positions = list(ngrams(range(len(words)), 5))
    five_grams_positions = [set(pos) for pos in five_grams_positions]
    for gram, pos in zip(five_grams, five_grams_positions):
        match = five_grams_lexicon[five_grams_lexicon['term'] == gram]
        if not match.empty:
            valence_scores.append(float(match['valence'].values[0]))
            arousal_scores.append(float(match['arousal'].values[0]))
            dominance_scores.append(float(match['dominance'].values[0]))
            matched_ngrams.append(gram)
            matched_positions = matched_positions.union(pos)

    # check for 4-grams of text in lexicon
    four_grams = list(ngrams(words, 4))
    four_grams = [' '.join(gram) for gram in four_grams]

    # record the positions of words that are part of matched 4-grams
    four_grams_positions = list(ngrams(range(len(words)), 4))
    four_grams_positions = [set(pos) for pos in four_grams_positions]

    for gram, pos in zip(four_grams, four_grams_positions):
        if not matched_positions.isdisjoint(pos):
            continue  # Skip if any word in the 4-gram is part of a matched n-gram
        match = four_grams_lexicon[four_grams_lexicon['term'] == gram]
        if not match.empty:
            valence_scores.append(float(match['valence'].values[0]))
            arousal_scores.append(float(match['arousal'].values[0]))
            dominance_scores.append(float(match['dominance'].values[0]))
            matched_ngrams.append(gram)
            matched_positions = matched_positions.union(pos)
    
    # check for 3-grams of text in lexicon
    three_grams = list(ngrams(words, 3))
    three_grams = [' '.join(gram) for gram in three_grams]

    # record the positions of words that are part of matched 3-grams
    three_grams_positions = list(ngrams(range(len(words)), 3))
    three_grams_positions = [set(pos) for pos in three_grams_positions]

    for gram, pos in zip(three_grams, three_grams_positions):
        if not matched_positions.isdisjoint(pos):
            continue  # Skip if any word in the 3-gram is part of a matched n-gram
        match = three_grams_lexicon[three_grams_lexicon['term'] == gram]
        if not match.empty:
            valence_scores.append(float(match['valence'].values[0]))
            arousal_scores.append(float(match['arousal'].values[0]))
            dominance_scores.append(float(match['dominance'].values[0]))
            matched_ngrams.append(gram)
            matched_positions = matched_positions.union(pos)
    
    # check for 2-grams of text in lexicon, excluding words already part of matched 3-grams
    two_grams = list(ngrams(words, 2))
    two_grams = [' '.join(gram) for gram in two_grams]

    # record the positions of words that are part of matched 2-grams
    two_grams_positions = list(ngrams(range(len(words)), 2))
    two_grams_positions = [set(pos) for pos in two_grams_positions]

    for gram, pos in zip(two_grams, two_grams_positions):
        if not matched_positions.isdisjoint(pos):
            continue  # Skip if any word in the 2-gram is part of a matched n-gram
        match = two_grams_lexicon[two_grams_lexicon['term'] == gram]
        if not match.empty:
            valence_scores.append(float(match['valence'].values[0]))
            arousal_scores.append(float(match['arousal'].values[0]))
            dominance_scores.append(float(match['dominance'].values[0]))
            matched_ngrams.append(gram)
            matched_positions = matched_positions.union(pos)

    # check for unigrams of text in lexicon, excluding words already part of matched n-grams
    for i, word in enumerate(words):
        if i in matched_positions:
            continue  # Skip if the word is part of a matched n-gram
        match = one_gram_lexicon[one_gram_lexicon['term'] == word]
        if not match.empty:
            valence_scores.append(float(match['valence'].values[0]))
            arousal_scores.append(float(match['arousal'].values[0]))
            dominance_scores.append(float(match['dominance'].values[0]))
            matched_ngrams.append(word)
            matched_positions.add(i)

    # Calculate average scores, return None if no scores found
    avg_valence = sum(valence_scores) / len(valence_scores) if valence_scores else None
    avg_arousal = sum(arousal_scores) / len(arousal_scores) if arousal_scores else None
    avg_dominance = sum(dominance_scores) / len(dominance_scores) if dominance_scores else None
    
    return {
        'valence': avg_valence,
        'arousal': avg_arousal,
        'dominance': avg_dominance
    }
    # return{
    #     'valence': valence_scores,
    #     'arousal': arousal_scores,
    #     'dominance': dominance_scores,
    #     'matched_ngrams': matched_ngrams
    # }

## Add VAD Scores and Diffs to `commented_post_authors_edited` (`mental_health_severity_v1`)

This adds the VAD scores and the difference between post and comment VAD to `commented_post_authors_edited.csv` data. This will then be saved to `mentalh_healt_severity_v1.csv`

In [25]:
# add VAD scores to mental_health_severity_table
mental_health_severity_table[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_health_severity_table['Author_Post'].progress_apply(lambda x: pd.Series(vad(x)))
mental_health_severity_table[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_health_severity_table['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad(x)))
# add VAD scores diff between post and comment
mental_health_severity_table['Valence_Diff'] = mental_health_severity_table['Comment_Valence'] - mental_health_severity_table['Post_Valence']
mental_health_severity_table['Arousal_Diff'] = mental_health_severity_table['Comment_Arousal'] - mental_health_severity_table['Post_Arousal']
mental_health_severity_table['Dominance_Diff'] = mental_health_severity_table['Comment_Dominance'] - mental_health_severity_table['Post_Dominance']
mental_health_severity_table.head()

  0%|          | 0/8811 [00:00<?, ?it/s]

100%|██████████| 8811/8811 [2:47:18<00:00,  1.14s/it]  
100%|██████████| 8811/8811 [1:28:54<00:00,  1.65it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.1145,-0.0034,0.08986,0.241185,-0.050889,0.179852,0.126685,-0.047489,0.089992
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...",Since my last post I ended up quitting my job....,0,2,28,3,8,0,...,1765,0.08281,0.002881,-0.062214,0.039143,0.060333,0.016333,-0.043667,0.057452,0.078548
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,Such a good idea this too shallpass thanks for...,0,0,0,0,2,0,...,161,0.149161,-0.049231,0.044842,0.448313,-0.04875,0.17125,0.299151,0.000481,0.126408
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.117859,0.005094,0.033594,0.104344,-0.036,0.051844,-0.013516,-0.041094,0.01825
4,Anxi-28,jordan,"Vomiting, GAD, herd. Hello everyone, I have ha...",I am replying after a year. Thank you for your...,0,0,0,0,1,1,...,359,0.06303,-0.038095,-0.001982,0.3275,-0.11675,0.189188,0.26447,-0.078655,0.19117


In [26]:
mental_health_severity_table.to_csv('./data/BeyondBlue/mental_health_severity_v1.csv', index=False)

## Add VAD Scores and Diffs to `commented_post_authors_max14days` (`mental_health_severity_v2`)

In [34]:
mental_healt_severity_v2 = mental_health_severity_table_max14days.copy()
# add VAD scores to mental_health_severity_table
mental_healt_severity_v2[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = mental_healt_severity_v2['Author_Post'].progress_apply(lambda x: pd.Series(vad(x)))
mental_healt_severity_v2[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = mental_healt_severity_v2['Author_Last_Comment'].progress_apply(lambda x: pd.Series(vad(x)))
# add VAD scores diff between post and comment
mental_healt_severity_v2['Valence_Diff'] = mental_healt_severity_v2['Comment_Valence'] - mental_healt_severity_v2['Post_Valence']
mental_healt_severity_v2['Arousal_Diff'] = mental_healt_severity_v2['Comment_Arousal'] - mental_healt_severity_v2['Post_Arousal']
mental_healt_severity_v2['Dominance_Diff'] = mental_healt_severity_v2['Comment_Dominance'] - mental_healt_severity_v2['Post_Dominance']
mental_healt_severity_v2.head()

100%|██████████| 8387/8387 [3:17:19<00:00,  1.41s/it]   
100%|██████████| 8387/8387 [1:12:24<00:00,  1.93it/s]


Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
0,Anxi-6,Amenace,"Parental Anxiety. Hi Everyone, I am not quite ...",Thank you Indigo. I am seeing a counsellor ton...,0,0,0,1,0,0,...,0,0.1145,-0.0034,0.08986,0.241185,-0.050889,0.179852,0.126685,-0.047489,0.089992
1,Anxi-12,Whatsinaname,"Horrible week. Hi everyone, I am having a horr...","Hi Petal, Thanks again for posting, your posit...",0,2,12,0,0,0,...,9,0.08281,0.002881,-0.062214,0.207382,-0.059235,0.034176,0.124573,-0.062116,0.096391
2,Anxi-13,Sammy,New relationship anxiety. Dear adjust need a c...,"Thanks ranges I told him, and he apologized an...",0,0,0,0,2,0,...,1,0.149161,-0.049231,0.044842,0.129939,-0.065273,-0.015879,-0.019222,-0.016042,-0.060721
3,Anxi-17,Olive83,Easy strategies for quick response. Recently m...,You are not wrong Caught between! And I did ex...,0,0,0,1,4,0,...,8,0.117859,0.005094,0.033594,0.104344,-0.036,0.051844,-0.013516,-0.041094,0.01825
4,Anxi-34,Gobble,Any idea?. I need a little help. On a generali...,She then spread rumors horrid things that coul...,0,0,0,2,0,0,...,2,0.080487,-0.005872,0.044974,0.008595,-0.036451,-0.031768,-0.071892,-0.03058,-0.076742


In [35]:
mental_healt_severity_v2.to_csv('./data/BeyondBlue/mental_health_severity_v2.csv', index=False)

# Testings

## Generate VAD Scores and Diffs

In [149]:
sample_mental_health_severity_table = mental_health_severity_table.sample(10).copy()
# add VAD scores to mental_health_severity_table
sample_mental_health_severity_table[['Post_Valence', 'Post_Arousal', 'Post_Dominance']] = sample_mental_health_severity_table['Author_Post'].apply(lambda x: pd.Series(vad(x)))  
sample_mental_health_severity_table[['Comment_Valence', 'Comment_Arousal', 'Comment_Dominance']] = sample_mental_health_severity_table['Author_Last_Comment'].apply(lambda x: pd.Series(vad(x)))
# add VAD scores diff between post and comment
sample_mental_health_severity_table['Valence_Diff'] = sample_mental_health_severity_table['Comment_Valence'] - sample_mental_health_severity_table['Post_Valence']
sample_mental_health_severity_table['Arousal_Diff'] = sample_mental_health_severity_table['Comment_Arousal'] - sample_mental_health_severity_table['Post_Arousal']
sample_mental_health_severity_table['Dominance_Diff'] = sample_mental_health_severity_table['Comment_Dominance'] - sample_mental_health_severity_table['Post_Dominance']
sample_mental_health_severity_table

Unnamed: 0,Post_ID,Author,Author_Post,Author_Last_Comment,Beyond Blue Staff,Blue Voices Member,Champion Alumni,Community Champion,Community Member,Moderator,...,Days_Between,Post_Valence,Post_Arousal,Post_Dominance,Comment_Valence,Comment_Arousal,Comment_Dominance,Valence_Diff,Arousal_Diff,Dominance_Diff
2043,Anxi-4232,greatoutdoors123,"Workplace Anxiety. Hi everyone, This is my fir...","Thank you rundown, your message is very helpful!",0,0,0,0,1,0,...,0,0.092418,-0.024397,0.033527,0.268714,-0.182,0.139429,0.176296,-0.157603,0.105901
4866,Depr-2391,Mickii,Physical versus mental illness. Do other peopl...,"Mainly fatigue, dizziness and dissociation. I ...",0,1,0,2,2,0,...,1,-0.118462,-0.033385,-0.163692,0.049407,0.006074,0.034259,0.167869,0.039459,0.197952
3109,Anxi-6170,Amberlourob,"Fighting hard but tired. Hello, I am new here ...",Thank you so much for your reply James! I real...,0,0,0,1,0,0,...,0,0.092138,0.012644,0.05581,0.19172,-0.068893,0.091747,0.099582,-0.081537,0.035936
4646,Depr-1953,penguin7676,I think I have depression.... Hi. I am kind of...,"Hi, I went to my go this afternoon. I have dep...",0,1,2,0,2,1,...,2,0.055386,-0.039852,-0.02109,0.150347,-0.096571,0.00298,0.094961,-0.056719,0.02407
4617,Depr-1891,User018263,Hopelessness and Lack of Meaning. Hi all if an...,"Hi random, thanks for the post. Yep sounds a b...",0,0,1,0,3,0,...,4,0.136391,-0.101238,0.028596,0.182512,-0.024767,0.017558,0.046121,0.076471,-0.011038
8445,Suic-517,Angie2480,"When Life Seems Too Hard. HelloWow, I cannot b...","Thanks, Sophie. I am not sure that I am ok. I ...",0,0,0,0,0,1,...,0,0.138092,-0.059218,0.06123,0.036429,-0.121143,-0.066429,-0.101663,-0.061924,-0.127658
6826,Depr-5960,Kirlei02,"its with depression: after a lifetime, it all ...","Thanks, Neil, I thank everyone for their feedb...",0,1,1,0,4,0,...,35,0.116731,-0.023434,0.026549,0.135173,-0.083983,0.034804,0.018442,-0.060549,0.008256
3754,Anxi-7288,KatieG,Anxiety has turned my life upside down. I rece...,"Oh, no I am an arsenal fan for life! I am curr...",0,0,0,0,2,0,...,5,0.05219,-0.029529,0.0145,0.179727,-0.055697,0.055333,0.127538,-0.026168,0.040833
3468,Anxi-6801,Tubbypuff,Anxiety flare-ups at night. I got random anxie...,"Hi SM, thank you for the ideas! Last night sta...",0,0,0,0,2,0,...,1,0.06873,-0.02055,-0.00101,0.141164,-0.142104,-0.007149,0.072434,-0.121554,-0.006139
1121,Anxi-2373,Ikvic,Next step a hurdle. I have fluctuating levels ...,Hi Tim%2c%3cbr/%3eNo I don%27t have to tell my...,0,0,0,1,1,0,...,1,0.062716,-0.037706,0.024451,0.103385,-0.020339,0.065138,0.04067,0.017366,0.040687
