## Semantic Similarity

This notebook walks through the steps to compute word embeddings for each comment in the Reddit dataset. Continuing from the previous notebook, we consider weekly-level data from the months of May and June across the selected subreddits to compute the word embeddings. We utilize an sentence-BERT model using which we compute embeddings and then generate a Semantic Similarity Measure.

----

Install libraries

In [1]:
pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.0.0-cp38-cp38-manylinux_2_28_x86_64.whl (40.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/40.8 MB[0m [31m52.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/40.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/40.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/40.8 MB[0m [31m61.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/40.8 MB[0m [31m80.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/40.8 MB[0m 

In [2]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk
Successfully installed nltk-3.8.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
#Download the SBERT model to deploy on the 'body' and the 'receiver_body' columns, so that the word embeddings can be obtained.
!pip install -U sentence-transformers
# load tqdm
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip
!pip install swifter

Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.66.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting https://github.com/chengs/tqdm/archive/colab.zip
  Using cached https://github.com/chengs/tqdm/archive/colab.zip
  Preparing metadata (setup.py) ... [?25l- done
[?25hBuil

In [5]:
import numpy as np
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, util
from torch.utils.data import DataLoader
from sentence_transformers import losses
import os
import swifter
from nltk.tokenize import sent_tokenize
import torch
nltk.download('punkt')
from tqdm import tqdm
tqdm.pandas()
import pickle

[nltk_data] Downloading package punkt to /home/datalore/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('all-mpnet-base-v2')

We then define functions to generate the word embeddings at the comment level for each week of data. 

In [None]:
def generate_language_similarity(val1, val2):
    return util.dot_score(val1, val2)

#define function to generate embeddings for one row
def generate_embeddings(record):
   return sbert_model.encode(record, convert_to_numpy=True, normalize_embeddings=True)

def divide_data_3_parts(week_data):
    """
    Divide the data of a given week into three parts
    Parameters
    ----------
    week_data : dataframe
        the data for a given week
    Returns
    ----------
    part1, part2, part3: tuple of dataframes
         the data for each part
    """
    week_data.head(3)
    div, mod = divmod(len(week_data), 3)
    part1 = week_data[:div]
    part2 = week_data[div:div * 2]
    part3 = week_data[div * 2:]
    if mod == 1:
        part3 = pd.concat([part3, part2.iloc[-1:]], ignore_index=True)
        part2 = part2.iloc[:-1]
    elif mod == 2:
        part3 = pd.concat([part3, part2.iloc[-2:]], ignore_index=True)
        part2 = part2.iloc[:-2]
    return part1, part2, part3

def word_embeddings(week_no,part_data,part_no):
    """
    Compute the word embeddings for each author's and receiver's comment and then compute the semantic similarity
    Parameters
    ----------
    week_no : int
        the week's number (from 1 to 26)
    part_data : dataframe
        the subset of the reddit data for the given week
    part_no : int
        the part number (1, 2, or 3)
    Returns
    ----------
        None
    """
    #call the function to generate word embeddings
    print('start generating the word embeddings')
    body_embeddings = part_data['body'].progress_apply(generate_embeddings)
    print('finished generation of body embeddings')
    print(len(body_embeddings))
    receiver_body_embeddings = part_data['receiver_body'].progress_apply(generate_embeddings)
    print('finished generation of receiver embeddings')
    print(len(receiver_body_embeddings))
    print('finished generation of word embeddings')
    model_storage_name_location = 'word_embeddings/embeddings_w'+ str(week_no) +'_part'+str(part_no)+ '.pkl'
    # Open the file in binary write mode
    with open(model_storage_name_location, 'wb') as f:
        pickle.dump(body_embeddings, f)
    print(type(body_embeddings))
    model_storage_name_location_receiver = 'word_embeddings/receiver_embeddings_w'+ str(week_no) +'_part'+str(part_no)+ '.pkl'
    # Open the file in binary write mode
    with open(model_storage_name_location_receiver, 'wb') as f:
        pickle.dump(receiver_body_embeddings, f)
    print(type(receiver_body_embeddings))
    #compute the languageSimilarity
    df1 = pd.concat([body_embeddings, receiver_body_embeddings], axis=1)
    print(len(df1))
    df1['languageSimilarity_commentLevel'] = df1.progress_apply(lambda x: (util.dot_score(x[0],x[1])).item(), axis=1)
    extracted_col = df1['languageSimilarity_commentLevel']
    print(len(extracted_col))
    part_data_final = pd.concat([part_data, extracted_col], axis=1)
    print(len(part_data_final))
    #part_data_final.head(3)
    resulting_file_name = 'bothSim_Processed/mayjune_w'+str(week_no)+'_part'+str(part_no)+ '_processed.parquet'
    part_data_final.to_parquet(resulting_file_name)
    print('completed!')

In [7]:
##############################
#read the embeddings
# week = 1
# part = 3
# body_embeddings_1 = pd.read_pickle('JanFebSubreddits2022/word_embeddings/embeddings_w'+str(week)+'_part'+str(part)+'.pkl')
# print(len(body_embeddings_1))
# print(body_embeddings_1[0])
# print(type(body_embeddings_1))
# receiver_embeddings_1 = pd.read_pickle('JanFebSubreddits2022/word_embeddings/receiver_embeddings_w'+str(week)+'_part'+str(part)+'.pkl')
# print(len(receiver_embeddings_1))
# print(receiver_embeddings_1[0])
# print(type(receiver_embeddings_1))

In [8]:
#############################
#read the processed file with both network similarity and language similarity
# week = 1
# part = 3
# file_path1 = 'JanFebSubreddits2022/bothSim_Processed/janfeb_w' + str(week) + '_part'+str(part)+'_processed.parquet'
# week_data2 = pd.read_parquet(file_path1)
# print(week_data2['subreddit'].value_counts())
# print(week_data2['date'].value_counts())
# week_data2.head(2)

To demonstrate, we show how we call the functions to generate word embeddings and hence a semantic similarity measure using the above functions, for a subset of data.

## Process Week 18.

In [8]:
#reading the data and dividing it into parts
w18 = pd.read_parquet('netSim_processed/mayjune_w18_processed.parquet')
print(len(w18))
w18_p1,w18_p2,w18_p3 = divide_data_3_parts(w18)
print(len(w18_p1))
print(len(w18_p2))
print(len(w18_p3))
print(len(w18_p1)+len(w18_p2)+len(w18_p3))
w18['subreddit'].value_counts()

1008518
336172
336170
336176
1008518


In [9]:
w18['date'].value_counts()

In [3]:
#w18.head(2)

In [14]:
word_embeddings(18,w18_p1,1)

start generating the word embeddings
finished generation of body embeddings
336172
finished generation of receiver embeddings
336172
finished generation of word embeddings
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
336172
336172
336172
completed!


  0%|          | 0/336172 [00:00<?, ?it/s]  0%|          | 1144/336172 [00:00<00:29, 11431.59it/s]  1%|          | 2288/336172 [00:00<00:50, 6670.14it/s]   1%|          | 3075/336172 [00:00<00:47, 7064.49it/s]  1%|▏         | 4418/336172 [00:00<00:36, 9098.34it/s]  2%|▏         | 5788/336172 [00:00<00:31, 10540.40it/s]  2%|▏         | 7102/336172 [00:00<00:29, 11343.04it/s]  2%|▏         | 8397/336172 [00:00<00:27, 11835.27it/s]  3%|▎         | 9626/336172 [00:00<00:30, 10590.69it/s]  3%|▎         | 10986/336172 [00:01<00:28, 11426.34it/s]  4%|▎         | 12238/336172 [00:01<00:27, 11735.09it/s]  4%|▍         | 13620/336172 [00:01<00:26, 12335.08it/s]  4%|▍         | 15098/336172 [00:01<00:24, 13046.67it/s]  5%|▍         | 16426/336172 [00:01<00:24, 13112.74it/s]  5%|▌         | 17792/336172 [00:01<00:23, 13271.66it/s]  6%|▌         | 19189/336172 [00:01<00:23, 13478.83it/s]  6%|▌         | 20577/336172 [00:01<00:23, 13595.38it/s]  7%|▋         | 22040/336172 [00:01<0

In [4]:
#read the processed file with both network similarity and language similarity
#week = 18
#part = 1
#file_path1 = 'bothSim_Processed/mayjune_w' + str(week) + '_part'+str(part)+'_processed.parquet'
#test = pd.read_parquet(file_path1)
#test.head(2)