In [None]:
import json
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 

# Fetch the NLTK data packages this notebook relies on: the 'stopwords' corpus
# is read by stopwords.words('english') and 'wordnet' by WordNetLemmatizer.
# NOTE(review): 'punkt' is presumably what word_tokenize needs, and 'omw-1.4'
# what the WordNet lemmatizer needs on newer NLTK releases — confirm against
# the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
# This cell only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the annotated dataset. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open("/content/drive/MyDrive/Content_Similarity/EnglishDataset.json", encoding='utf-8') as dataset_file:
    caption_comments_replies = json.load(dataset_file)

In [None]:
# Number of top-level entries (one per post) loaded from the dataset file.
len(caption_comments_replies)

65

In [None]:
def tokenization(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(text)

In [None]:
def Capitalization(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

In [None]:
def remove_specail_chars(text):
    """Join the strings in ``text`` and keep only runs of word characters.

    The items of ``text`` are joined with single spaces; the joined string is
    then split by a regular-expression tokenizer that matches one or more
    word characters, so punctuation and other symbols are discarded.

    Returns:
        list of str: the word-character tokens, in order of appearance.
    """
    joined = ' '.join(text)
    word_only_tokenizer = nltk.RegexpTokenizer(r"\w+")
    return word_only_tokenizer.tokenize(joined)

In [None]:
def text_cleaner(text):
    """Strip HTML markup and social-media artefacts from ``text``, then lower-case it.

    Each markup rule is applied in order; after every single rule the four
    placeholder substitutions (links, @mentions, #hashtags) are applied again.
    Finally trailing whitespace is removed and the result is passed through
    ``Capitalization``.

    NOTE(review): placeholders such as '<link>' themselves match the
    "remaining tags" rule, so placeholders inserted during earlier passes
    appear to be deleted again by that rule — confirm whether that is intended.

    Args:
        text (str): raw text, possibly containing HTML.

    Returns:
        str: the cleaned, lower-cased text.
    """
    # Ordered (pattern, replacement) pairs; later rules see the output of earlier ones.
    markup_rules = (
        (r'>\s+', u'>'),  # drop whitespace that follows a '>'
        (r'\s+', u' '),  # collapse every whitespace run into one space
        (r'\s*<br\s*/?>\s*', u'\n'),  # <br> becomes a newline
        (r'</(div)\s*>\s*', u'\n'),  # </div> becomes a newline
        (r'</(p|h\d)\s*>\s*', u'\n\n'),  # </p> and </hN> become a blank line
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),  # drop <head> up to </head> or <body>
        (r'<a\s+href="([^"]+)"[^>]*>.*</a>', r'\1'),  # keep an anchor's href instead of its text
        (r'[ \t]*<[^<]*?/?>', u''),  # strip whatever tags are left
        (r'^\s+', u''),  # trim leading whitespace
    )
    # Substitutions that swap links, mentions and hashtags for fixed markers.
    placeholder_rules = (
        ('https://[^ ]+', '<link>'),
        ('http://[^ ]+', '<link>'),
        ('@[^ ]+', '<username>'),
        ('#[^ ]+', '<hashtag>'),
    )

    for pattern, replacement in markup_rules:
        text = re.compile(pattern).sub(replacement, text)
        # Re-applied after every markup rule, exactly once per rule.
        for marker_pattern, marker in placeholder_rules:
            text = re.sub(marker_pattern, marker, text)

    return Capitalization(text.rstrip())

In [None]:
def Lemmatizing(filtered_sentence):
    """Strip non-word characters from the tokens and lemmatize each result.

    Args:
        filtered_sentence: sequence of string tokens.

    Returns:
        list of str: one WordNet-lemmatized token per word-character token
        produced by ``remove_specail_chars``.
    """
    word_tokens = remove_specail_chars(filtered_sentence)
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in word_tokens]

In [None]:
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading the corpus on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Clean, tokenize, stop-word-filter and lemmatize ``text``.

    Pipeline: ``text_cleaner`` -> ``tokenization`` -> drop English stop
    words -> ``Lemmatizing`` (which also strips non-word characters).

    Args:
        text (str): raw caption or comment text.

    Returns:
        list of str: the processed tokens.
    """
    # The stop-word set is cached so it is not rebuilt on every call.
    stop_words = _english_stop_words()

    word_tokens = tokenization(text_cleaner(text))

    # Single filtering pass; the original computed this list twice.
    filtered_sentence = [w for w in word_tokens if w not in stop_words]

    # Lemmatizing already runs remove_specail_chars on its input. The original
    # repeated that call afterwards but stored the result in a misspelled,
    # unused variable, so dropping it does not change the returned tokens.
    return Lemmatizing(filtered_sentence)

In [None]:
def get_captions_and_comments(index):
    """Return ``(content, comments_and_replies)`` for the post at ``index``.

    Reads from the module-level ``caption_comments_replies`` collection.
    """
    post = caption_comments_replies[index]
    return post['content'], post['comments_and_replies']

In [None]:
all_captions_and_comments = []


def _flatten_comments_and_replies(comments):
    """Flatten one post's comment threads into parallel text and label lists."""
    texts = []
    labels = []
    for thread in comments:
        texts.append(thread['comment'])
        labels.append(thread['related'])
        # Each reply is a dict: its keys are collected as texts, its values as labels.
        for reply in thread['replies']:
            texts.extend(reply.keys())
            labels.extend(reply.values())
    return texts, labels


def iterate_all_captions_and_comments(total_posts,total_comments):
    """Preprocess every post and append its usable comments to the global row list.

    For each post in ``caption_comments_replies`` the caption and all of its
    comments and replies are run through ``preprocess_text``. A comment is kept
    only when it yields more than two tokens; single-character tokens are then
    dropped from the kept comment. Each kept comment is appended to
    ``all_captions_and_comments`` as
    ``[post_number, caption, comment, label]`` with a 1-based post number.

    A post that raises an ``Exception`` is reported (the error and the raw
    post are printed) and skipped, and processing continues with the next
    post. Nothing is appended for a skipped post. Other errors, such as
    ``KeyboardInterrupt``, propagate to the caller; the original
    ``return`` inside ``finally`` silently discarded them.

    Args:
        total_posts (int): starting value of the processed-post counter.
        total_comments (int): starting value of the kept-comment counter.

    Returns:
        tuple: ``(total_posts, total_comments)`` after adding the posts
        processed successfully and the comments kept.
    """
    for index in range(len(caption_comments_replies)):
        try:
            caption, comments = get_captions_and_comments(index)
            texts, labels = _flatten_comments_and_replies(comments)

            filtered_caption = ' '.join(preprocess_text(caption))

            rows = []
            for text, label in zip(texts, labels):
                tokens = preprocess_text(text)
                # Keep only comments with more than two tokens, then drop
                # single-character tokens from the ones that are kept.
                if len(tokens) > 2:
                    cur_comment = ' '.join(word for word in tokens if len(word) > 1)
                    rows.append([index + 1, filtered_caption, cur_comment, label])
        except Exception as e:
            print(e)
            print(caption_comments_replies[index])
            continue

        # Commit the rows only once the whole post has been processed.
        all_captions_and_comments.extend(rows)
        total_posts += 1
        total_comments += len(rows)

    return total_posts, total_comments

    
# Run the preprocessing pass over every post, starting both counters at zero.
total_posts, total_comments = iterate_all_captions_and_comments(
    total_posts=0,
    total_comments=0,
)

print("\nTotal Posts : {}\n".format(total_posts))
print("\nTotal Comments : {}\n".format(total_comments))

# One row per kept comment: the post number, the post's processed caption,
# the processed comment and the comment's label.
df = pd.DataFrame(
    data=all_captions_and_comments,
    columns=['post_number', 'caption', 'comment', 'label'],
)


Total Posts : 65


Total Comments : 22083



In [None]:
# Show the assembled caption/comment frame (pandas elides the middle rows of long frames).
df

Unnamed: 0,post_number,caption,comment,label
0,1,nutrition food dropped significantly past 70 y...,glad husband chose live part family farm harve...,1
1,1,nutrition food dropped significantly past 70 y...,top soil planet depleted amongst thing wish co...,1
2,1,nutrition food dropped significantly past 70 y...,imagine anybody say gmo,1
3,1,nutrition food dropped significantly past 70 y...,jeff davis hi jeff,0
4,1,nutrition food dropped significantly past 70 y...,way boomer make money price lesser quality,1
...,...,...,...,...
22078,65,translator work whether pig feeling happy sad ...,translated first word eat,1
22079,65,translator work whether pig feeling happy sad ...,animal feeling time happy mood time sorrow dem...,1
22080,65,translator work whether pig feeling happy sad ...,anyone brain heart spends time animal learn un...,1
22081,65,translator work whether pig feeling happy sad ...,marry james people care division animal domest...,1


### **Cosine Similarity**

In [None]:
def Cosine_Similarity( vector1 , vector2 ):
    """Return the cosine of the angle between two numeric vectors.

    Only the first ``len(vector1)`` positions of both vectors are read, so
    ``vector2`` must be at least as long as ``vector1``.

    Args:
        vector1: indexable sequence of numbers.
        vector2: indexable sequence of numbers.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or ``0`` when that product of norms is zero.
    """
    positions = range(len(vector1))

    dot_product = sum(vector1[k] * vector2[k] for k in positions)
    squared_norm_1 = sum(vector1[k] ** 2 for k in positions)
    squared_norm_2 = sum(vector2[k] ** 2 for k in positions)

    norm_product = squared_norm_1 ** (1/2) * squared_norm_2 ** (1/2)

    # A zero-length vector has no direction; report zero similarity.
    return dot_product / norm_product if norm_product != 0 else 0
    
def calculate_cosine_similarity(caption , comment ):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two given strings, so
    the vocabulary and weights come from this pair alone.

    Args:
        caption (str): first text.
        comment (str): second text.

    Returns:
        The value returned by ``Cosine_Similarity`` for the two dense vectors.
    """
    pair_matrix = TfidfVectorizer().fit_transform([caption, comment])
    caption_vector, comment_vector = pair_matrix.toarray()
    return Cosine_Similarity(caption_vector, comment_vector)

In [None]:
import time


def _row_similarity(row):
    """Cosine similarity between one row's caption and its comment."""
    return calculate_cosine_similarity(row['caption'], row['comment'])


# Score every caption/comment pair row by row and report the wall-clock time taken.
start = time.time()
df['cosine_simarlity'] = df.apply(_row_similarity, axis=1)
end = time.time()
print("Time for execution = {} sec".format(end - start))

Time for execution = 31.870579481124878 sec


In [None]:
# Show the frame again, now including the 'cosine_simarlity' column added above.
df

Unnamed: 0,post_number,caption,comment,label,cosine_simarlity
0,1,nutrition food dropped significantly past 70 y...,glad husband chose live part family farm harve...,1,0.055166
1,1,nutrition food dropped significantly past 70 y...,top soil planet depleted amongst thing wish co...,1,0.000000
2,1,nutrition food dropped significantly past 70 y...,imagine anybody say gmo,1,0.000000
3,1,nutrition food dropped significantly past 70 y...,jeff davis hi jeff,0,0.000000
4,1,nutrition food dropped significantly past 70 y...,way boomer make money price lesser quality,1,0.000000
...,...,...,...,...,...
22078,65,translator work whether pig feeling happy sad ...,translated first word eat,1,0.119044
22079,65,translator work whether pig feeling happy sad ...,animal feeling time happy mood time sorrow dem...,1,0.090373
22080,65,translator work whether pig feeling happy sad ...,anyone brain heart spends time animal learn un...,1,0.016698
22081,65,translator work whether pig feeling happy sad ...,marry james people care division animal domest...,1,0.000000


In [None]:
# Copy of df without the 'label' and 'post_number' columns; df itself is left unchanged.
df2 = df.drop( columns = ['label', 'post_number' ] )

In [None]:
# Render the reduced frame (caption, comment and similarity score only).
display(df2)

Unnamed: 0,caption,comment,cosine_simarlity
0,nutrition food dropped significantly past 70 y...,glad husband chose live part family farm harve...,0.055166
1,nutrition food dropped significantly past 70 y...,top soil planet depleted amongst thing wish co...,0.000000
2,nutrition food dropped significantly past 70 y...,imagine anybody say gmo,0.000000
3,nutrition food dropped significantly past 70 y...,jeff davis hi jeff,0.000000
4,nutrition food dropped significantly past 70 y...,way boomer make money price lesser quality,0.000000
...,...,...,...
22078,translator work whether pig feeling happy sad ...,translated first word eat,0.119044
22079,translator work whether pig feeling happy sad ...,animal feeling time happy mood time sorrow dem...,0.090373
22080,translator work whether pig feeling happy sad ...,anyone brain heart spends time animal learn un...,0.016698
22081,translator work whether pig feeling happy sad ...,marry james people care division animal domest...,0.000000
