Github Links
Backend: https://github.com/Rashmikoparde/Semantic-Blog-Search-Backend
Frontend: https://github.com/Rashmikoparde/Semantic-Search-UI

In [1]:
import os
import re
import codecs
import pandas as pd
import requests, zipfile, io
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import nltk
#nltk.download('stopwords')
from langdetect import detect, detect_langs

In [None]:
#The xml files are parsed and the posts from each blogs are collected and saved into csv
#Also posts in different language other than english are ignored

def process_data_long_text(folder_path):
    print('Processing XML files from the specified folder...')
    # Empty DataFrame
    df = pd.DataFrame(columns=['id','label', 'text', 'gender', 'age', 'zodiac'])

    
    for i, f in enumerate(os.listdir(folder_path)[5000:10000]):
        ds_id=f.split('.')[0].lower()
       # print(ds_id)
        ds_gender = f.split('.')[1].lower()
        ds_age = f.split('.')[2]
        ds_label = f.split('.')[3].lower()
        ds_zodiac = f.split('.')[4].lower()

        blog_file = BeautifulSoup(codecs.open(folder_path + '/' + f, encoding='utf-8', errors='ignore'), "lxml") 
        pk = ''

        
        for post in blog_file.find_all('post'):
            try:
                if detect(post.text) == 'en' : #Checks language of the post
                    pk = pk + post.text.strip()
            except Exception as e:
                    pass
                
        post_text = pk # Converts the list back to string.
        df = df.append({'id':ds_id,'label': ds_label, 'text': post_text, 'gender': ds_gender, 'age': ds_age, 'zodiac': ds_zodiac},ignore_index=True)
        if(i % 200 == 0):
           # print(i, 'completed')
            df.to_csv('blogdata_entire_data_new.csv')
    # Save DataFrame
    df.to_csv('blogdata_entire_data.csv')
    return 

if __name__ == "__main__":
    
    # Folder containing The Blog Authorship Corpus
    folder_path = 'D:\\MastersDKE\\Interviews\\blogs'

    # process_data_short_text(folder_path)
    process_data_long_text(folder_path)

In [3]:
#Importing csv file to dataframe
import pandas as pd

data=pd.read_csv('blogdata_entire_data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,label,text,gender,age,zodiac
0,0,1000331,indunk,"Well, everyone got up and going this morning. ...",female,37,leo
1,1,1000866,student,"Yeah, sorry for not writing for a whole there,...",female,17,libra
2,2,1004904,arts,"cupid,please hear my cry, cupid, please let yo...",male,23,capricorn
3,3,1005076,arts,and did i mention that i no longer have to dea...,female,25,cancer
4,4,1005545,engineering,B-Logs: The Business Blogs Paradox urlLink ...,male,25,sagittarius


In [7]:
#Dropping Duplicate ids if any
df=data
df.drop_duplicates(['id'], inplace=True)
df.shape


(1401, 7)

In [9]:
#Dropping null values from dataframe
df.dropna(inplace=True)
df.shape

(1401, 7)

In [11]:
#Drooping the column Unnamed: 0 as it is not useful
df= df.drop(["Unnamed: 0"], axis=1)
df.head()

Unnamed: 0,id,label,text,gender,age,zodiac
0,1000331,indunk,"Well, everyone got up and going this morning. ...",female,37,leo
1,1000866,student,"Yeah, sorry for not writing for a whole there,...",female,17,libra
2,1004904,arts,"cupid,please hear my cry, cupid, please let yo...",male,23,capricorn
3,1005076,arts,and did i mention that i no longer have to dea...,female,25,cancer
4,1005545,engineering,B-Logs: The Business Blogs Paradox urlLink ...,male,25,sagittarius


In [12]:
#Storing the blog post and ids to list for further training
text_list = list(df['text'])
id_list = list(df['id'])


In [13]:
from sentence_transformers import SentenceTransformer
import scipy.spatial
import pickle as pkl
import torch

#creatng a model using BERT pretrained model to embedd the text into given vectors
embedder_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [14]:
#Tried removing stopwords and special characters, but it did not help in improving similarity score.
import pickle as pkl
from spacy.lang.en import English # updated
from nltk.corpus import stopwords
import re
import nltk
nltk.download('stopwords')

corpus = text_list
prep_text_list=[]
stop_words = set(stopwords.words('english'))
i=0
for post_text in text_list:
      #  post_text =  re.sub('[^a-zA-z0-9\s]','',post_text).strip()
       # post_text = [word for word in post_text if word not in stop_words]
        #post_text = ' '.join(post_text) 
        prep_text_list.append(post_text)
        #i+=1
        #print(i)#

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
#Using above model to convert text into vector format
corpus_embeddings =  embedder_model.encode(prep_text_list,show_progress_bar=True)
print("Encoding Completed")

HBox(children=(IntProgress(value=0, description='Batches', max=44, style=ProgressStyle(description_width='init…


Encoding Completed


In [16]:
#Saving the vectors to pkl file
with open('CorpusEmbeddings2.pkl','wb') as f:
     pkl.dump(corpus_embeddings, f)
    

In [19]:
#Embedding the given query using above embedder model 
queries = ['As a first step, schools and other educational institutions are to reopen from March 8th. "The children can sit face to face with their teachers," says Johnson. The regions of Scotland and Wales , on the other hand, are relying on the gradual opening of schools, which began this week. ']
query_embeddings = embedder_model.encode(queries,show_progress_bar=True)
print(query_embeddings.shape)

# Find the closest 3 sentences of the corpus for each query sentence based on cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

closest_n = 3
print("\nTop  most similar blogs in corpus:")
for query, query_embedding in zip(queries, query_embeddings):
    

    
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings,"cosine")[0]
    
    results = zip(range(len(distances)), distances)
    results = sorted(results, key=lambda x: x[1])

    
    print("==========================Query==============================")
    print("===",query,"=====")
    print("=========================================================")

    for idx, distance in results[0:closest_n]:
        print("Score:   ", (1-distance) , "\n" )
        print("Paragraph:   ", corpus[idx].strip(), "\n" )
        row_dict = df.loc[df.index== corpus[idx]].to_dict()
        
        print("paper_id:  " , row_dict["paper_id"][corpus[idx]], "\n")
        print("Label:  " , row_dict["label"][corpus[idx]] , "\n")
        print("Gender:  " , row_dict["gender"][corpus[idx]] , "\n")
        print("Age:  " , row_dict["age"][corpus[idx]] , "\n")
        print("-------------------------------------------")
         

HBox(children=(IntProgress(value=0, description='Batches', max=1, style=ProgressStyle(description_width='initi…


(1, 768)

Top  most similar blogs in corpus:
=== As a first step, schools and other educational institutions are to reopen from March 8th. "The children can sit face to face with their teachers," says Johnson. The regions of Scotland and Wales , on the other hand, are relying on the gradual opening of schools, which began this week.  =====
Score:    0.5733158547228907 

Paragraph:    Dag nab it. Look what I found               Kindergarten classes are provided for all children of kindergarten age in the Rochester School District. Michigan Statutes stipulate that before a child can enroll in kindergarten, he or she must be five (5) years old on or before December 1.       Each spring the district conducts a Kindergarten Registration to identify and enroll youngsters who will enter kindergarten in the fall. Informational evening meetings are held in the spring for parents. Newspapers will announce the dates.       It greatly assists the district’s planning efforts if all kindergarten st

  result = op(self.values, np.asarray(other))


KeyError: False

Interpretability of the model
- We can say that the results with high similarity score are relevant and similar to the given query.
