## This script calculates the cosine similarity between a reference text and texts in Class0 and texts in Class1. <br> Class1 texts are related to the reference text, Class0 texts are not.

In [1]:
import os
# Project root directory: made the working directory here and used below to
# build the Raw_data and Documents paths.
base = '/Users/Viktoria/Desktop/NLP_DocEmbeddings'
os.chdir(base)

In [2]:
import pandas as pd
import numpy as np
import re
import time
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import docx2txt

from read_file import read_file
from clean_text import lemmatize
from clean_text import clean_text

pd.options.display.max_rows = 8000

In [3]:
#pip install -r requirements.txt

In [4]:
#pip install sentence-transformers

In [5]:
#pip install git+https://github.com/jbarlow83/OCRmyPDF.git

## Step 1. Access the documents

In [6]:
# Collect the file names found in each class folder

class1_dir = os.path.join(base, 'Raw_data/Class1')
class0_dir = os.path.join(base, 'Raw_data/Class0')

# os.listdir already returns a fresh list of names
class1 = os.listdir(class1_dir)
class0 = os.listdir(class0_dir)

print('Relevant documents: ', len(class1), '\nIrrelevant documents: ', len(class0))

Relevant documents:  22 
Irrelevant documents:  20


In [7]:
# Build the document index: one row per file, tagged with its class label
# (1 for the Class1 folder, 0 for the Class0 folder) and the folder it lives in.
# The frame is created from a list of records in one go; growing it cell by
# cell with df.loc forces pandas to enlarge the frame on every new row.
records = []
for label, folder, docs in ((1, 'Raw_data/Class1', class1),
                            (0, 'Raw_data/Class0', class0)):
    directory = os.path.join(base, folder)
    records.extend(
        {'Document': doc, 'Class': label, 'Directory': directory}
        for doc in docs
    )

# Passing `columns` keeps the three columns even when both folders are empty
df = pd.DataFrame(records, columns=['Document', 'Class', 'Directory'])

In [8]:
# Read every document into df['Text'] with a tqdm progress bar, then print
# the elapsed wall-clock time in seconds.

def _load_row(record):
    # read_file receives the file name and the folder that contains it
    return read_file(record['Document'], record['Directory'])

tqdm.pandas()

start = time.time()
df['Text'] = df.progress_apply(_load_row, axis=1)
print(time.time() - start)

 90%|█████████ | 38/42 [02:30<00:11,  2.85s/it]

psr_mr18_1_3_pass-through_analysis_consultation_february_2019.pdf


100%|██████████| 42/42 [02:42<00:00,  3.87s/it]

162.3767900466919





## Step 2. Text cleaning and lemmatization

In [9]:
# Corpus-specific stop words (e.g. 'article', 'paragraph') that should be screened out

os.chdir(os.path.join(base, 'Documents'))

useless_raw = docx2txt.process("useless_words.docx")
useless_tokens = re.findall(r'\w+', useless_raw)
# Lemmatize first, then lower-case the lemmas
useless = [lemma.lower() for lemma in lemmatize(useless_tokens)]

In [11]:
# Non-English words get discarded; important non-English words listed here are kept.

informative_raw = docx2txt.process("informative_words.docx")
informative = [word.lower() for word in re.findall(r'\w+', informative_raw)]

In [None]:
# Clean every document in one pass: each entry of df['Text'] is handed to
# clean_text together with the `useless` and `informative` word lists,
# with a progress bar.

df['Cleaned_Text'] = df['Text'].progress_apply(clean_text, args=(useless,informative))

 29%|██▊       | 12/42 [00:17<00:59,  2.00s/it]

In [None]:
df.head()

### Create a dataframe where each row is a document. Add cosine similarity score between document & set of keywords.

In [None]:
# Import the key phrases (one per line of the text file).
# NOTE(review): this cell used os.chdir(main), but `main` is never defined in
# the notebook and raised NameError. `base` is the only project root defined
# above -- confirm that Stablecoin_keyphrases.txt actually lives there.
os.chdir(base)

# Plain read mode: 'r+' additionally requests write access, which is not
# needed here and fails on read-only files.
with open('Stablecoin_keyphrases.txt', 'r') as f:
    keyphrases = f.readlines()

keyphrases

In [None]:
import re

# Remove the newline characters from every key phrase
_newline = re.compile('\n')
keyphrases = [_newline.sub('', phrase) for phrase in keyphrases]

In [None]:
#how many words do we have?
keyphrases

In [None]:
# Collapse each document's cleaned items into one space-separated string
df['Join_Text'] = df['Cleaned_Text'].map(' '.join)

# Show the first document as a sanity check
df['Join_Text'][0]

In [None]:
# Score each document against the key phrases: the mean cosine similarity
# between the document embedding and every key-phrase embedding is stored in
# df['Keyphrase_Similarity'].

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# The key-phrase embeddings do not depend on the document, so encode them
# once up front instead of once per document.
keyphrase_embeddings = model.encode(list(keyphrases))

for r in df.index:
    doc_embedding = model.encode([df.loc[r, 'Join_Text']])
    # One row of similarities: this document against every key phrase
    phrase_sims = cosine_similarity(doc_embedding, keyphrase_embeddings)
    # np.mean replaces a bare mean() call that was never imported or defined
    df.loc[r, 'Keyphrase_Similarity'] = float(np.mean(phrase_sims))

In [None]:
# Remove the row whose index label is 22.
# NOTE(review): hard-coded label; why this document is excluded is not
# recorded here -- confirm it still points at the intended row.
df = df.drop(22)

In [None]:
# Get the mean key-phrase similarity in each class.
# The scores are stored in 'Keyphrase_Similarity'; the frame has no
# 'Similarity' column, so grouping on that name raised KeyError.

sims = df.groupby('Class')['Keyphrase_Similarity'].mean()

In [None]:
sims

In [None]:
#Conclusion: The differences are minuscule...

### Compare to reference text

In [None]:
# NOTE(review): this cell used os.chdir(main), but `main` is never defined in
# the notebook and raised NameError. `base` is the only project root defined
# above -- confirm that the reference text file actually lives there.
os.chdir(base)

#Add any txt file with reference text
# Plain read mode: 'r+' additionally requests write access, which is not
# needed here and fails on read-only files.
with open('JMLSG Section 22.txt', 'r') as f:
    reftext = f.readlines()

In [None]:
#Final text cleaning. This creates a single, cleaned string out of each item in the collection of texts

def prepare_extract(text):
    """Clean every item of *text* and merge the results into one string.

    Each item is passed to clean_text; the pieces it returns are joined with
    single spaces, and the per-item strings are then joined with single
    spaces as well.
    """
    # NOTE(review): where the documents are cleaned, clean_text also receives
    # the `useless` and `informative` word lists; here it gets the text only.
    # Confirm that the reference text is meant to be cleaned this way.
    cleaned_items = []
    for item in text:
        cleaned_items.append(' '.join(clean_text(item)))
    return ' '.join(cleaned_items)

In [None]:
# Replace the list of raw lines with one cleaned string
reftext = prepare_extract(reftext)

In [None]:
reftext

In [None]:
# Embed the reference text and every document, then store each document's
# cosine similarity to the reference text in df['Reftext_Similarity'].

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# One space-joined string per document (whole column at once instead of a
# row-by-row loop)
df['Join_Text'] = [' '.join(tokens) for tokens in df['Cleaned_Text']]

candidate_embeddings = model.encode([reftext])

# One embedding vector per document, in row order; reused later to build the
# `embeds` table.
all_embeddings = []

for r in df.index:
    doc_embedding = model.encode([df.loc[r, 'Join_Text']])
    all_embeddings.extend(doc_embedding)
    # cosine_similarity returns a 1x1 matrix here. Pick the single entry
    # explicitly: calling float() on a 2-D array is deprecated since NumPy 1.25.
    similarity = cosine_similarity(doc_embedding, candidate_embeddings)[0, 0]
    df.loc[r, 'Reftext_Similarity'] = float(similarity)

In [None]:
# Mean similarity to the reference text within each class
sims = df.groupby('Class')['Reftext_Similarity'].mean()

In [None]:
sims

### Check the statistical probability of the occurrence of keywords in the non-stablecoin texts

In [None]:
# Pool the joined text of the first ten rows and of the next ten rows, and
# split each pool into words.
# NOTE(review): the slices are hard-coded; confirm they match the intended
# stablecoin / non-stablecoin documents (22 Class1 files were listed above).
stablecoin = ' '.join(df.Join_Text[0:10])
words1 = stablecoin.split()
other = ' '.join(df.Join_Text[10:20])
words2 = other.split()

In [None]:
words1.count('stablecoin')/len(words1)*100

In [None]:
words2.count('stablecoin')/len(words2)*100

### Calculate the number of times the key phrases occurred in the text

In [None]:
# Count how often each 2- and 3-word phrase occurs in every document.

# CountVectorizer is not imported anywhere else in the notebook
from sklearn.feature_extraction.text import CountVectorizer

relevants = df
# One space-joined string per document, in the same row order as df
cleaned_data = [' '.join(tokens) for tokens in relevants['Cleaned_Text']]

cv = CountVectorizer(ngram_range=(1,3))
cleaned_data_transformed = cv.fit_transform(cleaned_data)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. Reusing df's index keeps `data` row-aligned with df even
# though a row has been dropped from df, so later label-based assignments land
# on the right documents.
data = pd.DataFrame(cleaned_data_transformed.toarray(),
                    columns=cv.get_feature_names_out(),
                    index=relevants.index)

# Keep multi-word phrases only: drop every single-word column
data = data.drop(columns=[col for col in data.columns if len(col.split()) == 1])



In [None]:
data.head()

In [None]:
#import keywords
# NOTE(review): this cell used os.chdir(main), but `main` is never defined in
# the notebook and raised NameError. `base` is the only project root defined
# above -- confirm that Stablecoin_keyphrases.txt actually lives there.
os.chdir(base)

# Plain read mode: 'r+' additionally requests write access, which is not
# needed here and fails on read-only files.
with open('Stablecoin_keyphrases.txt', 'r') as f:
    keyphrases = f.readlines()

In [None]:
# Remove the newline characters from every key phrase
keyphrases = [phrase.replace('\n', '') for phrase in keyphrases]

In [None]:
keyphrases

In [None]:
#delete all the irrelevant collocations: keep only the columns whose n-gram
#is one of the key phrases

# A set gives constant-time membership tests; `data` has one column per
# n-gram, so scanning the keyphrase list once per column is needlessly slow.
wanted_phrases = set(keyphrases)
data = data.drop(columns=[col for col in data.columns if col not in wanted_phrases])

In [None]:
data.head()

In [None]:
# Total number of key-phrase occurrences per document.
# Assign by position, not by index label: `data` has one row per row of df in
# the same order, but its index is not guaranteed to carry df's labels (a row
# has been dropped from df), and a label-aligned assignment would then shift
# counts onto the wrong documents and leave NaN behind.
df['Phrases'] = data.sum(axis=1).to_numpy()

In [None]:
def human_readable(text):
    """Return the runs of letters, digits, '/', '.' and ',' found in *text*.

    Args:
        text: a str, or UTF-8 encoded bytes (decoded before matching).

    Returns:
        A list of the matched, non-empty substrings in order of appearance.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # '+' rather than '*': a pattern that can match the empty string makes
    # findall emit '' at every non-matching position, which turns into runs of
    # extra spaces once the caller joins the pieces.
    return re.findall(r'[A-Za-z0-9/.,]+', text)

In [None]:
# Store the readable text under 'Human_Text': the 'Text' column is dropped
# further down and 'Human_Text' is then renamed to 'Text', so writing straight
# into 'Text' here would leave the final frame without any text column.
df['Human_Text'] = [' '.join(human_readable(t)) for t in df.Join_Text]

In [None]:
df.Text[0]

In [None]:
def _phrases_per_hundred_items(row):
    # Key-phrase count relative to the length of the cleaned text, as a percentage
    return row.Phrases / len(row.Cleaned_Text) * 100

df['Phrases_Normalised'] = df.apply(_phrases_per_hundred_items, axis=1)

In [None]:
df.head()

In [None]:
# Remove the 'Text', 'Class' and 'Cleaned_Text' columns before the export
df = df.drop(['Text', 'Class', 'Cleaned_Text'], axis=1)

In [None]:
# Expose the readable text under the column name 'Text'
df = df.rename(columns={'Human_Text': 'Text'})

In [None]:
df.head()

In [None]:
# Keep only the four columns that go into the CSV, in this order
df = df[['Document', 'Text', 'Phrases', 'Phrases_Normalised']]
df.head()

In [None]:
# Write the result table to Stablecoin.csv in the `stable_coin` directory.
# NOTE(review): `stable_coin` is not defined anywhere in the notebook as
# shown -- this cell raises NameError unless it is set elsewhere.
os.chdir(stable_coin)

df.to_csv('Stablecoin.csv')

### Visualise document embeddings

In [None]:
# Document names, used as row labels for the embedding table
indexes = df.Document

In [None]:
all_embeddings[0].shape

In [None]:
# One row per collected document embedding, labelled with `indexes`
embeds = pd.DataFrame(data=all_embeddings, index=indexes)

In [None]:
embeds.head()

In [None]:
#PCA for dimensionality reduction

#Computing the correlation matrix between the embedding dimensions
X_corr=embeds.corr()

#Computing eigen values and eigen vectors.
#A correlation matrix is symmetric, so eigh is the right solver: it always
#returns real eigenvalues/eigenvectors, whereas the general eig can return
#complex values with spurious imaginary parts (e.g. for a rank-deficient matrix).
values,vectors=np.linalg.eigh(X_corr)

#Sorting the eigen values and their corresponding eigen vectors in descending order
args = (-values).argsort()
#Reorder the eigenvalues themselves (this used to be vectors[args], which
#overwrote them with rows of the eigenvector matrix)
values = values[args]
vectors = vectors[:, args]

#Taking first 2 components which explain maximum variance for projecting
new_vectors=vectors[:,:2]

#Projecting it onto new dimension with 2 axes
neww_X=np.dot(embeds,new_vectors)

In [None]:
neww_X

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the 2-D projection `neww_X`: one point per row label of
# `embeds`, coloured by the group the label is found in, saved as a PDF.
plt.figure(figsize=(16,9))
#plt.scatter(neww_X[:,0],neww_X[:,1],linewidths=10,color='blue')
plt.xlabel("PC1",size=10)
plt.ylabel("PC2",size=10)
plt.rcParams.update({'font.size': 7})
plt.title("Word Embedding Space",size=20)
#plt.tight_layout()

# The row labels of `embeds`; the i-th label corresponds to row i of neww_X
vocab=list(embeds.index)
# NOTE(review): `stablecoin` was last assigned a single joined text string, so
# `word in stablecoin` is a substring test, not a list-membership test.
# `aml`, `crypto` and `payments` are not defined anywhere in the notebook as
# shown -- presumably collections of document names; confirm where they come from.
# Labels that match none of the four groups are not plotted.
for i, word in enumerate(vocab):
    if word in stablecoin:
        #plt.annotate(word,xy=(neww_X[i,0],neww_X[i,1]), color='red')
        plt.scatter(neww_X[i,0],neww_X[i,1],linewidths=10,color='red')
    elif word in aml:
        #plt.annotate(word,xy=(neww_X[i,0],neww_X[i,1]), color='blue')
        plt.scatter(neww_X[i,0],neww_X[i,1],linewidths=10,color='blue')
    elif word in crypto:
        #plt.annotate(word,xy=(neww_X[i,0],neww_X[i,1]), color='orange')
        plt.scatter(neww_X[i,0],neww_X[i,1],linewidths=10,color='orange')
    elif word in payments:
        #plt.annotate(word,xy=(neww_X[i,0],neww_X[i,1]), color='black')
        plt.scatter(neww_X[i,0],neww_X[i,1],linewidths=10,color='black')
        
# NOTE(review): `main` is not defined anywhere in the notebook as shown; this
# chdir raises NameError unless it is set elsewhere.
os.chdir(main)
plt.savefig('Stablecoin embedding space2.pdf')