## This script calculates and plots the frequencies of 2- and 3-word collocations in text. <br> Viktoria, April 2021

In [1]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import docx2txt
import re
import pdfplumber
import textract
import ocrmypdf
import pluggy
from tqdm import tqdm
import time
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from statistics import mean
from statistics import stdev
from decimal import *
import requests
from bs4 import BeautifulSoup
import textract
import urllib.request, urllib.error, urllib.parse
import ocrmypdf
import pluggy
import pdfplumber
from tqdm import tqdm
from io import BytesIO
import pdfkit
import seaborn as sns
import textwrap
import random

from read_pdf import read_pdf 

pd.options.display.max_rows = 8000
pd.options.mode.chained_assignment = None

In [2]:
# Root folder of the project; the input documents and the generated results
# live in its 'Documents' and 'Results' sub-folders.
base = '/Users/Viktoria/Desktop/PlotCollocations'
documents = os.path.join(base, 'Documents')
results = os.path.join(base, 'Results')
# NOTE(review): later cells call os.chdir() on `models`, `coin`, `data_repo`
# and use `scraper`, none of which is assigned in the visible notebook —
# define them here (or confirm where they come from) before running those cells.

In [3]:
#Go to the base directory to access the requirements file

os.chdir(base)

In [4]:
#pip install -r requirements.txt

In [5]:
#pip install git+https://github.com/jbarlow83/OCRmyPDF.git

In [6]:
# List the file names stored in each class folder

os.chdir(documents)

Class1 = os.path.join(base, 'Documents/Class_1')
Class2 = os.path.join(base, 'Documents/Class_2')
Class3 = os.path.join(base, 'Documents/Class_3')

# os.listdir already returns a fresh list of entry names
Docs1 = os.listdir(Class1)
Docs2 = os.listdir(Class2)
Docs3 = os.listdir(Class3)

# Report how many entries each class folder contains
print('Class1: ', len(Docs1), '\nClass2: ', len(Docs2), '\nClass3: ', len(Docs3))

Class1:  10 
Class2:  11 
Class3:  12


In [7]:
# Artificially balance the classes: take the same number at random
# Or : tweak the ratio according to the ratio expected in the population

# Label every file name with the id of the class folder it came from.
D1 = {name: 1 for name in Docs1}
#D1 = random.choices(list(D1.items()), k = 10)

D2 = {name: 2 for name in Docs2}
#D2 = random.choices(list(D2.items()), k = 10)

D3 = {name: 3 for name in Docs3}
#D3 = random.choices(list(D3.items()), k = 10)

# Merge into one mapping of document name -> class. A name that occurs in
# more than one class folder keeps the class of the last folder merged.
data = {}
for labelled in (D1, D2, D3):
    data.update(labelled)

In [8]:
#loop through them all at once. 200 docs takes ~10mins

# One row per document: file name, extracted text (assigned in a later cell)
# and class id.
df = pd.DataFrame(columns=['Document', 'Text', 'Class'])

# `data` maps document name -> class id, so its keys and values line up row by row.
df['Document'] = data.keys()
df['Class'] = data.values()   

def get_directory(class_id):
    """Return the folder that holds the documents of the given class.

    class_id 1, 2 and 3 map to Class1, Class2 and Class3 respectively.

    Raises:
        ValueError: for any other class id. Previously this case fell through
            every branch and surfaced as an UnboundLocalError on `return`.
    """
    if class_id == 1:
        return Class1
    if class_id == 2:
        return Class2
    if class_id == 3:
        return Class3

    raise ValueError(f'Unknown class id: {class_id!r}')

df['Dir'] = df['Class'].apply(get_directory)

df.head()

Unnamed: 0,Document,Text,Class,Dir
0,European Parliament Regulation on Crypto-asset...,,1,/Users/Viktoria/Desktop/PlotCollocations/Docum...
1,FATF Report to G20 on so-called stable coins.pdf,,1,/Users/Viktoria/Desktop/PlotCollocations/Docum...
2,FSB Global Stablecoins.pdf,,1,/Users/Viktoria/Desktop/PlotCollocations/Docum...
3,G7 working group on stable coins.pdf,,1,/Users/Viktoria/Desktop/PlotCollocations/Docum...
4,JMLSG-Guidance.pdf,,1,/Users/Viktoria/Desktop/PlotCollocations/Docum...


In [None]:
# Extract the text of every document (file name + folder) and time the run.
start = time.time()

df['Text'] = list(map(read_pdf, tqdm(df.Document), df.Dir))

end = time.time()
print(end - start)

  3%|▎         | 1/33 [00:17<09:34, 17.95s/it]

processed pdf


  6%|▌         | 2/33 [00:22<05:03,  9.80s/it]

processed pdf


  9%|▉         | 3/33 [00:26<03:45,  7.50s/it]

processed pdf


 12%|█▏        | 4/33 [00:31<03:04,  6.37s/it]

processed pdf


 15%|█▌        | 5/33 [01:12<08:45, 18.77s/it]

processed pdf


 18%|█▊        | 6/33 [01:29<08:09, 18.12s/it]

processed pdf


 21%|██        | 7/33 [01:34<05:59, 13.81s/it]

processed pdf


 24%|██▍       | 8/33 [01:37<04:26, 10.65s/it]

processed pdf


 27%|██▋       | 9/33 [01:42<03:32,  8.87s/it]

processed pdf


In [None]:
#Delete the silly characters from the beginning of the title until the first capital letter

def replace_silly(string):
    """Strip the leading run of characters that are not A-Z, '.' or '-'.

    A string that already starts with one of those characters is returned
    unchanged; a string made up only of other characters becomes empty.
    """
    leading_junk = re.compile(r'^[^A-Z.-]+\s*')
    return leading_junk.sub('', string)

In [None]:
df['Document']=df['Document'].apply(replace_silly)

### Text cleaning and lemmatization

In [None]:
def lemmatize(content_as_words):
    """Return the WordNet lemma of every word in content_as_words, in order.

    Each word is POS-tagged on its own and the first letter of its tag picks
    the WordNet category passed to the lemmatizer; any other tag falls back
    to noun.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    # First letter of the POS tag -> WordNet part-of-speech constant
    pos_by_initial = {"J": wordnet.ADJ,
                      "N": wordnet.NOUN,
                      "V": wordnet.VERB,
                      "R": wordnet.ADV}

    def get_wordnet_pos(word):
        #Map POS tag to first character lemmatize() accepts
        initial = nltk.pos_tag([word])[0][1][0].upper()
        return pos_by_initial.get(initial, wordnet.NOUN)

    return [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word))
            for word in content_as_words]

In [None]:
_EN_WORDS = None  # NLTK English vocabulary, built once on first use


def clean_text(content):
    """Reduce a document to its informative, lemmatised English words.

    Steps: decode bytes as UTF-8, keep alphabetic tokens only, lower-case and
    lemmatise them, keep tokens that are in the NLTK English word list or in
    the module-level `informative` list, then drop stop words, the
    corpus-specific `useless` words and words shorter than 3 letters.

    The lemmas are the tokens that get filtered. The previous version
    computed them and then filtered the raw tokens, so the (lemmatised)
    `useless` list was compared against unlemmatised text.

    Args:
        content: the document text as str or UTF-8 encoded bytes.

    Returns:
        A list of words, or the empty string '' when the text has no words or
        fewer than 40% of its words are recognised (treated as non-English).
    """
    global _EN_WORDS

    #initial text cleaning
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    tokens = [t.lower() for t in re.findall(r'[a-zA-Z]+', content)]

    #lemmatize and continue with the lemmas
    tokens = lemmatize(tokens)

    # Building the vocabulary set is expensive, so do it only once
    if _EN_WORDS is None:
        _EN_WORDS = set(nltk.corpus.words.words())

    # Sets give O(1) membership tests inside the filters below
    keep = set(informative)
    drop = {s for s in stopwords.words('english') if s not in keep}
    drop.update(useless)

    #get rid of non-English words
    num_words = len(tokens)
    english = [t for t in tokens if t in _EN_WORDS or t in keep]

    #get rid of the whole thing if not in English (ratio of en_words is < 40%)
    if num_words == 0 or len(english) / num_words < 0.4:
        return ''

    #remove stop words, corpus-specific stop words and short words
    return [t for t in english if t not in drop and len(t) >= 3]

In [None]:
#Corpus-specific stop words we want to screen out, i.e. 'article', 'paragraph', etc
os.chdir(models)

result = docx2txt.process("useless_words.docx")
# Split into words, lemmatise them, then lower-case the lemmas
useless = [lemma.lower() for lemma in lemmatize(re.findall(r'\w+', result))]

In [None]:
# Non-English words are discarded during cleaning; words on this list are kept anyway.
os.chdir(models)

result = docx2txt.process("informative_words.docx")
# Split into words and lower-case each one
informative = list(map(str.lower, re.findall(r'\w+', result)))

In [None]:
#clean every document text in one pass, with a progress bar

df['Cleaned_Text'] = list(map(clean_text, tqdm(df.Text)))

In [None]:
df.head()

In [None]:
#check if cleaned_text is a neatly ordered list of strings (rather than one long string)
import ast

if isinstance(df.Cleaned_Text.iloc[0], str):
    # literal_eval only accepts Python literals, so a crafted cell cannot run
    # arbitrary code the way eval() could. Empty strings (documents that were
    # rejected during cleaning) and values that are already lists are kept.
    df['Cleaned_Text'] = df['Cleaned_Text'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) and cell else cell
    )

### Calculate the number and proportion of keyphrases in each text.

In [None]:
#import keywords
os.chdir(coin)

# Read-only mode is enough ('r+' also required write permission on the file).
with open('stablecoin_keyphrases.txt', 'r') as f:
    keyphrases = [line.strip().lower() for line in f]

# Drop blank lines: an empty phrase makes str.count() return len(text) + 1,
# which would inflate every keyphrase total computed below.
keyphrases = [k for k in keyphrases if k]


In [None]:
#how many words do we have?
len(keyphrases)

In [None]:


# Work in the `coin` folder so the image below is written there.
os.chdir(coin)

word_ls = keyphrases
# Counter maps each distinct keyphrase to how often it appears in the list;
# those counts are the frequencies the word cloud is generated from.
word_could_dict = Counter(word_ls)
wordcloud = WordCloud(background_color="white").generate_from_frequencies(word_could_dict)
wordcloud.to_file("keyphrases.png")

# Show the same cloud inline, without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Decode byte strings to text. Values that are not bytes are kept as they are:
# filtering them out (as before) produced a list shorter than the frame and
# made the assignment fail.
df['Text'] = [text.decode("utf-8") if isinstance(text, bytes) else text for text in df.Text]

In [None]:
# Count (in `a`) and report the rows whose text contains 'stablecoin'
a = 0
for r in df.index:
    if 'stablecoin' not in df.loc[r, 'Text']:
        continue
    print('found', r)
    a += 1
        

In [None]:
df.loc[7, 'Document']

In [None]:
df.loc[8, 'Document']

### Calculate the number of times the key phrases occurred in the text

In [None]:
for r in df.index:

    # Substring occurrences of every keyphrase in the lower-cased raw text
    text = df.loc[r, 'Text'].lower()
    num = sum(text.count(k) for k in keyphrases)

    df.loc[r, 'Num_Phrases'] = num

    # Occurrences per 100 cleaned words. When cleaning left no words the ratio
    # is undefined, so store NaN instead of raising ZeroDivisionError.
    n_words = len(df.loc[r, 'Cleaned_Text'])
    df.loc[r, 'Prop_Phrases'] = num / n_words * 100 if n_words else np.nan
        

In [None]:
# Strip the file extension only. The old pattern r'.pdf' matched any character
# followed by "pdf" anywhere in the name, so it could eat parts of the title.
df['Document'] = [re.sub(r'\.pdf$', '', d, flags=re.IGNORECASE) for d in df.Document]

# Bar colour per class id; a class without an entry gets NaN.
class_colours = {1: 'red', 2: 'blue', 3: 'yellow', 4: 'green'}
df['Colour'] = df['Class'].map(class_colours)

### Plot the proportion of keyphrases in each document as a bar chart

In [None]:
df.loc[14, 'Document'] = 'Consultation on cooperation and information exchange for AML CFT supervisory purposes'

In [None]:
df.loc[15, 'Document'] = 'Consultation on the implementation of group wide AML-CFT policies in third countries'

In [None]:


# Save the figure into the `coin` folder.
os.chdir(coin)

plt.figure(figsize=(20,10))

#plt.bar(range(len(docs_phrases)), list(docs_phrases.values()), align='center', width=0.5, color=colours)
#plt.xticks(range(len(docs_phrases)), list(docs_phrases.keys()), rotation="vertical")

# One bar per document, coloured by the document's class (df.Colour).
plt.bar(df.Document, df.Prop_Phrases, align='center', width=0.5, color=df.Colour)
plt.xticks(range(len(df)), df.Document.to_list(), rotation=90)
# Fixed y ticks at 0, 2, ..., 20
plt.yticks(range(0,22,2))

plt.title('Proportion of stablecoin-related keyphrases', fontsize = 20)

# Legend entries are built by hand because the bars are coloured individually.
red = mpatches.Patch(color='red', label='Stablecoin')
blue = mpatches.Patch(color='blue', label='AML')
yellow = mpatches.Patch(color='yellow', label='Crypto')
green = mpatches.Patch(color='green', label='Payments')

plt.legend(loc=7, fontsize = 'large', handles=[red, blue, yellow, green])

plt.tight_layout() #to make sure the text is legible 

plt.savefig('Proportion of stablecoin-related keyphrases OLD.pdf')

### Now do some stats

In [None]:
#Import all the labelled data (from newsletters)
os.chdir(data_repo)

all_data = pd.read_csv('Classifiers_TrainingData.csv')
# Keep rows whose Cleaned_Text has a length of at least 50.
# NOTE(review): straight after read_csv the column presumably holds strings,
# so this measures characters rather than words — confirm that is intended.
# The filtered frame keeps its original index labels (no reset here).
all_data = all_data.loc[all_data['Cleaned_Text'].str.len()>=50]

In [None]:
#make sure cleaned_text is an orderly list of strings
import ast

# Look at the first row by position: rows may have been filtered out without
# resetting the index, so the label 0 is not guaranteed to exist.
if isinstance(all_data['Cleaned_Text'].iloc[0], str):
    # literal_eval parses list literals without executing arbitrary code
    all_data['Cleaned_Text'] = all_data['Cleaned_Text'].apply(ast.literal_eval)

In [None]:
# Keep only the columns used below. The axis has to be given by keyword:
# passing it positionally was deprecated in pandas 1.3 and removed in 2.0.
wanted_columns = ['Document', 'Text', 'Class', 'Topic', 'Cleaned_Text']
all_data.drop(columns=all_data.columns.difference(wanted_columns), inplace=True)
all_data = all_data.reset_index(drop=True)
all_data.head()

In [None]:
relevants = all_data[all_data.Class==1]

In [None]:
relevants=relevants.reset_index(drop=True)
relevants.head()

In [None]:
# Strip the file extension only. The old pattern r'.pdf' matched any character
# followed by "pdf" anywhere in the name.
relevants['Document'] = [re.sub(r'\.pdf$', '', d, flags=re.IGNORECASE) for d in relevants.Document]

# Bar colour per topic; a topic without an entry gets NaN.
topic_colours = {'stablecoin': 'red',
                 'Anti_money': 'blue',
                 'Crypto': 'yellow',
                 'Payments': 'green'}
relevants['Colour'] = relevants['Topic'].map(topic_colours)

In [None]:
len(relevants)

In [None]:
#df.Document.isin(relevants.Document).astype(int)

In [None]:
for r in relevants.index:

    # Substring occurrences of every keyphrase in the lower-cased raw text
    text = relevants.loc[r, 'Text'].lower()
    num = sum(text.count(k) for k in keyphrases)

    relevants.loc[r, 'Num_Phrases'] = num

    # Occurrences per 100 cleaned words. When there are no cleaned words the
    # ratio is undefined, so store NaN instead of raising ZeroDivisionError.
    n_words = len(relevants.loc[r, 'Cleaned_Text'])
    relevants.loc[r, 'Prop_Phrases'] = num / n_words * 100 if n_words else np.nan
        

In [None]:
relevants = relevants.reset_index(drop=True)
relevants.head()

In [None]:
stablecoin = df.groupby('Class')['Prop_Phrases'].agg(['mean', 'std'])
stablecoin

In [None]:
stablecoin.iloc[0,0]

In [None]:
stablecoin.iloc[0,1]

In [None]:


# Display label and bar colour of each newsletter topic, in plotting order
summary_labels = {'Anti_money': 'AML', 'Crypto': 'Crypto', 'Payments': 'Payments'}
summary_colours = {'Anti_money': 'blue', 'Crypto': 'yellow', 'Payments': 'green'}

stats = relevants.groupby('Topic')['Prop_Phrases'].agg(['mean', 'std'])
# Pick the rows by topic name. The previous version attached a hard-coded list
# of three labels to whatever groups groupby returned, which mislabels the
# rows or fails with a length mismatch if the set of topics differs.
stats = stats.reindex(list(summary_labels))

# The stablecoin row (first class of the document set) goes first.
summarise = pd.DataFrame({
    'Means': [stablecoin.iloc[0, 0]] + stats['mean'].tolist(),
    'SD': [stablecoin.iloc[0, 1]] + stats['std'].tolist(),
    'Topic': ['Stablecoin'] + list(summary_labels.values()),
    'Colour': ['red'] + list(summary_colours.values()),
})

summarise

In [None]:

os.chdir(coin)

plt.figure(figsize=(12,8))

# Truncate (ROUND_DOWN) every mean to three decimals. The column is assigned
# in one go: the former element-wise `summarise.Means[s] = ...` is chained
# assignment, which pandas does not guarantee to write back to the frame.
summarise['Means'] = [
    float(Decimal(str(m)).quantize(Decimal('.001'), rounding=ROUND_DOWN))
    for m in summarise.Means
]

plt.bar(summarise.Topic, summarise.Means, align='center', width=0.5, color=summarise.Colour, yerr=summarise.SD)
plt.xticks(range(len(summarise)), summarise.Topic.to_list(), rotation="vertical")

plt.title('Average proportion of keyphrases per topic', fontsize = 20)

# Legend entries are built by hand because the bars are coloured individually.
red = mpatches.Patch(color='red', label='Stablecoin')
blue = mpatches.Patch(color='blue', label='AML')
yellow = mpatches.Patch(color='yellow', label='Crypto')
green = mpatches.Patch(color='green', label='Payments')

plt.legend(loc=7, fontsize = 'large', handles=[red, blue, yellow, green])

def addlabels(x, y):
    """Print each value of y at the top of its bar (bar i sits at x position i).

    x is kept for call compatibility and is not used.
    """
    for i, value in enumerate(y):
        plt.text(i, value, value)

addlabels(summarise.Topic, summarise.Means)


plt.tight_layout() #to make sure the text is legible 

plt.savefig('Average proportion of keyphrases per topic.pdf')


### Is 2 a good threshold?

In [None]:
relevants.head()

In [None]:
len(relevants[relevants.Topic=='Payments'])

In [None]:
len(relevants[(relevants.Topic=='Payments') & (relevants.Prop_Phrases>2)])

In [None]:
len(relevants[relevants.Topic=='Anti_money'])

In [None]:
len(relevants[(relevants.Topic=='Anti_money') & (relevants.Prop_Phrases>2)])

In [None]:
len(relevants[relevants.Topic=='Crypto'])

In [None]:
len(relevants[(relevants.Topic=='Crypto') & (relevants.Prop_Phrases>2)])

### Download documents with > 2% keyphrases

In [None]:
### Import the new documents
import ast

os.chdir(models)

data = pd.read_csv('Classifiers_TestData.csv')
# Keep only the columns used below. The axis has to be given by keyword:
# passing it positionally was deprecated in pandas 1.3 and removed in 2.0.
wanted_columns = ['Source', 'Document', 'URL', 'Text', 'Cleaned_Text']
data.drop(columns=data.columns.difference(wanted_columns), inplace=True)

#make sure cleaned_text is an orderly list of strings
if isinstance(data['Cleaned_Text'].iloc[0], str):
    # literal_eval parses list literals without executing arbitrary code
    data['Cleaned_Text'] = data['Cleaned_Text'].apply(ast.literal_eval)

data.head()

In [None]:
len(data)

In [None]:
#Calculate the proportion of keyphrases

for r in data.index:

    # Substring occurrences of every keyphrase in the lower-cased raw text
    text = data.loc[r, 'Text'].lower()
    num = sum(text.count(k) for k in keyphrases)

    data.loc[r, 'Num_Phrases'] = num

    # Occurrences per 100 cleaned words. When there are no cleaned words the
    # ratio is undefined, so store NaN instead of raising ZeroDivisionError
    # (NaN also fails the `> 2` candidate filter applied afterwards).
    n_words = len(data.loc[r, 'Cleaned_Text'])
    data.loc[r, 'Prop_Phrases'] = num / n_words * 100 if n_words else np.nan
        

In [None]:
len(data)

In [None]:
candidates = data[data.Prop_Phrases>2]
candidates = candidates.reset_index(drop=True)

In [None]:
len(candidates)

In [None]:
#Create the table that will be saved as the output

def human_readable(text):
    """Return a printable excerpt (at most 1000 characters) of a document text.

    Bytes are decoded as UTF-8, the leading run of characters that are not
    A-Z, '.' or '-' is removed, stray "n" tokens are dropped, and every
    character other than letters, digits, '.' and '*' becomes a space.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8")

    #remove the silly characters from the beginning of the text
    text = re.sub(r'^[^A-Z.-]+\s*', '', text)

    #remove the ns from the newlines
    # Only the "n" and the whitespace before it are removed; the whitespace
    # after it stays as the separator. The former pattern replaced " n " with
    # nothing, gluing the neighbouring words together, and needed a second
    # pass to catch consecutive ns.
    text = re.sub(r'\sn(?=\s)', '', text)

    #only keep meaningful characters
    text = re.sub(r'[^a-zA-Z0-9.*]', ' ', text)

    return text[0:1000]

In [None]:
candidates.head()

In [None]:
os.chdir(results)

#Drop the machine processed text 
output = candidates.drop(['Cleaned_Text', 'Text'], axis=1)

#Add human-readable text
output['Text'] = [human_readable(t) for t in candidates.Text]
# Remove only a trailing ".csv". The old pattern '.csv' matched any character
# followed by "csv" anywhere in the source name.
output['Source'] = [re.sub(r'\.csv$', '', sources) for sources in output.Source]

In [None]:
output.to_csv('Stablecoin candidates.csv')

### Now access the URLs of the relevant documents and save them in a folder 

In [None]:
def process_pdf(title_words, response, soup, download):
    """Save a downloaded PDF to disk and return its extracted text.

    The response body is written to 'myfile.pdf' in the current directory.
    If the first page has no extractable text the file is treated as a scan
    and run through OCR before extraction.

    Args:
        title_words: name (without extension) for the kept copy of the PDF.
        response: HTTP response whose `.content` holds the PDF bytes.
        soup: unused; kept for call compatibility.
        download: 1 to keep the PDF as '<title_words>.pdf', anything else to
            delete the temporary file.

    Returns:
        The text extracted by textract.

    Raises:
        RuntimeError: if OCR is needed but the code does not run as
            `__main__`. Previously this path skipped the OCR and failed with
            an UnboundLocalError on `return`.
    """
    # Close the file before it is re-opened by the PDF tools
    with open('myfile.pdf', 'wb') as pdf_file:
        pdf_file.write(response.content)

    with pdfplumber.open('myfile.pdf') as pdf:
        first_page_text = pdf.pages[0].extract_text()

    # No text on the first page -> scanned pdf.
    # NOTE(review): depending on the pdfplumber version this is presumably
    # None or an empty string, so both are accepted — confirm.
    if not first_page_text:
        if __name__ != '__main__':
            raise RuntimeError('OCR of scanned PDFs is only run from the __main__ module')
        ocrmypdf.ocr('myfile.pdf', 'myfile_converted.pdf', deskew=True, progress_bar = False)
        content = textract.process('myfile_converted.pdf', method='pdfminer') #pdf
        os.remove('myfile_converted.pdf')
    else:
        content = textract.process('myfile.pdf', method='pdfminer') #pdf

    if download == 1:
        os.rename('myfile.pdf', title_words + '.pdf')
    else:
        os.remove('myfile.pdf')

    return content

In [None]:
def process_html(title_words, response, soup):
    """Return the text of a parsed page after removing script and style elements.

    The elements are extracted from `soup` itself, so the passed object is
    modified. title_words and response are not used.
    """
    for unwanted in soup(["script", "style"]):
        unwanted.extract()    # take the element out of the tree

    return soup.get_text()

In [None]:
def download_document(title, url):
    """Download a document, save it as a PDF and return its text.

    The PDF is stored in '<results>/Stablecoin' (created if missing; the
    working directory is changed to it) under a cleaned-up version of `title`.
    Word documents and HTML pages are converted to PDF with pdfkit; PDFs are
    saved as downloaded.

    Args:
        title: document title, used to build the file name.
        url: address of the document.

    Returns:
        The extracted text, or the string 'webpage error' if anything went
        wrong while downloading or converting (the failure is printed).
    """
    directory = os.path.join(results, 'Stablecoin')
    os.makedirs(directory, exist_ok=True)
    os.chdir(directory)

    # File name: non-word characters become spaces, 'pdf' is removed and
    # whitespace is collapsed. (A former step used the pattern 'r[:?!]' — a
    # misplaced raw-string prefix — and could never match after the first
    # substitution, so it is dropped.)
    name = re.sub(r'\W+', ' ', title)
    name = re.sub(r'pdf', '', name)
    title_words = ' '.join(name.split())

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    timeout = 60  # seconds; without it a stalled server blocks the whole run

    try:

        #word
        if '.docx' in url:

            docx = BytesIO(requests.get(url, timeout=timeout).content)
            content = docx2txt.process(docx)
            pdfkit.from_string(content, title_words + '.pdf')
            print('processed word')

        else:
            response = requests.get(url, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.text, 'html.parser')

            #pdf
            if '.pdf' in url or 'PDF' in soup.text[0:50]:
                content = process_pdf(title_words, response, soup, 1)
                print('processed pdf')

            #probably html
            else:
                content = process_html(title_words, response, soup)
                pdfkit.from_url(url, title_words + '.pdf')
                print('processed html')

    # `Exception` rather than a bare except, so Ctrl-C can still stop a long run
    except Exception as err:

        content = 'webpage error'
        print(title, ' ', url, ' ', repr(err))

    return content
            

In [None]:


os.chdir(scraper + '/Documents_downloaded')

# Download every candidate (title + URL) and time the whole run
start = time.time()

candidates['Text'] = list(map(download_document, tqdm(candidates.Document), candidates.URL))

end = time.time()
print(end - start)

### Plot the documents to show which ones are most likely to be stablecoin-related

In [None]:
candidates.head()

In [None]:
def pretty_title(title):
    """Return a display version of a document title.

    Runs of non-word characters become a single space, every occurrence of
    'pdf' is removed, and leading, trailing and repeated whitespace is
    collapsed.
    """
    # Raw strings throughout: the former '^\s*' / '\s*$' patterns relied on
    # invalid escape sequences, and splitting and re-joining on whitespace
    # already trims both ends.
    name = re.sub(r'\W+', ' ', title)
    name = re.sub(r'pdf', '', name)

    return ' '.join(name.split())

In [None]:
candidates['Document'] = [m for m in map(pretty_title, candidates.Document)]

In [None]:
candidates = candidates.sort_values('Num_Phrases', axis=0, ascending=False)

In [None]:


# Save the figure into the results folder.
os.chdir(results)

plt.figure(figsize=(30,50))
sns.set(font_scale=1.3)

# Horizontal bars: one per candidate document, length = number of keyphrases.
ax = sns.barplot(x="Num_Phrases", y="Document", data=candidates, palette="Reds_r")

plt.xticks(rotation=90)

# Wrap long document titles so each label line has at most max_width characters.
max_width = 50
ax.set_yticklabels(textwrap.fill(y.get_text(), max_width) for y in ax.get_yticklabels())

plt.title("Number of keyphrases in stablecoin-related documents", size=50, pad=50)

# Nudge the title slightly above the axes.
ttl = ax.title
ttl.set_position([.5, 1.01])


# No axis captions.
plt.xlabel("")
plt.ylabel("")

def roundup(x):
    """Round x up to the next multiple of 100; exact multiples are returned as is."""
    remainder = x % 100
    if remainder == 0:
        return x
    return x + 100 - remainder

# Ticks every 100 up to the maximum rounded up to the next hundred.
# range() excludes its stop value, so add 1 to keep that last tick
# (without it the largest bar extended past the final tick).
ax.set_xticks(range(0, roundup(int(max(candidates.Num_Phrases))) + 1, 100))
ax.set_xticklabels(ax.get_xticks(), size = 20)
ax.xaxis.set_ticks_position('top')

plt.tight_layout()

plt.savefig('Stablecoin-related documents.pdf')

### Check stablecoin keyphrases in segments of documents

In [None]:
os.chdir(os.path.join(coin, 'Documents'))

In [None]:
texts = [t for t in os.listdir(os.path.join(coin, 'Documents')) if '.txt' in t]

In [None]:
texts

In [None]:
# One row per extract file: parent document, file name and cleaned word list.
rows = []
for t in texts:

    # Plain read access is enough ('r+' also required write permission)
    with open(t, 'r') as f:
        chunk = clean_text(f.read())

    row = {'Subtitle': t, 'Text': chunk}
    if 'JMLSG' in t:
        row['Document'] = 'JMLSG Guidance'
    elif 'UK' in t:
        row['Document'] = 'UK AML Regulations 2017'
    # Files matching neither name get no 'Document' entry (NaN in the frame)
    rows.append(row)

# Build the frame in one go instead of growing it cell by cell through .loc,
# which avoids relying on how pandas treats a list assigned to a single cell.
extracts = pd.DataFrame(rows, columns=['Document', 'Subtitle', 'Text'])
        

In [None]:
extracts.head()

In [None]:
#Calculate the proportion of keyphrases

for r in extracts.index:

    # Text holds the cleaned words of the extract. Join them before counting:
    # list.count() only matches whole elements, so multi-word keyphrases could
    # never be found, and the other keyphrase counts in this notebook are
    # substring counts on a string as well.
    tokens = extracts.loc[r, 'Text']
    joined = ' '.join(tokens)
    num = sum(joined.count(k) for k in keyphrases)

    extracts.loc[r, 'Num_Phrases'] = num
    # Occurrences per 100 cleaned words; NaN (not ZeroDivisionError) when the
    # extract has no cleaned words.
    extracts.loc[r, 'Prop_Phrases'] = num / len(tokens) * 100 if len(tokens) else np.nan

    if extracts.loc[r, 'Document'] == 'JMLSG Guidance':
        extracts.loc[r, 'Colour'] = 'firebrick'
    elif extracts.loc[r, 'Document'] == 'UK AML Regulations 2017':
        extracts.loc[r, 'Colour'] = 'tomato'

    # Strip the extension only (the old pattern r'.txt' matched any character
    # followed by "txt" anywhere in the name)
    extracts.loc[r, 'Subtitle'] = re.sub(r'\.txt$', '', extracts.loc[r, 'Subtitle'])
        

In [None]:
extracts.head()

In [None]:


# Save the figure into the `coin` folder.
os.chdir(coin)

plt.figure(figsize=(20,10))

#plt.bar(range(len(docs_phrases)), list(docs_phrases.values()), align='center', width=0.5, color=colours)
#plt.xticks(range(len(docs_phrases)), list(docs_phrases.keys()), rotation="vertical")

# One bar per extract, coloured by the document it belongs to (extracts.Colour).
plt.bar(extracts.Subtitle, extracts.Prop_Phrases, align='center', width=0.5, color=extracts.Colour)
plt.xticks(range(len(extracts)), extracts.Subtitle.to_list(), rotation=90)

plt.title('Stablecoin-related keyphrases in extracts', fontsize = 20)

# Legend entries are built by hand because the bars are coloured individually.
red = mpatches.Patch(color='firebrick', label='JMLSG Guidance Section 22.')
tom = mpatches.Patch(color='tomato', label='UK AML Regulations 2017')


plt.legend(loc=7, fontsize = 'large', handles=[red, tom])

plt.tight_layout() #to make sure the text is legible 

plt.savefig('Stablecoin-related keyphrases in extracts.pdf')