# Full Text Analysis with BERT

### Requirements

In [3]:
# Possible Installations
# !pip install sentence_transformers
# # !pip install transformers
# !pip install spacy

In [4]:
import collections
import re
import unicodedata

import nltk
# from collections.abc import Mapping
# from collections.abc import Mapping

import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

nltk.download('punkt')

import io
import itertools
import os
import shutil
import string

import contractions
import gensim.downloader as api
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torchvision
from gensim.models import Word2Vec
from pdfminer3.converter import PDFPageAggregator, TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer3.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from spacy.lang.en import English
from transformers import pipeline

import sys
sys.path.append(r"..")
from nlp_functions import (classifier, remove_colons, remove_digits, remove_n,
                           remove_redundant_whitespaces,
                           remove_strange_characters, remove_stripes,
                           text_loader)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Preprocessing

In [5]:
def preprocessing(text):
    # mapping = str.maketrans('', '', string.digits)
    # text = text.translate(mapping)

    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)


    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .",".")
    text = text.replace(r"..",".")
    text = text.replace(r"...",".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text

In [7]:
nltk.download('omw-1.4')
nltk.download('wordnet')

def lemmatize_words(text):

    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)

    lemma_list = []

    for word in text:
        lemma_word = lemmatizer.lemmatize(word)
        lemma_list.append(lemma_word)

    lemma_text = ' '.join(lemma_list)


    return lemma_text



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def remove_small_tokens(prep_text):
    prep_text_before = prep_text
    print(f"Text before cleaning:  {len(prep_text_before)}")
    print(type(prep_text_before))
    # prep_text_before = [prep_text_before]
    prep_text_before = word_tokenize(prep_text_before)
    # print(prep_text_before)

    for word in prep_text_before:
        if len(word) <= 3:
            prep_text_before.remove(word)
    print(f"Text after cleaning:  {len(prep_text_before)}")
    prep_text_list = ' '.join(prep_text_before)
    print(type(prep_text_list))


    return prep_text_list


In [9]:
# lemma_text_cleaned = remove_small_tokens(lemma_text)
# stem_p_text_cleaned = remove_small_tokens(porter_stem_text)
# snowball_stem_text_cleand = remove_small_tokens(snowball_stem_text)

## Part 1: Bag of Words

In [10]:
def get_top_n_words(corpus, n=None):
    corpus=[corpus]

    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    top_n_words_df = pd.DataFrame(words_freq, columns =['Word', 'Count'])

    return top_n_words_df.loc[:(n-1)]



# Part 2: TF_IDF

In [11]:
# !pip3 install scikit-learn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
def get_tf_idf(text, n=None):
    text = re.findall(r'(?:\d[.]|[^.])*(?:[.]|$)', text)
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(text)
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return (df[:(n)])

# Auto Script

In [16]:
def pdf_loader(company_name):
    company_name = company_name[:-4]
    source = r'..\Data\Nachhaltigkeitsberichte\Alle'
    path = rf"{source}\{company_name}.pdf"

    text = text_loader(path)

    return text, company_name, path, source

In [18]:
list_of_companies = os.listdir(
    r"..\Data\Nachhaltigkeitsberichte\Alle")
    
for company in list_of_companies:
    ## BOW / TF-IDF
    print(f"We process now {company}")

    text_1, company_name, path, source = pdf_loader(company)
    prep_text = preprocessing(text_1)
    lemma_words = lemmatize_words(prep_text)
    text_cleaned = remove_small_tokens(lemma_words)
    top_n_words = get_top_n_words(text_cleaned, 40)
    top_n_words = list(top_n_words.itertuples(index=False, name=None))
    top_n_words
    tf_idf = get_tf_idf(text_cleaned, 40)
    tf_idf = tf_idf.reset_index()
    tf_idf = tf_idf.rename(columns={"index": "Word"})
    idf_list = list(tf_idf.itertuples(index=False, name=None))
    idf_list
    data = [(company_name, idf_list, top_n_words)]
    df1 = pd.DataFrame(data, columns = ["PDF Name", "TF-IDF", "Top N Words"])
    # print(df1)
    company_name_new = f"{df1['Top N Words'][0][0][0]}_{df1['Top N Words'][0][1][0]}_{df1['Top N Words'][0][2][0]}"
    print(company_name_new)
    df1["Company Name"] = company_name_new
    df1.to_csv(fr"..\Data\Resultate\Testfolder\TF-IDF 40\{company_name_new}_bow_tf_ifd.csv")
    print("SAFED")

We process now 4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf
Text before cleaning:  104806
<class 'str'>
Text after cleaning:  12738
<class 'str'>
axa_health_customer
SAFED
We process now 4f391131-ad12-ab53-7265-5e6c88840627.pdf




Text before cleaning:  2458
<class 'str'>
Text after cleaning:  265
<class 'str'>
global_safety_training
SAFED
We process now 5ZmOsI2P3oe0plCvOThrCySgcDcKCXqj.pdf




Text before cleaning:  130968
<class 'str'>
Text after cleaning:  13055
<class 'str'>




die_lindt_sprüngli
SAFED
We process now 9clin.pdf
Text before cleaning:  206865
<class 'str'>
Text after cleaning:  23452
<class 'str'>




chemical_mitsubishi_employee
SAFED
We process now 10_BCGB20_SustainabilityReport_E_Web.pdf
Text before cleaning:  36893
<class 'str'>
Text after cleaning:  4294
<class 'str'>
cid_burckhardt_compression
SAFED
We process now 668a8a26-d924-21aa-cd75-c12e5296ff2b.pdf




Text before cleaning:  291499
<class 'str'>
Text after cleaning:  34295
<class 'str'>




bcge_financial_board
SAFED
We process now 1212.pdf
Text before cleaning:  140713
<class 'str'>
Text after cleaning:  16272
<class 'str'>
bell_food_group
SAFED
We process now 1313.pdf




Text before cleaning:  140713
<class 'str'>
Text after cleaning:  16272
<class 'str'>
bell_food_group
SAFED
We process now 2012-CSR-report.pdf




Text before cleaning:  54242
<class 'str'>
Text after cleaning:  6278
<class 'str'>
dsv_employee_target
SAFED
We process now 2017-Glencore-Sustainability-Report-FINAL-.pdf




Text before cleaning:  303513
<class 'str'>
Text after cleaning:  34163
<class 'str'>




community_glencore_report
SAFED
We process now 2019_Straumann_sustainability_report.pdf
Text before cleaning:  57116
<class 'str'>
Text after cleaning:  6619
<class 'str'>
dental_report_production
SAFED
We process now 2019-sustainability-report-doc-en.pdf




Text before cleaning:  264417
<class 'str'>
Text after cleaning:  29701
<class 'str'>




swiss_risk_sustainability
SAFED
We process now 2020_dnf_-_eng_0.pdf
Text before cleaning:  129857
<class 'str'>
Text after cleaning:  14949
<class 'str'>




group_autogrill_management
SAFED
We process now 2020_Sustainability_Report.pdf
Text before cleaning:  97164
<class 'str'>
Text after cleaning:  11327
<class 'str'>




braun_employee_management
SAFED
We process now 2020_valora_geschaeftsbericht_de.pdf
Text before cleaning:  390317
<class 'str'>
Text after cleaning:  44495
<class 'str'>




valora_financial_group
SAFED
We process now 2020-Annual-Report-7u42lsu22.pdf
Text before cleaning:  796061
<class 'str'>
Text after cleaning:  91298
<class 'str'>




temenos_financial_year
SAFED
We process now 2020-responsibility-highlights-report.pdf
Text before cleaning:  88964
<class 'str'>
Text after cleaning:  10616
<class 'str'>
vifor_pharma_patient
SAFED
We process now 2020-sustainability-report-doc-en.pdf




Text before cleaning:  282964
<class 'str'>
Text after cleaning:  32329
<class 'str'>




swiss_sustainability_risk
SAFED
We process now 2021-03-22_JuliusBaer_CorporateSustainabilityReport2020_EN.pdf
Text before cleaning:  185653
<class 'str'>
Text after cleaning:  21095
<class 'str'>
sustainability_risk_report
SAFED
We process now 210928_UBP20Sustainability20Report.pdf




Text before cleaning:  134972
<class 'str'>
Text after cleaning:  15146
<class 'str'>
ubp_sustainability_investment
SAFED
We process now 2641133_DOWNLOAD.pdf




Text before cleaning:  312572
<class 'str'>
Text after cleaning:  36055
<class 'str'>




football_uefa_social
SAFED
We process now 20200625_man-es_pr_cr-report_2020_en.pdf
Text before cleaning:  5082
<class 'str'>
Text after cleaning:  586
<class 'str'>
energy_solutions_company
SAFED
We process now abb-group-sustainability-performance-report-2015.pdf




Text before cleaning:  211859
<class 'str'>
Text after cleaning:  24337
<class 'str'>




abb_sustainability_performance
SAFED
We process now Allianz_Group_Sustainability_Report_2020-web (1).pdf
Text before cleaning:  512216
<class 'str'>
Text after cleaning:  56224
<class 'str'>




sustainability_allianz_risk
SAFED
We process now Allianz_Group_Sustainability_Report_2020-web.pdf
Text before cleaning:  512216
<class 'str'>
Text after cleaning:  56224
<class 'str'>




sustainability_allianz_risk
SAFED
We process now Allreal_Sustainability_Report_EN_2020.pdf
Text before cleaning:  45230
<class 'str'>
Text after cleaning:  5136
<class 'str'>




allreal_report_employee
SAFED
We process now AMAG_Geschaeftsbericht_2021_Magazin_Englisch.pdf
Text before cleaning:  79932
<class 'str'>
Text after cleaning:  9681
<class 'str'>
amag_aluminium_austria
SAFED
We process now Ancient Rome Did Not Fall_ Why Real Story is Even Scarier for America and How It Connects to Billionaires _ by Barry Gander _ Nov, 2022 _ Medium.pdf




Text before cleaning:  13761
<class 'str'>
Text after cleaning:  1656
<class 'str'>
america_rome_roman
SAFED
We process now annual-and-sustainability-report-2020.pdf




Text before cleaning:  697907
<class 'str'>
Text after cleaning:  82868
<class 'str'>




volvo_group_financial
SAFED
We process now annual-report-2020.pdf
Text before cleaning:  699029
<class 'str'>
Text after cleaning:  81215
<class 'str'>




easyjet_year_million
SAFED
We process now apollo-esg-report-v-12.pdf
Text before cleaning:  237730
<class 'str'>
Text after cleaning:  27745
<class 'str'>




company_apollo_employee
SAFED
We process now ar21e.pdf
Text before cleaning:  326751
<class 'str'>
Text after cleaning:  37642
<class 'str'>




roche_committee_year
SAFED
We process now avaloq-csr-2020-report.pdf
Text before cleaning:  36255
<class 'str'>
Text after cleaning:  4171
<class 'str'>




business_avaloq_report
SAFED
We process now Avoid the Reorg from Hell with Six Key Principles _ by Saumil Mehta _ Nov, 2022 _ Medium.pdf
Text before cleaning:  18439
<class 'str'>
Text after cleaning:  2288
<class 'str'>
team_manager_design
SAFED
We process now Bucher Sustainability report 2020.pdf




Text before cleaning:  90618
<class 'str'>
Text after cleaning:  10098
<class 'str'>
bucher_industries_employee
SAFED
We process now Buckingham Palace race row raises awkward questions - BBC News.pdf




Text before cleaning:  6649
<class 'str'>
Text after cleaning:  825
<class 'str'>
palace_race_buckingham
SAFED
We process now c05179523.pdf




Text before cleaning:  51534
<class 'str'>
Text after cleaning:  5953
<class 'str'>
hp_product_impact
SAFED
We process now cargill-aqua-nutrition-sustainability-report.pdf




Text before cleaning:  117909
<class 'str'>
Text after cleaning:  14139
<class 'str'>
feed_cargill_nutrition
SAFED
We process now celgene-responsibility.pdf




Text before cleaning:  113573
<class 'str'>
Text after cleaning:  12559
<class 'str'>
celgene_report_employee
SAFED
We process now ClariantIntegratedReport2020EN.pdf




Text before cleaning:  354493
<class 'str'>
Text after cleaning:  39507
<class 'str'>




clariant_business_clariants
SAFED
We process now coca-cola-business-environmental-social-governance-report-2020.pdf
Text before cleaning:  253220
<class 'str'>
Text after cleaning:  29343
<class 'str'>




company_water_cocacola
SAFED
We process now COOP_NHB_2011_e_low.pdf
Text before cleaning:  194785
<class 'str'>
Text after cleaning:  22792
<class 'str'>




coop_sustainability_product
SAFED
We process now Corporate_Sustainability_Report_2019_Web.pdf
