In [1]:
import collections
import io
import itertools
import os
import re
import shutil
import string
import unicodedata

import contractions
import gensim.downloader as api
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
import torch
import torchvision
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from pdfminer3.converter import PDFPageAggregator, TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer3.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from spacy.lang.en import English
from transformers import pipeline

sys.path.append(r"..")

from nlp_functions import (classifier, remove_colons, remove_digits, remove_n,
                           remove_redundant_whitespaces,
                           remove_strange_characters, remove_stripes,
                           text_loader)

nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# import collections
# import io
# import itertools
# import os
# import re
# import string
# import unicodedata

# import contractions

# import pandas as pd
# # import torch
# # import torchvision
# # from gensim.models import Word2Vec
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer
# from nltk.tokenize import TreebankWordTokenizer, sent_tokenize, word_tokenize
# from nltk.tokenize.toktok import ToktokTokenizer
# # from pdfminer3.converter import PDFPageAggregator, TextConverter
# # from pdfminer3.layout import LAParams, LTTextBox
# # from pdfminer3.pdfinterp import PDFPageInterpreter, PDFResourceManager
# # from pdfminer3.pdfpage import PDFPage
# from sklearn.feature_extraction.text import CountVectorizer
# from spacy.lang.en import English
# from transformers import pipeline

# sys.path.append(r"..")

# from nlp_functions import (classifier, remove_colons, remove_digits, remove_n,
#                            remove_redundant_whitespaces,
#                            remove_strange_characters, remove_stripes,
#                            text_loader)

# # nltk.download('punkt')

# from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [3]:
def pdf_loader(company_name): 
    """Nimmt PDF-File Namen entgegen. Gibt Text als String, PDF-Namen, Pfad inkl. PDF-Namen sowie Pfad ohne PDF-Namen zurück"""
    company_name = company_name[:-4]
    source = r'..\Data\Nachhaltigkeitsberichte\Alle'
    path = rf"{source}\{company_name}.pdf"

    text = text_loader(path)   

    return text, company_name, path, source 

def topic_modeler(text):
    """Errechnet die Klassifizierungs-Wahrscheinlichkeiten für den Text für die vorgegebenen Label aus"""

    classifier_pipeline = pipeline(
        "zero-shot-classification", model="facebook/bart-large-mnli")
    input_sequence = text
    label_candidate = ['sustainability', 'human rights',
                       'fraud', 'social issues', 'labour law']
    x = classifier_pipeline(input_sequence, label_candidate)
    result_label = x["labels"]
    result_score = x["scores"]
    tuple_for_df = list(zip(result_label, result_score))
    df_topic_modeling_score = pd.DataFrame(
        tuple_for_df, columns=["Label", "Score"])
    return df_topic_modeling_score

def preprocessing_text(text):
    # mapping = str.maketrans('', '', string.digits)
    # text = text.translate(mapping)

    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)

    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .", ".")
    text = text.replace(r"..", ".")
    text = text.replace(r"...", ".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text



TI-IDF Functions

In [4]:
def preprocessing(text):
    """Wendet verschiedene Standard-Preprocessings auf den Text an."""


    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)


    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .",".")
    text = text.replace(r"..",".")
    text = text.replace(r"...",".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text


nltk.download('omw-1.4')
nltk.download('wordnet')

def lemmatize_words(text):
    """Formt Worte im Text in ihre Lemma um"""

    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)

    lemma_list = []

    for word in text:
        lemma_word = lemmatizer.lemmatize(word)
        lemma_list.append(lemma_word)

    lemma_text = ' '.join(lemma_list)


    return lemma_text



def remove_small_tokens(prep_text):
    """Entfernt Tokens, welche kleiner als Vier sind."""
    prep_text_before = prep_text
    # print(f"Text before cleaning:  {len(prep_text_before)}")
    # print(type(prep_text_before))
    # prep_text_before = [prep_text_before]
    prep_text_before = word_tokenize(prep_text_before)
    # print(prep_text_before)

    for word in prep_text_before:
        if len(word) <= 3:
            prep_text_before.remove(word)
    # print(f"Text after cleaning:  {len(prep_text_before)}")
    prep_text_list = ' '.join(prep_text_before)
    # print(type(prep_text_list))


    return prep_text_list

# BOW

def get_top_n_words(corpus, n=None):
    """Errechnet die N-Meistgenannten Worte"""
    corpus=[corpus]

    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    top_n_words_df = pd.DataFrame(words_freq, columns =['Word', 'Count'])

    return top_n_words_df.loc[:(n-1)]


def get_tf_idf(text, n=None):
    """Errechnet den TF-IDF Score für die N-höchsten Scores"""
    text = re.findall(r'(?:\d[.]|[^.])*(?:[.]|$)', text)
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(text)
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return (df[:(n)])



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
list_of_companies = os.listdir(r"..\Data\Nachhaltigkeitsberichte\Alle")

['4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf',
 '4f391131-ad12-ab53-7265-5e6c88840627.pdf',
 '5ZmOsI2P3oe0plCvOThrCySgcDcKCXqj.pdf',
 '9clin.pdf',
 '10_BCGB20_SustainabilityReport_E_Web.pdf',
 '668a8a26-d924-21aa-cd75-c12e5296ff2b.pdf',
 '1212.pdf',
 '1313.pdf',
 '2012-CSR-report.pdf',
 '2017-Glencore-Sustainability-Report-FINAL-.pdf',
 '2019_Straumann_sustainability_report.pdf',
 '2019-sustainability-report-doc-en.pdf',
 '2020_dnf_-_eng_0.pdf',
 '2020_Sustainability_Report.pdf',
 '2020_valora_geschaeftsbericht_de.pdf',
 '2020-Annual-Report-7u42lsu22.pdf',
 '2020-responsibility-highlights-report.pdf',
 '2020-sustainability-report-doc-en.pdf',
 '2021-03-22_JuliusBaer_CorporateSustainabilityReport2020_EN.pdf',
 '210928_UBP20Sustainability20Report.pdf',
 '2641133_DOWNLOAD.pdf',
 '20200625_man-es_pr_cr-report_2020_en.pdf',
 'abb-group-sustainability-performance-report-2015.pdf',
 'Allianz_Group_Sustainability_Report_2020-web (1).pdf',
 'Allianz_Group_Sustainability_R

In [6]:
for company in list_of_companies:
    ## BOW / TF-IDF
    try:
        print(f"We process now {company}")

        text_1, company_name, path, source = pdf_loader(company)
        prep_text = preprocessing(text_1)
        lemma_words = lemmatize_words(prep_text)
        text_cleaned = remove_small_tokens(lemma_words)
        top_n_words = get_top_n_words(text_cleaned, 20)
        top_n_words = list(top_n_words.itertuples(index=False, name=None))
        top_n_words
        tf_idf = get_tf_idf(text_cleaned, 20)
        tf_idf = tf_idf.reset_index()
        tf_idf = tf_idf.rename(columns={"index": "Word"})
        idf_list = list(tf_idf.itertuples(index=False, name=None))
        idf_list
        data = [(company_name, idf_list, top_n_words)]
        df1 = pd.DataFrame(data, columns = ["Company Name", "TF-IDF", "Top N Words"])
        company_name_new = f"{df1['Top N Words'][0][0][0]}_{df1['Top N Words'][0][1][0]}_{df1['Top N Words'][0][2][0]}"

        ## Full Text Topic Modeling

        text = pdf_loader(company)
        text = preprocessing_text(text)
        result_df = topic_modeler(text)
        result_df.to_csv(
            fr"..\Data\Resultate\Testfolder\Zero Shot Learning Fulltext\{company_name_new}_FullText_TopicModeling.csv")

        print(f"{company_name_new} SAFED")
    except:
        print(f"{company} didn't work" )
        continue

We process now 4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf
axa_health_customer SAFED
We process now 4f391131-ad12-ab53-7265-5e6c88840627.pdf
global_safety_training SAFED
We process now 5ZmOsI2P3oe0plCvOThrCySgcDcKCXqj.pdf
die_lindt_sprüngli SAFED
We process now 9clin.pdf
chemical_mitsubishi_employee SAFED
We process now 10_BCGB20_SustainabilityReport_E_Web.pdf
cid_burckhardt_compression SAFED
We process now 668a8a26-d924-21aa-cd75-c12e5296ff2b.pdf
bcge_financial_board SAFED
We process now 1212.pdf
bell_food_group SAFED
We process now 1313.pdf
bell_food_group SAFED
We process now 2012-CSR-report.pdf
dsv_employee_target SAFED
We process now 2017-Glencore-Sustainability-Report-FINAL-.pdf
community_glencore_report SAFED
We process now 2019_Straumann_sustainability_report.pdf
dental_report_production SAFED
We process now 2019-sustainability-report-doc-en.pdf
swiss_risk_sustainability SAFED
We process now 2020_dnf_-_eng_0.pdf
group_autogrill_management SAFED
We process 