# SetUp

I will install, download, and import all necessary packages for the tasks in the following. 

## Installations

In order to run the following codes, it might be required to install all these packes first.

In [3]:
import collections
import pdb
import re
import unicodedata

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer

nltk.download('punkt')

import io
import itertools
import shutil
import string
import sys

import contractions
import gensim.downloader as api
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torchvision
from gensim.models import Word2Vec
from pdfminer3.converter import PDFPageAggregator, TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer3.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from spacy.lang.en import English
from transformers import pipeline

sys.path.append(r"..")

from nlp_functions import (classifier, remove_colons, remove_digits, remove_n,
                           remove_redundant_whitespaces,
                           remove_strange_characters, remove_stripes,
                           text_loader)

nltk.download('punkt')

from statistics import mean

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading the text

In [4]:
def pdf_loader(company_name):
    """Nimmt PDF-File Namen entgegen. Gibt Text als String, PDF-Namen, Pfad inkl. PDF-Namen sowie Pfad ohne PDF-Namen zurück"""

    company_name = company_name[:-4]
    source = r'..\Data\Nachhaltigkeitsberichte\Alle'
    path = rf"{source}\{company_name}.pdf"

    text = text_loader(path)

    return text, company_name, path, source

# Part 1: Zero Shot Learning Classification

This part we will different scoring methods and try to apply them. The similarity score to a pregiven text will be the major idea atm.

### Needed Functions for the TF-IDF

In [6]:
def preprocessing(text):
    """Wendet verschiedene Standard-Preprocessings auf den Text an."""


    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)


    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .",".")
    text = text.replace(r"..",".")
    text = text.replace(r"...",".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text


nltk.download('omw-1.4')
nltk.download('wordnet')

def lemmatize_words(text):
    """Formt Worte im Text in ihre Lemma um"""

    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)

    lemma_list = []

    for word in text:
        lemma_word = lemmatizer.lemmatize(word)
        lemma_list.append(lemma_word)

    lemma_text = ' '.join(lemma_list)


    return lemma_text



def remove_small_tokens(prep_text):
    """Entfernt Tokens, welche kleiner als Vier sind."""
    prep_text_before = prep_text
    # print(f"Text before cleaning:  {len(prep_text_before)}")
    # print(type(prep_text_before))
    # prep_text_before = [prep_text_before]
    prep_text_before = word_tokenize(prep_text_before)
    # print(prep_text_before)

    for word in prep_text_before:
        if len(word) <= 3:
            prep_text_before.remove(word)
    # print(f"Text after cleaning:  {len(prep_text_before)}")
    prep_text_list = ' '.join(prep_text_before)
    # print(type(prep_text_list))


    return prep_text_list

# BOW

def get_top_n_words(corpus, n=None):
    """Errechnet die N-Meistgenannten Worte"""
    corpus=[corpus]

    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    top_n_words_df = pd.DataFrame(words_freq, columns =['Word', 'Count'])

    return top_n_words_df.loc[:(n-1)]


def get_tf_idf(text, n=None):
    """Errechnet den TF-IDF Score für die N-höchsten Scores"""
    text = re.findall(r'(?:\d[.]|[^.])*(?:[.]|$)', text)
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(text)
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return (df[:(n)])



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Let it all run

In [7]:
def mean_cosine_calculator(cosine_results):
    non_tensor_list = []
    for element in cosine_results:
        string_element = str(element[0])
        non_tensor_list.append(float(string_element[8:14]))

    non_tensor_list

    after_threshold_sust = []
    threshold = 0.08

    for cosine in non_tensor_list:
        if cosine > threshold:
            after_threshold_sust.append(cosine)


    mean_result = round(mean(after_threshold_sust), 3)
    return mean_result

In [8]:
list_of_companies = os.listdir(
    r"..\Data\Nachhaltigkeitsberichte\Alle")
classifier_pipeline = pipeline(
"zero-shot-classification", model="facebook/bart-large-mnli")

for company in list_of_companies:
    ## BOW / TF-IDF
    print(f"We process now {company}")
    text_1, company_name, path, source = pdf_loader(company)
    prep_text = preprocessing_tf(text_1)
    lemma_words = lemmatize_words(prep_text)
    text_cleaned = remove_small_tokens(lemma_words)
    top_n_words = get_top_n_words(text_cleaned, 20)
    top_n_words = list(top_n_words.itertuples(index=False, name=None))
    top_n_words
    tf_idf = get_tf_idf(text_cleaned, 20)
    tf_idf = tf_idf.reset_index()
    tf_idf = tf_idf.rename(columns={"index": "Word"})
    idf_list = list(tf_idf.itertuples(index=False, name=None))
    idf_list
    data = [(company_name, idf_list, top_n_words)]
    df1 = pd.DataFrame(data, columns = ["Company Name", "TF-IDF", "Top N Words"])
    # print(df1)
    company_name_new = f"{df1['Top N Words'][0][0][0]}_{df1['Top N Words'][0][1][0]}_{df1['Top N Words'][0][2][0]}"
    print(company_name_new)

    df_1 = preprocessing(text_1)
    text_list = df_1["Alt_Text"].tolist()
    new_text_list = []
    print(len(text_list))
    for element in text_list:
        if len(element) < 15:
            text_list.remove(element)
        if len(element) > 15:
            new_text_list.append(element)
    print(len(text_list))
    print(len(new_text_list))

    scores_sust = []
    scores_hr = []
    scores_social = []
    scores_fraud = []
    scores_labour = []


    classification_sust = []
    classification_hr = []
    classification_social = []
    classification_fraud = []
    classification_labour = []
    classification_other = []

    for test in new_text_list:


        input_sequence = test
        label_candidate = ['sustainability', 'human rights', 'fraud',
                            'social issues', 'labour law']
        output = classifier_pipeline(input_sequence, label_candidate)
        scores_sust.append(output["scores"][0])
        scores_hr.append(output["scores"][1])
        scores_social.append(output["scores"][2])
        scores_fraud.append(output["scores"][3])
        scores_labour.append(output["scores"][4])


        threshold = 0.45




        if output["scores"][0] > threshold:
            classification_sust.append(output["sequence"])
        elif output["scores"][1] > threshold:
            classification_hr.append(output["sequence"])
        elif output["scores"][2] > threshold:
            classification_social.append(output["sequence"])
        elif output["scores"][3] > threshold:
            classification_fraud.append(output["sequence"])
        elif output["scores"][4] > threshold:
            classification_labour.append(output["sequence"])
        else:
            classification_other.append(output["sequence"])

    classified_textes = [classification_sust, classification_hr, classification_social, classification_fraud, classification_labour]
    scores = [scores_sust, scores_hr, scores_social, scores_fraud, scores_labour]

    label_candidate = ['sustainability', 'human rights', 'fraud',
                    'social issues', 'labour law']

    data = {'Label':label_candidate,'Zero Shot Score Raw':scores, "Classified Text": classified_textes}

    final_df = pd.DataFrame(data)
    # print(df1)
    final_df.to_csv(fr"..\Data\Resultate\Testfolder\Zero Shot Learning Sentencewise raw\{company_name_new}_zero_shot_sentencewise_scores_raw.csv")
    print("SAFED")

We process now 4f303cec-a12d-480b-accb-7b56f706f60e_axa-ri2020-en-accessible.pdf
axa_health_customer
702
691
677
SAFED
We process now 4f391131-ad12-ab53-7265-5e6c88840627.pdf
global_safety_training
5
5
5
SAFED
We process now 5ZmOsI2P3oe0plCvOThrCySgcDcKCXqj.pdf
die_lindt_sprüngli
802
792
782
SAFED
We process now 9clin.pdf
chemical_mitsubishi_employee
1023
995
966
SAFED
We process now 10_BCGB20_SustainabilityReport_E_Web.pdf
cid_burckhardt_compression
270
260
250
SAFED
We process now 668a8a26-d924-21aa-cd75-c12e5296ff2b.pdf
bcge_financial_board
2169
1923
1676
SAFED
We process now 1212.pdf
bell_food_group
719
705
690
SAFED
We process now 1313.pdf
bell_food_group
719
705
690
SAFED
We process now 2012-CSR-report.pdf
dsv_employee_target
430
379
327
SAFED
We process now 2017-Glencore-Sustainability-Report-FINAL-.pdf
community_glencore_report
2083
1968
1847
SAFED
We process now 2019_Straumann_sustainability_report.pdf
dental_report_production
400
366
330
SAFED
We process now 2019-sustainabili