In [2]:
import itertools
from nlp_functions import text_loader, classifier, remove_strange_characters, remove_n, remove_colons, remove_stripes, remove_redundant_whitespaces, remove_digits
import torchvision
import torch
import plotly.express as px
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import matplotlib
from sklearn.manifold import TSNE
import gensim.downloader as api
from spacy.lang.en import English
import pandas as pd
from gensim.models import Word2Vec
import os
import shutil
import io
from pdfminer3.converter import TextConverter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfpage import PDFPage
from pdfminer3.layout import LAParams, LTTextBox
import contractions
import string
import numpy as np
import spacy
import unicodedata
import re

import collections

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer


def pdf_loader(company_name):
    company_name = company_name[:-4]
    source = 'G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Sustainability Reports\To Do'
    path = rf"{source}\{company_name}.pdf"

    text = text_loader(path)

    return text, company_name, path, source


def preprocessing_df(text):
    text = re.sub(r'\d+', '', text)

    sentences = []
    for s in sent_tokenize(text):
        sentences.append(s)

    df = pd.DataFrame(sentences, columns=['Sentences'])

    df['Alt_Text'] = df['Sentences'].apply(
        lambda x: remove_strange_characters(x))
    df['Alt_Text'] = df['Alt_Text'].apply(lambda x: remove_n(x))
    df['Alt_Text'] = df['Alt_Text'].apply(lambda x: remove_stripes(x))
    df['Alt_Text'] = df['Alt_Text'].apply(
        lambda x: remove_redundant_whitespaces(x))

    return df


def classifier_pipeline(dataframe):
    token_ls = dataframe["Alt_Text"].tolist()
    threshold = 0.5
    classification_ls = []
    short_ls = token_ls[:]
    print(len(short_ls))
    for i in short_ls:  # removing undedected empty vectors
        if len(i) < 1:
            short_ls.remove(i)
    print(len(short_ls))

    for i in short_ls:
        # token_str = ' '.join(i)
        x = classifier(i)
        if x.get("scores")[0] > threshold:
            classification_ls.append(x.get("labels")[0])
        else:
            classification_ls.append("other")
        classification_ls

    return classification_ls, short_ls


def rebuilding(short_ls, classification_ls):
    sustainability_text = []
    human_rights_text = []
    fraud_text = []
    social_issue_text = []
    employee_affairs_text = []
    other_text = []

    for (text, label) in zip(short_ls, classification_ls):
        if label == "sustainability":
            sustainability_text.append(text)
        elif label == "human rights":
            human_rights_text.append(text)
        elif label == "fraud":
            fraud_text.append(text)
        elif label == "social issues":
            social_issue_text.append(text)
        elif label == "labour law":
            employee_affairs_text.append(text)
        elif label == "other":
            other_text.append(text)
        else:
            continue

    nested_label_list = [sustainability_text, human_rights_text,
                         fraud_text, social_issue_text, employee_affairs_text, other_text]

    return nested_label_list


def list_to_string_join(input_list):
    string = []
    for item in input_list:
        string.append(item)
    joined_string = ' '.join(string)

    return joined_string


def labeling_nested_list(input_list):
    nested_label_list = input_list
    label_string = []
    for label in nested_label_list:
        label_string.append(list_to_string_join(label))

    label_candidate = ['sustainability', 'human rights',
                       'fraud', 'social issues', 'labour law', "others"]
    tuples_prep = list(zip(label_candidate, label_string))
    report_df = pd.DataFrame(tuples_prep, columns=["Label", "Text"])
    return report_df


def similarity_function(text_1, text_2):
    sentences = [text_1, text_2]

    # !pip install sentence_transformers
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_embeddings = model.encode(sentences)
    sentence_embeddings.shape
    return cosine_similarity(
        [sentence_embeddings[0]],
        sentence_embeddings[1:]
    )


def scoring(report_df):
    wiki_text = pd.read_csv(
        r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_fulltext.csv")
    with open('G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Human Rights.txt', 'r') as file:
        human_right_text = file.read()
    wiki_text.loc[1:1, "Text"] = human_right_text
    result_cosine_similarity = []
    wiki_text_list = wiki_text["Text"].to_list()
    wiki_labels_list = wiki_text["Label"].to_list()
    report_text_list = report_df["Text"].to_list()

    for w_text, r_text in zip(wiki_text_list, report_text_list):
        pre_result = (similarity_function(w_text, r_text))
        result_cosine_similarity.append(pre_result[0][0])

    data_tuples = list(zip(wiki_labels_list, result_cosine_similarity))
    result_df = pd.DataFrame(data_tuples, columns=['Label', 'Score'])
    return result_df


def preprocessing_text(text):
    # mapping = str.maketrans('', '', string.digits)
    # text = text.translate(mapping)

    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)

    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .", ".")
    text = text.replace(r"..", ".")
    text = text.replace(r"...", ".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text


def topic_modeler(text):

    classifier_pipeline = pipeline(
        "zero-shot-classification", model="facebook/bart-large-mnli")
    input_sequence = text
    label_candidate = ['sustainability', 'human rights',
                       'fraud', 'social issues', 'labour law']
    x = classifier_pipeline(input_sequence, label_candidate)
    result_label = x["labels"]
    result_score = x["scores"]
    tuple_for_df = list(zip(result_label, result_score))
    df_topic_modeling_score = pd.DataFrame(
        tuple_for_df, columns=["Label", "Score"])
    return df_topic_modeling_score


def similarity_function(text_1, text_2):
    sentences = [text_1, text_2]

    # !pip install sentence_transformers
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_embeddings = model.encode(sentences)
    sentence_embeddings.shape
    return cosine_similarity(
        [sentence_embeddings[0]],
        sentence_embeddings[1:]
    )


def results_for_each_topic(text):
    wiki_text = pd.read_csv(
        r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_fulltext.csv")
    with open('G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Human Rights.txt', 'r') as file:
        human_right_text = file.read()
    wiki_text.loc[1:1, "Text"] = human_right_text

    result_cosine_similarity = []
    labels = wiki_text["Label"].to_list()
    labels
    for label_text in wiki_text["Text"]:
        pre_result = (similarity_function(label_text, text))
        result_cosine_similarity.append(pre_result[0][0])

    data_tuples = list(zip(labels, result_cosine_similarity))
    result_df = pd.DataFrame(data_tuples, columns=['Label', 'Score'])
    return result_df



 





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TI-IDF Functions

In [3]:
def preprocessing(text):
    # mapping = str.maketrans('', '', string.digits)
    # text = text.translate(mapping)

    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)


    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .",".")
    text = text.replace(r"..",".")
    text = text.replace(r"...",".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text


nltk.download('omw-1.4')
nltk.download('wordnet')

def lemmatize_words(text):

    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)

    lemma_list = []

    for word in text:
        lemma_word = lemmatizer.lemmatize(word)
        lemma_list.append(lemma_word)

    lemma_text = ' '.join(lemma_list)


    return lemma_text



def remove_small_tokens(prep_text):
    prep_text_before = prep_text
    # print(f"Text before cleaning:  {len(prep_text_before)}")
    # print(type(prep_text_before))
    # prep_text_before = [prep_text_before]
    prep_text_before = word_tokenize(prep_text_before)
    # print(prep_text_before)

    for word in prep_text_before:
        if len(word) <= 3:
            prep_text_before.remove(word)
    # print(f"Text after cleaning:  {len(prep_text_before)}")
    prep_text_list = ' '.join(prep_text_before)
    # print(type(prep_text_list))


    return prep_text_list

# BOW

def get_top_n_words(corpus, n=None):
    corpus=[corpus]

    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    top_n_words_df = pd.DataFrame(words_freq, columns =['Word', 'Count'])

    return top_n_words_df.loc[:(n-1)]


def get_tf_idf(text, n=None):
    text = re.findall(r'(?:\d[.]|[^.])*(?:[.]|$)', text)
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(text)
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return (df[:(n)])



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
131699/1024*10/60

21.435384114583332

In [None]:
text_1, company_name, path, source = pdf_loader("annual-and-sustainability-report-2020.pdf")

In [12]:
text_2 = preprocessing_text(text_1)
text_3 = text_2.split(" ")

In [32]:
text_4 = text_3
text_schnipsel_sammler = []
while len(text_4) > 700:
    text_schnipsel = text_4[:700]
    text_schnipsel_sammler.append(text_schnipsel)
    text_4 = text_4[700:]
else:
    text_schnipsel = text_4[:]
    text_schnipsel_sammler.append(text_schnipsel)


In [33]:
end_schnitzel = []
for schnipsel in text_schnipsel_sammler:
    schnipsel_zusammen = ' '.join(schnipsel)
    end_schnitzel.append(schnipsel_zusammen)


In [34]:
schnitzel_summary = []
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

for schnitzel in end_schnitzel:
    summary = summarizer(schnitzel, max_length=128, min_length=30, do_sample=False)
    schnitzel_summary.append(summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (1053 > 1024). Running this sequence through the model will result in indexing errors


IndexError: index out of range in self

In [35]:
def results_for_each_topic(text):
    wiki_text = pd.read_csv(
        r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_summary.csv", index_col=0)
    wiki_text_list = wiki_text["Text"].tolist()


    from transformers import pipeline
    wiki_summarized_list = []
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    for label in wiki_text_list:
        label = label[0:1023]
        label_summarized = (summarizer(label, max_length=130, min_length=30, do_sample=False))
        wiki_summarized_list.append(label_summarized)


    wiki_summarized_list_final = []
    for element in wiki_summarized_list:
        new_element = element[0]["summary_text"]
        wiki_summarized_list_final.append(new_element)


    wiki_text["Summarized"] = wiki_summarized_list_final

    result_cosine_similarity = []
    labels = wiki_text["Label"].to_list()
    labels
    for label_text in wiki_text["Summarized"]:
        pre_result = (similarity_function(label_text, text))
        result_cosine_similarity.append(pre_result[0][0])

    data_tuples = list(zip(labels, result_cosine_similarity))
    result_df = pd.DataFrame(data_tuples, columns=['Label', 'Score'])
    return result_df

In [36]:
schnitzel_ergebnisse = []
for gesummeds_schnitzel in schnitzel_summary:
    schnitzel_ergebnisse.append(results_for_each_topic(gesummeds_schnitzel))

KeyboardInterrupt: 

In [None]:
wiki_text = pd.read_csv(
    r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_summary.csv", index_col=0)
wiki_text_list = wiki_text["Text"].tolist()


from transformers import pipeline
wiki_summarized_list = []
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

for label in wiki_text_list:
    label = label[0:1023]
    label_summarized = (summarizer(label, max_length=130, min_length=30, do_sample=False))
    wiki_summarized_list.append(label_summarized)


wiki_summarized_list_final = []
for element in wiki_summarized_list:
    new_element = element[0]["summary_text"]
    wiki_summarized_list_final.append(new_element)


wiki_text["Summarized"] = wiki_summarized_list_final

In [9]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(text_2, max_length=128, min_length=30, do_sample=False)
print(summary)

'VOLVO GROUP ANNUAL AND SUSTAINABILIT Y REPORT SHAPING THE FUTURE OF TRANS PORTATION AND INFRASTRUCTURE A GLOBAL GROUP OVERVIEW THIS IS THE VOLVO GROUP OUR CUSTOMERS MAKE SOCIETIES WORK The Volvo Groups mission is to drive prosperity through transport and infra structure solutions. We continuously develop our products and services to increase the value for our customers, to support sustainable societies and to promote the wellbeing and safety of people. Driving prosperity socially, environmentally and financially means striving for transport and infrastructure solutions that are % safe, % fossilfree and % more productive. The Volvo Groups products and services contribute to much of what we all expect of a well functioning and prosperous society, since our trucks, buses, construction equipment, power solutions for marine and industrial applications, financing and services are involved in many activities that most of us rely on every day. The majority of'

In [None]:
wiki_text = pd.read_csv(
    r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_fulltext.csv")




In [None]:
def results_for_each_topic(text):
    wiki_text = pd.read_csv(
        r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\wiki_artikel_fulltext.csv")
    with open('G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Human Rights.txt', 'r') as file:
        human_right_text = file.read()
    wiki_text.loc[1:1, "Text"] = human_right_text

    result_cosine_similarity = []
    labels = wiki_text["Label"].to_list()
    labels
    for label_text in wiki_text["Text"]:
        pre_result = (similarity_function(label_text, text))
        result_cosine_similarity.append(pre_result[0][0])

    data_tuples = list(zip(labels, result_cosine_similarity))
    result_df = pd.DataFrame(data_tuples, columns=['Label', 'Score'])
    return result_df

In [None]:
result_df = results_for_each_topic(text__128)


In [6]:

list_of_companies = os.listdir(r"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Sustainability Reports\To Do")
one_company = list_of_companies[0:1]

for company in one_company:
    try:
        ## BOW / TF-IDF
        print(f"We process now {company}")

        text_1, company_name, path, source = pdf_loader(company)
        prep_text = preprocessing(text_1)
        lemma_words = lemmatize_words(prep_text)
        text_cleaned = remove_small_tokens(lemma_words)
        top_n_words = get_top_n_words(text_cleaned, 20)
        top_n_words = list(top_n_words.itertuples(index=False, name=None))
        top_n_words
        tf_idf = get_tf_idf(text_cleaned, 20)
        tf_idf = tf_idf.reset_index()
        tf_idf = tf_idf.rename(columns={"index": "Word"})
        idf_list = list(tf_idf.itertuples(index=False, name=None))
        idf_list
        data = [(company_name, idf_list, top_n_words)]
        df1 = pd.DataFrame(data, columns = ["Company Name", "TF-IDF", "Top N Words"])
        # print(df1)
        company_name_new = f"{df1['Top N Words'][0][0][0]}_{df1['Top N Words'][0][1][0]}_{df1['Top N Words'][0][2][0]}"
        print(company_name_new)
        # df1.to_csv(
            # fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Results\Human Rights Alternative\{company_name_new}_bow_tf_ifd.csv")

        ## Full Text Topic Modeling

        # company_name = company
        # text = pdf_loader(company)
        # text = preprocessing_text(text)
        # result_df = topic_modeler(text)
        # result_df.to_csv(
        #     fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Results\Human Rights Alternative\{company_name_new}_FullText_TopicModeling.csv")

        ## Full Text Similarity Scoring

        text_1, company_name, path, source = pdf_loader(company)
        text_2 = preprocessing_text(text_1)
        result_df = results_for_each_topic(text_2)
        result_df.to_csv(
            fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Results Bulldozer\Testing\{company_name_new}_FullText_Cosine_Scoring all-MiniLM-L6-v2_all.csv")


        ## Paragraphed Similarity Scoring

        # text_1, company_name, path, source = pdf_loader(company)
        # df_1 = preprocessing_df(text_1)
        # classification_ls, short_ls = classifier_pipeline(df_1)
        # nested_label_list = rebuilding(classification_ls, short_ls)
        # report_df = labeling_nested_list(nested_label_list)
        # results_df = scoring(report_df)
        # results_df.to_csv(
        #     fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Results\{company_name_new}_Paragraphed_Cosine_Scoring.csv")
        # shutil.move(
        #     fr"{path}", fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Sustainability Reports\Already Done\{company_name}.pdf")
        print(f"{company_name_new} SAFED")
    except:
        # shutil.move(
        #     fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Sustainability Reports\To Do\{company}", fr"G:\Meine Ablage\HSLU\Master\Master Thesis\Coding Area\Data\Sustainability Reports\Not Working\{company}")
        print(f"{company} didn't work")

We process now annual-and-sustainability-report-2020.pdf
volvo_group_financial
volvo_group_financial SAFED
