# Full Text Analysis with BERT

### Requirements

In [3]:
import collections
import io
import itertools
import os
import re
import shutil
import string
import unicodedata

import contractions
import gensim.downloader as api
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import spacy
import torch
import torchvision
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from pdfminer3.converter import PDFPageAggregator, TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer3.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from spacy.lang.en import English
from transformers import pipeline

sys.path.append(r"..")

from nlp_functions import (classifier, remove_colons, remove_digits, remove_n,
                           remove_redundant_whitespaces,
                           remove_strange_characters, remove_stripes,
                           text_loader)

nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Preprocessing

In [4]:
def preprocessing(text):
    """Wendet verschiedene Standard-Preprocessings auf den Text an."""

    text = remove_strange_characters(text)
    text = re.sub(r'\d+', '', text)


    text = remove_n(text)

    text = remove_colons(text)

    text = text.replace(r" .",".")
    text = text.replace(r"..",".")
    text = text.replace(r"...",".")

    text = remove_stripes(text)

    text = remove_redundant_whitespaces(text)
    return text

In [5]:
def lemmatize_words(text):
    """Formt Worte im Text in ihre Lemma um"""

    lemmatizer = WordNetLemmatizer()
    text = word_tokenize(text)

    lemma_list = []

    for word in text:
        lemma_word = lemmatizer.lemmatize(word)
        lemma_list.append(lemma_word)

    lemma_text = ' '.join(lemma_list)


    return lemma_text



[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\daveb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def remove_small_tokens(prep_text):
    """Entfernt Tokens, welche kleiner als Vier sind."""
    prep_text_before = prep_text
    prep_text_before = word_tokenize(prep_text_before)

    for word in prep_text_before:
        if len(word) <= 3:
            prep_text_before.remove(word)
    prep_text_list = ' '.join(prep_text_before)


    return prep_text_list


## Part 1: Bag of Words

In [7]:
def get_top_n_words(corpus, n=None):
    """Errechnet die N-Meistgenannten Worte (BOW)"""
    corpus=[corpus]

    vec = CountVectorizer(stop_words = 'english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    top_n_words_df = pd.DataFrame(words_freq, columns =['Word', 'Count'])

    return top_n_words_df.loc[:(n-1)]



# Part 2: TF_IDF

In [11]:
def get_tf_idf(text, n=None):
    """Errechnet den TF-IDF Score für die N-höchsten Scores"""
    text = re.findall(r'(?:\d[.]|[^.])*(?:[.]|$)', text)
    tfIdfVectorizer=TfidfVectorizer(use_idf=True)
    tfIdf = tfIdfVectorizer.fit_transform(text)
    df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return (df[:(n)])

# Auto Script

In [12]:
df = pd.read_csv(r"..\Data\Zieltexte\wiki_artikel_fulltext.csv", index_col=None)


In [13]:
df

Unnamed: 0.1,Unnamed: 0,Label,Text
0,0,sustainability,Sustainability is a societal goal that broadly...
1,1,human rights,Human rights are moral principles or norms for...
2,2,fraud,"In law, fraud is intentional deception to secu..."
3,3,social issues,A social issue is a problem that affects many ...
4,4,labour law,Labour laws (also known as labor laws or emplo...


In [14]:
frames = []
topic_list = pd.Series.tolist(df["Label"])
text_list = pd.Series.tolist(df["Text"])

for topic, text in zip(topic_list, text_list):
    prep_text = preprocessing(text)
    lemma_words = lemmatize_words(prep_text)
    text_cleaned = remove_small_tokens(lemma_words)
    top_n_words = get_top_n_words(text_cleaned, 20)
    top_n_words = list(top_n_words.itertuples(index=False, name=None))
    tf_idf = get_tf_idf(text_cleaned, 20)
    tf_idf=tf_idf.reset_index()
    tf_idf = tf_idf.rename(columns={"index": "Word"})
    idf_list = list(tf_idf.itertuples(index=False, name=None))
    data = [(f"{topic}", idf_list, top_n_words)]
    df1 = pd.DataFrame(data, columns = ["Topic", "TF-IDF", "Top N Words"])
    frames.append(df1)



Text before cleaning:  1419734
<class 'str'>
Text after cleaning:  161365
<class 'str'>




Text before cleaning:  2408783
<class 'str'>
Text after cleaning:  280237
<class 'str'>




Text before cleaning:  1304877
<class 'str'>
Text after cleaning:  156662
<class 'str'>




Text before cleaning:  416889
<class 'str'>
Text after cleaning:  48684
<class 'str'>




Text before cleaning:  1372542
<class 'str'>
Text after cleaning:  160030
<class 'str'>




In [15]:
frames[0]["TF-IDF"][0]

[('sustainability', 0.29591566874854086),
 ('environmental', 0.22472979753639727),
 ('integrity', 0.21850133487134685),
 ('dimension', 0.203935251011822),
 ('planetary', 0.20071664116404878),
 ('loss', 0.1906867091204647),
 ('coexist', 0.15382737243694788),
 ('exceeding', 0.1478889550200678),
 ('most', 0.14560167679611352),
 ('over', 0.14560167679611352),
 ('time', 0.14146977279270365),
 ('safely', 0.1335808083412773),
 ('agree', 0.13114949649624702),
 ('regarded', 0.13114949649624702),
 ('accordingly', 0.12897461393960535),
 ('everyday', 0.12700719335294072),
 ('broadly', 0.1235588141144639),
 ('dominant', 0.12202905569463365),
 ('vary', 0.12060488467769495),
 ('commonly', 0.11572527843442822)]

In [16]:
frames

[            Topic                                             TF-IDF  \
 0  sustainability  [(sustainability, 0.29591566874854086), (envir...   
 
                                          Top N Words  
 0  [(sustainable, 1268), (social, 1148), (sustain...  ,
           Topic                                             TF-IDF  \
 0  human rights  [(they, 0.3156911031032052), (being, 0.2869773...   
 
                                          Top N Words  
 0  [(right, 3660), (human, 2564), (international,...  ,
    Topic                                             TF-IDF  \
 0  fraud  [(fraud, 0.38826095598392985), (perpetrator, 0...   
 
                                          Top N Words  
 0  [(fraud, 1708), (company, 582), (victim, 573),...  ,
            Topic                                             TF-IDF  \
 0  social issues  [(issue, 0.4078577368920272), (social, 0.26852...   
 
                                          Top N Words  
 0  [(social, 580), (loneliness, 275)

In [17]:
df_final = pd.concat(frames)


In [18]:
df_final

Unnamed: 0,Topic,TF-IDF,Top N Words
0,sustainability,"[(sustainability, 0.29591566874854086), (envir...","[(sustainable, 1268), (social, 1148), (sustain..."
0,human rights,"[(they, 0.3156911031032052), (being, 0.2869773...","[(right, 3660), (human, 2564), (international,..."
0,fraud,"[(fraud, 0.38826095598392985), (perpetrator, 0...","[(fraud, 1708), (company, 582), (victim, 573),..."
0,social issues,"[(issue, 0.4078577368920272), (social, 0.26852...","[(social, 580), (loneliness, 275), (people, 22..."
0,labour law,"[(stipulated, 0.27239828555675233), (empire, 0...","[(employee, 1319), (worker, 988), (employer, 9..."


In [19]:
df_final.to_csv(fr"..\Data\Resultate\Testfolder\TF-IDF Wiki\wiki_bow_tf_idf.csv")