In [None]:
import pickle
import pandas as pd
import gensim
from gensim.parsing.preprocessing import (stem_text)
import pickle
import pandas as pd
import re
import string
import os


In [None]:
# Load the scraped contract data from the project's data folder.
# NOTE(review): unpickling can execute arbitrary code — only load this file if its origin is trusted.
df = pd.read_pickle("../data/data_scraped_input.pkl")


# Quick look at the first rows to check that the load worked.
print(df.head())

In [None]:
# Print one sample cell (row position 13, column position 5).
# Column position 5 is the column a later cell selects and renames to 'content'.
print(df.iloc[13,5])

Since we want to classify paragraphs rather than whole contracts, we need to split the contracts into smaller chunks.

In [None]:
def extract_paragraphs_and_sections(row):
    """Split one contract into paragraph/section records.

    A new paragraph starts at every line that begins with a marker such as
    "§ 14" or "14." followed by whitespace. Lines before the first marker are
    dropped. Within a paragraph, sections are detected by ids of the form
    "<paragraph number>.<digits>" (optionally followed by a dot) that are
    preceded by whitespace/start of text and followed by whitespace.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row['content']`` (contract text) and ``row['contract']``
        (contract id).

    Returns
    -------
    list of dict
        One dict per section with the keys 'contract', 'paragraph' (marker as
        written in the text), 'paragraph_content', 'section' and
        'section_content'. A paragraph without any section id yields a single
        dict whose 'section' is "no sections use paragraph" and whose
        'section_content' equals the paragraph text. Non-string content
        (e.g. None/NaN) yields an empty list.
    """
    text = row['content']
    contract_id = row['contract']
    if not isinstance(text, str):
        # Missing content has nothing to split.
        return []

    # 1. extract paragraphs as (number, joined text, marker as written)
    paragraphs = []
    current_para_lines = []
    current_para_number = None
    current_para_match = None

    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue

        match_main = re.match(r'(§\s*\d+|^\d+\.)\s+', line)

        if match_main:
            if current_para_lines:
                paragraphs.append((current_para_number, ' '.join(current_para_lines), current_para_match))
            # Reduce the marker to the bare number, e.g. "14". The final strip()
            # is required for "§ 14": removing '§' leaves " 14", and a leading
            # space inside the section regex below would prevent every match.
            current_para_number = match_main.group(1).lstrip('§').rstrip('.').strip()
            current_para_match = match_main.group(0).strip()
            current_para_lines = [line]
        elif current_para_lines:
            # Continuation line of the paragraph that is currently open.
            current_para_lines.append(line)

    if current_para_lines:
        paragraphs.append((current_para_number, ' '.join(current_para_lines), current_para_match))

    # 2. split every paragraph into its sections
    rows = []
    seen_sections = set()  # (contract_id, para_num, section_id)

    for para_num, para_text, para_match in paragraphs:
        section_pattern = rf'(?:(?<=\s)|(?<=^))({re.escape(para_num)}\.\d+)(?:\.|\b)(?=\s)'
        matches = list(re.finditer(section_pattern, para_text))

        if not matches:
            rows.append({
                'contract': contract_id,
                'paragraph': para_match,
                'paragraph_content': para_text.strip(),
                'section': "no sections use paragraph",
                'section_content': para_text.strip()
            })
            continue

        positions = []
        for match in matches:
            section_id = match.group(1)

            # Only the first occurrence of a section id opens a section; later
            # occurrences (e.g. cross references) stay inside the running text.
            section_key = (contract_id, para_num, section_id)
            if section_key in seen_sections:
                continue

            seen_sections.add(section_key)
            positions.append((match.start(), section_id))

        # Sentinel end position so that the last section runs to the end of the
        # paragraph. NOTE: if every id was already seen, only the sentinel is
        # left and the paragraph contributes no rows.
        positions.append((len(para_text), None))
        positions = sorted(positions)

        for i in range(len(positions) - 1):
            start_pos, section_id = positions[i]
            end_pos = positions[i + 1][0]

            rows.append({
                'contract': contract_id,
                'paragraph': para_match,
                'paragraph_content': para_text.strip(),
                'section': section_id,
                'section_content': para_text[start_pos:end_pos].strip()
            })

    return rows






def clean_paragraph_text(text):
    """Strip numbering, punctuation and superfluous whitespace from a text.

    Parameters
    ----------
    text : str
        Paragraph or section text. Non-string input (e.g. None/NaN) is treated
        as empty.

    Returns
    -------
    str
        The text without a leading paragraph marker ('§ 1', '1.'), without
        section numbers of the form '1.1' / '1.1.' that are followed by
        whitespace, without ASCII punctuation (``string.punctuation``), and
        with whitespace runs collapsed to single spaces.
    """
    if not isinstance(text, str):
        return ''

    # 1. remove the leading paragraph marker, e.g. '§ 1' or '1.'
    text = re.sub(r'^(§\s*\d+|\d+\.)\s+', '', text)
    # Remove section numbers. The optional trailing dot covers the "14.1." form;
    # without it the digits would survive and be fused to "141" once the
    # punctuation is stripped below.
    text = re.sub(r'(?<!\d)\d+\.\d+\.?(?=\s)', '', text)

    # 2. remove punctuation (ASCII only)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text)

    # 4. remove whitespace at the beginning and end
    return text.strip()




In [None]:
# filter df to relevant contracts
relevant_mask = (
    (df['Kategorie'] == "kleinere SaaS-Anbieter (Hauptgruppe)")
    & (df['Sprache'] == "DE")
    & (df['Quelle/Organisation'] != "Comarch ERP XT")
)
# Keep only column position 5 (renamed to 'content' below; presumably the contract
# text — TODO confirm against the input schema). .copy() detaches the result from
# `df`, so adding the 'contract' column does not raise SettingWithCopyWarning.
df_relevant = df.loc[relevant_mask].iloc[:, [5]].copy()
df_relevant.columns = ['content']
# Running contract id, starting at 1, in the order of the filtered rows.
df_relevant["contract"] = range(1, df_relevant.shape[0] + 1)
df_relevant = df_relevant[['contract', 'content']]
df_relevant.head()

In [None]:
from itertools import chain

# Apply the paragraph and section extractor to every contract row;
# each row yields a list of record dicts.
df_exploded = df_relevant.apply(extract_paragraphs_and_sections, axis=1)
print(df_exploded.head())

# Flatten the per-contract lists into one list of records and build the
# structured frame from it in a single step.
flattened_rows = list(chain.from_iterable(df_exploded))
df_structured = pd.DataFrame(flattened_rows)

In [None]:
def extract_title_fixed(group):
    """Derive the paragraph title for one (contract, paragraph) group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single paragraph with the columns 'paragraph_content',
        'section_content' and 'section'.

    Returns
    -------
    pandas.Series
        The same title repeated for every row, indexed like ``group``.

        * Paragraph without sections: the text before the first
          "article + capitalised word" (e.g. "Die Maßnahmen"), or the first
          eight words when no such pattern is found.
        * Paragraph with sections: whatever remains of the paragraph text
          after every section text has been removed.
    """
    paragraph_text = group['paragraph_content'].iloc[0]
    section_texts = group['section_content'].tolist()

    # No sections (single paragraph)
    if len(section_texts) == 1 and group['section'].iloc[0] == "no sections use paragraph":
        # The title is assumed to end where the first sentence starts, i.e. at
        # a German article followed by a capitalised word. 'ß' is part of the
        # lowercase class so that words such as "Maßnahmen" are recognised.
        match = re.search(r'\b(Der|Die|Das|Es|Ein|Eine)\s+[A-ZÄÖÜ][a-zäöüß]+\b', paragraph_text)
        if match:
            title = paragraph_text[:match.start()].strip()
        else:
            # Fallback: use the first 8 words
            title = ' '.join(paragraph_text.split()[:8])
        return pd.Series([title] * len(group), index=group.index)

    # Section split: strip every section text; the remainder is the title.
    for section in section_texts:
        paragraph_text = paragraph_text.replace(section, '')
    title = paragraph_text.strip()
    return pd.Series([title] * len(group), index=group.index)


# One title per (contract, paragraph) group, broadcast to every row of the group.
df_structured['paragraph_title'] = df_structured.groupby(['contract', 'paragraph'], group_keys= False).apply(extract_title_fixed)

# select cols; .copy() yields an independent frame so that the column
# assignments below do not write into a slice (avoids SettingWithCopyWarning).
df_structured = df_structured[
    ['contract', 'paragraph', 'paragraph_title', 'paragraph_content', 'section', 'section_content']
].copy()

# remove the paragraph marker (e.g. "§ 1") from the title
df_structured['paragraph_title'] = df_structured.apply(
    lambda row: row['paragraph_title'].replace(row['paragraph'], '').strip() if pd.notnull(row['paragraph_title']) else '',
    axis=1
)


df_structured["clean_paragraph_content"] = df_structured["paragraph_content"].apply(clean_paragraph_text)
df_structured["clean_section_content"] = df_structured["section_content"].apply(clean_paragraph_text)


def _remove_title(cleaned_text, title):
    """Remove the paragraph title from an already cleaned text.

    The title is passed through clean_paragraph_text first: the cleaned text no
    longer contains punctuation, so a raw title such as
    "Haftung, Gewährleistung" would never be found in it.
    """
    if pd.isnull(cleaned_text):
        return ''
    return cleaned_text.replace(clean_paragraph_text(title), '').strip()


df_structured["clean_paragraph_content"] = df_structured.apply(
    lambda row: _remove_title(row["clean_paragraph_content"], row['paragraph_title']),
    axis=1
)

df_structured["clean_section_content"] = df_structured.apply(
    lambda row: _remove_title(row["clean_section_content"], row['paragraph_title']),
    axis=1
)

print(df_structured.shape)

In [None]:
# Load the spaCy pipeline that a later cell uses for lemmatisation.
# NOTE(review): the package name (en_core_web_sm) suggests an English model, while the
# contracts were filtered to Sprache == "DE" earlier — confirm this is the intended model.
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
# Load the tokenizer that later cells use to build the *_token columns.
# NOTE(review): 'bert-base-uncased' is presumably an English, lower-casing vocabulary;
# the contracts were filtered to Sprache == "DE" — verify that this checkpoint is intended.
from transformers import BertTokenizer
bert_uncased = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Three normalised variants of the cleaned paragraph text:
# stemmed string, lemmatised string, and list of tokenizer tokens.
df_structured["paragraph_content_stemm"] = df_structured['clean_paragraph_content'].apply(stem_text)

df_structured["paragraph_content_lemma"] = df_structured['clean_paragraph_content'].apply(
    # whitespace-only tokens are skipped so the joined string has single spaces
    lambda cleaned: " ".join(tok.lemma_ for tok in nlp(cleaned) if not tok.is_space)
)

df_structured["paragraph_content_token"] = df_structured['clean_paragraph_content'].apply(bert_uncased.tokenize)

In [None]:
# Same three variants for the cleaned section text.
# The lemma/token columns are named "paragraph_section_*" (not "section_content_*");
# the names are kept as they are because they end up in the exported files.
df_structured["section_content_stemm"] = df_structured['clean_section_content'].apply(stem_text)

df_structured["paragraph_section_lemma"] = df_structured['clean_section_content'].apply(
    # whitespace-only tokens are skipped so the joined string has single spaces
    lambda cleaned: " ".join(tok.lemma_ for tok in nlp(cleaned) if not tok.is_space)
)

df_structured["paragraph_section_token"] = df_structured['clean_section_content'].apply(bert_uncased.tokenize)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Concatenate all stemmed paragraphs into one corpus string for the word cloud.
topic_text = " ".join(df_structured["paragraph_content_stemm"].dropna())
wordcloud = WordCloud(background_color="white", width=800, height=400).generate(topic_text)

# Explicit figure/axes interface; the axes frame is hidden because the image is the plot.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Wordcloud – stemm", fontsize=16)
plt.show()

In [None]:
# Concatenate all lemmatised paragraphs into one corpus string for the word cloud.
topic_text = " ".join(df_structured["paragraph_content_lemma"].dropna())
wordcloud = WordCloud(background_color="white", width=800, height=400).generate(topic_text)

# Explicit figure/axes interface; the axes frame is hidden because the image is the plot.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Wordcloud – lemma", fontsize=16)
plt.show()

In [None]:
# The token column holds lists, so flatten them into one whitespace-separated
# corpus string before handing it to WordCloud.
topic_text = " ".join(
    word for tokens in df_structured["paragraph_content_token"].dropna() for word in tokens
)

wordcloud = WordCloud(width=800, height=400, background_color="white").generate(topic_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Title corrected from the truncated "toke"; no f-string needed without placeholders.
plt.title("Wordcloud – token", fontsize=16)
plt.show()

In [None]:
# Export the structured frame twice: as an Excel sheet (without the index column) ...
file_path = '../data/data_clean.xlsx'
df_structured.to_excel(file_path, index=False)

# ... and as a pickle; `file_path` is reused for the second target.
file_path = '../data/data_clean.pkl'  
df_structured.to_pickle(file_path)
