In [1]:
import warnings
from importlib import reload

# Suppress every Python warning raised from here on, so library warnings
# do not clutter cell outputs. This hides all warning categories, not just
# a selected one.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import sys
from tqdm.auto import tqdm, trange
from bs4 import BeautifulSoup
import os

# Load the results CSV into a DataFrame. The path is relative to the
# notebook's working directory.
df = pd.read_csv('concurrent_requisition_results.csv')

In [4]:
# (rows, columns) of the loaded frame
df.shape

(124449, 14)

In [5]:
# Column labels available in the loaded frame
df.columns

Index(['request_id', 'total', 'error', 'subject', 'body', 'plain_body',
       'mailbox', 'mail', 'host', 'full', 'date', 'uid', 'messageId',
       'is_attachments_exists'],
      dtype='object')

In [6]:
# One row per messageId: the first occurrence is retained (the default
# `keep` policy), every later repeat is dropped.
df_unique = df.drop_duplicates(subset=['messageId'])


In [7]:
# (rows, columns) after dropping repeated messageId values
df_unique.shape

(560, 14)

## Split HTML bodies into messages

In [None]:
import pandas as pd
import sys
from tqdm.auto import tqdm, trange
from bs4 import BeautifulSoup
from html_messages_parser import get_element_messages
import os

# Split every e-mail's HTML body into its individual messages, flag each
# message as offer / system message, and append the result to a CSV.

# Load the dataset
df = pd.read_csv('concurrent_requisition_results.csv')

# Only rows that actually carry a body can be parsed.
df_filtered = df[df['body'].notna()]
# NOTE(review): earlier cells show 124449 rows but only 560 unique messageId
# values. This loop processes every row with a body, repeats included --
# confirm that is intended rather than iterating a de-duplicated frame.

# Set up the CSV file
output_file = 'classified_messages.csv'

# Fields copied unchanged from the source row (everything except body and
# plain_body), followed by the per-message fields produced below.
passthrough_columns = [
    'request_id', 'total', 'error', 'subject', 'mailbox', 'mail', 'host',
    'full', 'date', 'uid', 'messageId', 'is_attachments_exists',
]
columns = passthrough_columns + [
    'message_index', 'message_text', 'is_offer', 'is_system_message'
]

# Write the header only if the file is missing or empty.
# NOTE(review): an existing non-empty file is appended to, so re-running this
# cell duplicates rows; delete the file first for a clean run.
file_exists = os.path.exists(output_file)
write_header = not file_exists or os.path.getsize(output_file) == 0
if write_header:
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)

# Counters for successes and failures, plus the details of every failure so
# they can be inspected after the run instead of being silently discarded.
success_count = 0
error_count = 0
failed_rows = []

# Create progress bars
pbar_total = tqdm(total=len(df_filtered), desc="Total Processing", position=0, leave=True)
pbar_errors = tqdm(total=0, desc="Failed Handlings", position=1, leave=True, bar_format='{l_bar}{bar}| {n} failed')

try:
    # Process each row
    for _, row in df_filtered.iterrows():
        try:
            soup = BeautifulSoup(row['body'], "html.parser")
            # Fall back to the whole document when there is no <body> tag.
            body_tag = soup.find('body')
            root_element = body_tag if body_tag is not None else soup
            messages = get_element_messages(root_element)

            shared_fields = {name: row[name] for name in passthrough_columns}

            # Build all rows for this e-mail first (index 1 is the last entry
            # of `messages`), so a failure midway leaves no partial output
            # and the file is opened once per e-mail instead of per message.
            new_rows = [
                {
                    **shared_fields,
                    'message_index': index,
                    'message_text': message,
                    'is_offer': "Offer-Nr.:" in message,
                    'is_system_message': "Your inquiry is processed under the number:" in message,
                }
                for index, message in enumerate(reversed(messages), start=1)
            ]

            if new_rows:
                # Explicit column order keeps the rows aligned with the header.
                pd.DataFrame(new_rows, columns=columns).to_csv(
                    output_file, index=False, mode='a', header=False
                )
            success_count += 1
        except Exception as e:
            # Best-effort processing: keep going, but remember what failed.
            error_count += 1
            failed_rows.append({
                'messageId': row.get('messageId'),
                'uid': row.get('uid'),
                'error': repr(e),
            })
            pbar_errors.update(1)
        finally:
            pbar_total.update(1)
finally:
    # Close the progress bars even if the loop is interrupted.
    pbar_total.close()
    pbar_errors.close()
