In [1]:
import warnings
from importlib import reload

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides warnings raised by any later cell; consider
# narrowing the filter to the specific category/module that is noisy.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import sys
from tqdm.auto import tqdm, trange
from bs4 import BeautifulSoup
import os

# Load the dataset
# NOTE: `df` is rebound by several later cells to other CSV files, so what it
# holds depends on which cell was executed last.
df = pd.read_csv('concurrent_requisition_results.csv')

In [4]:
df.shape

(124449, 14)

In [5]:
df.columns

Index(['request_id', 'total', 'error', 'subject', 'body', 'plain_body',
       'mailbox', 'mail', 'host', 'full', 'date', 'uid', 'messageId',
       'is_attachments_exists'],
      dtype='object')

In [6]:
# Keep one row per messageId (the first occurrence in file order).
# NOTE(review): drop_duplicates treats missing values as equal to each other,
# and a value_counts output further down shows a literal '$null' messageId
# occurring several times, so rows without a real ID may be collapsed into a
# single row here. Confirm that is intended.
df_unique = df.drop_duplicates(subset='messageId', keep='first')


In [7]:
df_unique.shape

(560, 14)

## Split HTML bodies into messages

In [109]:
import pandas as pd
import hashlib
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import os

import sys  # imported in this cell too, so it also runs on a fresh kernel

# Make the project-local HTML parser importable.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
utils_dir = '/Users/valuamba/projs/components_agent_sales/app/utils'
if utils_dir not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.append(utils_dir)
from html_messages_parser import get_element_messages

# --- Configuration -----------------------------------------------------------
INPUT_FILE = 'requisitions_results_by_client.csv'
START_ROW = 4000  # rows before this position are not processed
output_file = 'classified_messages.csv'
MAX_FAILURES_SHOWN = 20  # cap on failure lines printed in the final summary

# Fields copied unchanged from the source email row (body and plain_body are dropped).
source_columns = [
    'request_id', 'total', 'error', 'subject', 'mailbox', 'mail', 'host',
    'full', 'date', 'messageId', 'is_attachments_exists',
]
# Fields computed for every message extracted from an email body.
derived_columns = [
    'message_index', 'message_text', 'is_offer', 'is_system_message',
    'message_hash',
]
columns = source_columns + derived_columns

# --- Load and select the emails to process -----------------------------------
df = pd.read_csv(INPUT_FILE)
df = df.iloc[START_ROW:]
df_unique = df.drop_duplicates(subset='messageId', keep='first')

# Filter rows where body is not null
df_filtered = df_unique[df_unique['body'].notna()]

# --- Prepare the output file -------------------------------------------------
file_exists = os.path.exists(output_file)
write_header = not file_exists or os.path.getsize(output_file) == 0

if write_header:
    # Fresh (or empty) file: start it with the header row.
    pd.DataFrame(columns=columns).to_csv(output_file, index=False)
    processed_ids = set()
else:
    # Rows are appended without a header, so the existing header must match
    # exactly; otherwise values would silently land in the wrong columns.
    existing_header = pd.read_csv(output_file, nrows=0).columns.tolist()
    if existing_header != columns:
        raise ValueError(
            f"{output_file} already exists with header {existing_header}, "
            f"which differs from the columns written by this cell {columns}. "
            "Move or delete the file and re-run."
        )
    # Emails already present in the output are skipped, so re-running the cell
    # does not append the same messages again.
    processed_ids = set(
        pd.read_csv(output_file, usecols=['messageId'])['messageId'].dropna().astype(str)
    )

already_done = df_filtered['messageId'].astype(str).isin(processed_ids)
df_pending = df_filtered[~already_done]
skipped_count = int(already_done.sum())

# Initialize counters for successes and failures
success_count = 0
error_count = 0
failures = []  # one human-readable line per email that could not be processed

# Create progress bars
pbar_total = tqdm(total=len(df_pending), desc="Total Processing", position=0, leave=True)
pbar_errors = tqdm(total=0, desc="Failed Handlings", position=1, leave=True, bar_format='{l_bar}{bar}| {n} failed')

try:
    # Process each email
    for _, row in df_pending.iterrows():
        try:
            soup = BeautifulSoup(row['body'], "html.parser")
            body_tag = soup.find('body')
            # Fall back to the whole document when there is no <body> element.
            root_element = body_tag if body_tag is not None else soup
            # presumably returns a list of message strings, oldest first; verify in html_messages_parser
            messages = get_element_messages(root_element)

            shared_fields = {name: row[name] for name in source_columns}

            # Loop through messages from latest to oldest
            new_rows = []
            for index, message in enumerate(reversed(messages), start=1):
                new_rows.append({
                    **shared_fields,
                    'message_index': index,
                    'message_text': message,
                    'is_offer': "Offer-Nr.:" in message,
                    'is_system_message': "Your inquiry is processed under the number:" in message,
                    # Hash of the message text, used below to spot repeated messages
                    'message_hash': hashlib.sha256(message.encode('utf-8')).hexdigest(),
                })

            # Write all rows of this email in one append, and only after every
            # message was built, so a failing email leaves no partial output.
            if new_rows:
                pd.DataFrame(new_rows, columns=columns).to_csv(
                    output_file, index=False, mode='a', header=False
                )
            success_count += 1
        except Exception as e:  # includes AssertionError; keep going with the next email
            error_count += 1
            failures.append(
                f"Error processing row with request_id: {row['request_id']}, error: {e!r}"
            )
            pbar_errors.update(1)
        finally:
            pbar_total.update(1)
finally:
    # Close the progress bars even if the run is interrupted
    pbar_total.close()
    pbar_errors.close()

# Report what happened instead of discarding the counters and error details.
print(
    f"Processed {success_count} emails, {error_count} failed, "
    f"{skipped_count} skipped (already in {output_file})"
)
for failure in failures[:MAX_FAILURES_SHOWN]:
    print(failure)
if len(failures) > MAX_FAILURES_SHOWN:
    print(f"... and {len(failures) - MAX_FAILURES_SHOWN} more failures")

# Check for repeated messages
df_classified = pd.read_csv(output_file)
repeated_messages = df_classified[df_classified.duplicated(subset='message_hash', keep=False)]
print(repeated_messages)


Total Processing:   0%|          | 0/41094 [00:00<?, ?it/s]

Failed Handlings: |          | 0 failed

Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquote
Table blockquo

In [95]:
# Reload the raw per-client export (this rebinds the shared `df`).
df = pd.read_csv('requisitions_results_by_client.csv')


# Number of rows per request_id, most frequent first.
value_counts = df['request_id'].value_counts()

print(value_counts)

request_id
407887    20
299627    20
463759    20
300254    20
413964    20
          ..
433444     1
343033     1
328017     1
460014     1
403644     1
Name: count, Length: 26708, dtype: int64


In [102]:
# Inspect how often each message hash occurs in the classifier output.
df = pd.read_csv('classified_messages.csv')

value_counts = df['message_hash'].value_counts()

# NOTE(review): the recorded output is an empty Series, i.e. message_hash held
# no non-null values in the file at the time this cell ran.
print(value_counts)

# df

Series([], Name: count, dtype: int64)


In [52]:
# body = df_filtered[df_filtered['request_id'] == 444025].loc[5, 'body']

# Debug a single email: re-run the HTML splitting for request 444025.
row = df_filtered[df_filtered['request_id'] == 444025]
# Second matching row by position; raises IndexError if fewer than two rows match.
body = row.iloc[1]['body']

# Parse the HTML and use <body> when present, otherwise the whole document.
soup = BeautifulSoup(body, "html.parser")
root_element = soup.find('body') if soup.find('body') else soup
messages = get_element_messages(root_element)

In [None]:
messages

In [45]:
# Show the raw bodies of every row belonging to request 444025.
row = df_filtered[df_filtered['request_id'] == 444025]
row['body']

38    <html xmlns:v="urn:schemas-microsoft-com:vml" ...
39    <!DOCTYPE html>\r\n<html>\r\n  <head>\r\n\r\n ...
Name: body, dtype: object

38    <html xmlns:v="urn:schemas-microsoft-com:vml" ...
39    <!DOCTYPE html>\r\n<html>\r\n  <head>\r\n\r\n ...
Name: body, dtype: object


In [57]:
import pandas as pd

df = pd.read_csv('requisitions_results_by_client.csv')

In [58]:
df.shape

(38993, 14)

In [40]:
df_filtered

Unnamed: 0,client_id,request_id,total,error,messageId,date,subject,body,plain_body,mail,mailbox,host,full,is_attachments_exists
2,117202,458567,1,,SIDGdsH3my8SsfOFK6uUuWk7cwSj1nbYMovSaq1gWs@fam...,2023-11-29 11:01:08,"Offer ???458567 29.11.2023, ELGO Electronic ||...",<style>\n @page{header:otherHeader;foot...,,kg1@famaga.de,kg1,famaga.de,kg1@famaga.de,True
5,34051,441419,1,,rKYnmaDCNDb6qIGq1qHclzoJxcXwnJOYU8Q5PaFbFM@fam...,2023-09-26 12:59:59,"Angebot ???441419 26.09.2023, MD Micro Detecto...",<style>\n @page{header:otherHeader;foot...,,lb@famaga.de,lb,famaga.de,lb@famaga.de,True
6,28413,441386,1,,fUwMLM1tE3tAdlk6s8aCrn0qk5trZQ6uoolIHgDdk@fama...,2023-09-26 19:24:37,"Angebot ???441386 26.09.2023, KNOLL Maschinenb...",<style>\n @page{header:otherHeader;foot...,,lb@famaga.de,lb,famaga.de,lb@famaga.de,True
9,57603,426202,4,,RO2PR80MB6722C11ACF560F8930548572CEF2A@RO2PR80...,2023-09-11 09:52:11,"RES: RES: Offer ???426202 10.08.2023, Maxon Mo...","<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...",Bom Dia Adrian tudo bem?\r\n\r\nA pessoa que t...,RICARDOLINS@natura.net,RICARDOLINS,natura.net,RICARDO DE OLIVEIRA NOBRE LINS <RICARDOLINS@na...,False
10,57603,426202,4,,RO2PR80MB6722547F098F0CD4A928C8DFCEE7A@RO2PR80...,2023-08-29 11:30:10,"RES: Offer ???426202 10.08.2023, Maxon Motors ...","<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...",Olá bom dia tudo bem?\r\n\r\nComo a FAMAGA não...,RICARDOLINS@natura.net,RICARDOLINS,natura.net,RICARDO DE OLIVEIRA NOBRE LINS <RICARDOLINS@na...,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28780,85567,447807,17,,!&!AAAAAAAAAAAuAAAAAAAAABEYSunxbTBAgLc+RX6Zd00...,2023-10-20 12:53:08,"RE: Offer ???447807 19.10.2023, O.M.B. Vibrato...","<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...","Dear Kristine,\r\n\r\n \r\n\r\nPlease find it ...",milos.pusara@trb.ba,milos.pusara,trb.ba,Milo?? Pu??ara <milos.pusara@trb.ba>,False
28781,85567,447807,17,,!&!AAAAAAAAAAAuAAAAAAAAABEYSunxbTBAgLc+RX6Zd00...,2023-10-20 11:31:29,"RE: Offer ???447807 19.10.2023, O.M.B. Vibrato...","<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...","Dear Kristine,\r\n\r\n \r\n\r\nI forgot to att...",milos.pusara@trb.ba,milos.pusara,trb.ba,Milo?? Pu??ara <milos.pusara@trb.ba>,True
28782,85567,447807,17,,!&!AAAAAAAAAAAuAAAAAAAAABEYSunxbTBAgLc+RX6Zd00...,2023-10-20 10:42:41,"RE: Offer ???447807 19.10.2023, O.M.B. Vibrato...","<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...","Dear Kristine,\r\n\r\n \r\n\r\nThank you very ...",milos.pusara@trb.ba,milos.pusara,trb.ba,Milo?? Pu??ara <milos.pusara@trb.ba>,False
28786,82509,481551,2,,urx8A6o8wihgwuncJh75zqy7bXf9hzwST2uZLPzS8g@fam...,2024-02-16 08:54:37,"Offer ???481551 16.02.2024, Rechner || FAMAGA ...",<style>\r\n @page{header:otherHeader;fo...,,kg1@famaga.de,kg1,famaga.de,Kristine Gergaia <kg1@famaga.de>,True


In [33]:
# Count rows per messageId to see which IDs are shared by several rows.
value_counts = df['messageId'].value_counts()

print(value_counts)

messageId
!&!AAAAAAAAAAAYAAAAAAAAAN/Kx0v9g1FDrdYHKwe1B63CgAAAEAAAAMqVeB6JRS9JnJqYNPvLMPIBAAAAAA==@delva.it               9
003001d9e547$6df016b0$49d04410$@hamburg.de                                                                     8
$null                                                                                                          4
!&!AAAAAAAAAAAYAAAAAAAAALcoauLnBlBFjUeUPUIYsKTCgAAAEAAAABSed17ETihMjXbtZJl8LyIBAAAAAA==@ayamaprojects.co.za    3
001601d9ca0c$08500320$18f00960$@eletrofusao.com.br                                                             3
                                                                                                              ..
3gh8ALQ431KeCmzZkf8S1vn7Vxvvsc73QoxfH7HhA@famaga.org                                                           1
014601d9dcc7$27edf890$77c9e9b0$@subministresmoragas.cat                                                        1
016c01d9dcca$36f30120$a4d90360$@subministresmoragas.cat                               

In [None]:
# Non-offer, non-system messages whose text contains "discount".
df = pd.read_csv('classified_messages.csv')

# Keep rows that have text and are flagged neither as offer nor as system message.
has_text = df['message_text'].notna()
not_offer = df['is_offer'] == False
not_system = df['is_system_message'] == False
df_filtered = df[has_text & not_offer & not_system]

mentions_discount = df_filtered['message_text'].str.contains('discount', na=False)
cl_discount = df_filtered[mentions_discount]

# Show everything after the first 100 matches.
cl_discount.iloc[100:]

In [108]:
df.loc[200, 'message_text']

False

In [81]:
import re

# Keywords to look for in message texts; extend this list to widen the search.
keywords = [
    "discount" ]


# Create a single regex pattern from all keywords, escaped so they match literally.
# No \b word boundaries on purpose: the extracted text can glue words together
# (e.g. "linkKindly") and inflected forms such as "discounted" should still match.
keyword_regex = '|'.join(map(re.escape, keywords))
pattern = re.compile(keyword_regex, re.IGNORECASE)

df = pd.read_csv('classified_messages.csv')

df_filtered = df[df['message_text'].notna()]

# Drive the filter from `keywords` (previously a hard-coded, case-sensitive
# 'discount' was used and the compiled pattern was never applied).
cl_discount = df_filtered[
    df_filtered['message_text'].str.contains(keyword_regex, case=False, regex=True, na=False)
    & (df_filtered['is_offer'] == False)
    & (df_filtered['is_system_message'] == False)
]

# Display the result
cl_discount

Unnamed: 0,request_id,total,error,subject,mailbox,mail,host,full,date,uid,messageId,is_attachments_exists,message_index,message_text,is_offer,is_system_message
17,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
790,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
1580,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
2373,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
3154,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164013,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
164823,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
165610,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False
166378,502469,1,,Re: Fwd: Reg - Sprecher & Schuh D7M-LF3 KUN118...,kg1,kg1@famaga.de,famaga.de,Kristine Gergaia <kg1@famaga.de>,2024-04-23 15:18:51,156738.0,xO0MktrULKpXNf2GhLEhCZOZSVucAYPnAbyQDkUF3sY@fa...,False,1,-------- Forwarded Message --------Subject:Reg...,False,False


In [94]:
# Rows per request_id in whichever frame `df` currently holds.
# NOTE(review): the recorded counts (up to 19474 rows for one request) suggest
# heavy duplication in that frame, presumably the classifier output; confirm.
value_counts = df['request_id'].value_counts()

print(value_counts)

request_id
502210    19474
383005     6862
504509     3392
504547     2954
504769     2332
          ...  
416614        1
453508        1
484506        1
433227        1
447878        1
Name: count, Length: 9379, dtype: int64


In [84]:
# NOTE: only the last expression of a cell is displayed, so the head() result
# is discarded and just the shape is shown.
cl_discount.head()
cl_discount.shape

(214, 16)

In [86]:
# row = df_filtered[df_filtered['request_id'] == 502469]
# Print the full message text of the match at position 100.
row = cl_discount.iloc[100]

print(row['message_text'])

-------- Forwarded Message --------Subject:Reg - Sprecher & Schuh D7M-LF3Date:Tue, 23 Apr 2024 11:10:00 +0300From:Operations Minetrade Ltd<operations@minetrade.co.tz>To:info@famaga.co.zaGood
          day team,Can you please share price and
            availability for the item in the below linkKindly give your best discounted
            price so that we can proceed further with the payment.Sprecher &
                    Schuh D7M-LF3 ILLUMINATED3EAhttps://www.southerncontrols.com/products/D7M-LF3Kind
                              Regards,Vivek
                              ArputharajMine
                              Trade Ltd | Dar Es Salaam Branch |
                              Mshihiri Street |Opp.
                              Apollo Hospital |T:
                              +255 22 2131559 |M:
                              +255 757 246 699 |Operations@minetrade.co.tz


In [85]:
# Print the full message text of the match at position 200.
# NOTE(review): the recorded output is the same text as for position 100, so
# cl_discount contains repeated copies of one message.
row = cl_discount.iloc[200]

print(row['message_text'])

-------- Forwarded Message --------Subject:Reg - Sprecher & Schuh D7M-LF3Date:Tue, 23 Apr 2024 11:10:00 +0300From:Operations Minetrade Ltd<operations@minetrade.co.tz>To:info@famaga.co.zaGood
          day team,Can you please share price and
            availability for the item in the below linkKindly give your best discounted
            price so that we can proceed further with the payment.Sprecher &
                    Schuh D7M-LF3 ILLUMINATED3EAhttps://www.southerncontrols.com/products/D7M-LF3Kind
                              Regards,Vivek
                              ArputharajMine
                              Trade Ltd | Dar Es Salaam Branch |
                              Mshihiri Street |Opp.
                              Apollo Hospital |T:
                              +255 22 2131559 |M:
                              +255 757 246 699 |Operations@minetrade.co.tz


In [None]:
df

In [27]:
# Keep only rows with a non-null body.
# NOTE(review): the "(3264, 14)" recorded below cannot come from this
# assignment alone; it looks like stale output from an earlier version of the
# cell (e.g. one ending in df_filtered.shape).
df_filtered = df[df['body'].notna()]


(3264, 14)

In [None]:
df