In [4]:
import os
import sys
import pandas as pd
import dtale

import re
from bs4 import BeautifulSoup

In [2]:
file_path = r'data\email_3rd_june.parquet'
df = pd.read_parquet(file_path)
print(df.shape)

(10362, 9)


In [91]:
# d = dtale.show(df)
# d.open_browser()

In [3]:
df.head(2)

Unnamed: 0,id,from_email,to_email,cc_email,subject,body,date_ist,time_ist,projectid
0,6d30273a-6d16-4c9a-b207-31261df388ba,a.cao@forell.com,samiranm@pinnacleinfotech.com,"csutradhar@pinnaclecad.com, a.lin@forell.com",RE: 23-044 Drafting,"Hi Sam,\n\nI've uploaded more markups here:\nh...",2024-05-16,05:13:05,76461
1,6b7a51f9-201f-43fa-adc2-f23203ee00d5,a.cao@forell.com,samiranm@pinnacleinfotech.com,"csutradhar@pinnaclecad.com, a.lin@forell.com",RE: 23-043 Drafting,"Sam,\n\nI've uploaded more markups here:\nhttp...",2024-05-18,03:34:35,76461


In [5]:
def remove_disclaimer_text(body):
    """
    Remove disclaimer and classification notices from the email body.

    Args:
        body (str): The body of the email.
        
    Returns:
        str: The email body without disclaimer and classification notices.
    """
    disclaimer_pattern = [
        r"\*This e-mail has been classified as.*",
        r".*classification notice.*",
        r"From:\s.*Sent:\s.*To:\s.*"
    ]
    
    for pattern in disclaimer_pattern:
        body = re.sub(pattern, '', body, flags=re.DOTALL)
    
    return body

def extract_recent_email(email_body):
    """
    Extract the most recent part of an email thread above the signature or prior email content.

    Args:
        email_body (str): The body of the email.
        
    Returns:
        str: The most recent part text.
    """
    # Use BeautifulSoup to handle HTML email bodies.
    soup = BeautifulSoup(email_body, "html.parser")
    text = soup.get_text()
    lines = text.splitlines()

    # Regex patterns for detecting common signature lines and previous messages
    signature_patterns = [
        re.compile(r'^(-{2,}|_{2,})'),  # Signatures, like "-----Original Message-----" or "__"
        re.compile(r'^\s*(Sent from my [\w\s]+|Sent with [\w\s]+)$', re.IGNORECASE), # Mobile signatures
        re.compile(r'^On \d{4}/[0-1]\d/[0-3]\d,.*'),  # Replies, like "On 2021/12/01, ... wrote:"
        re.compile(r'^\s*From:\s*.*', re.IGNORECASE), # From header
        re.compile(r'^\s*Sent:\s*.*', re.IGNORECASE), # Sent header
        re.compile(r'^To:\s*.*', re.IGNORECASE),      # To header
        re.compile(r'^Subject:\s*.*', re.IGNORECASE), # Subject header
    ]

    # Find the position of signature or previous message markers
    for i, line in enumerate(lines):
        for pattern in signature_patterns:
            if pattern.match(line):
                return "\n".join(lines[:i]).strip()  # Return everything before the signature marker

    return "\n".join(lines).strip()

def extract_text_above_signature(body, from_email):
    """
    Extract text above the signature in the email body.

    Args:
        body (str): The body of the email.
        from_email (str): The sender's email address.
        
    Returns:
        str: The extracted text above signature patterns.
    """
    username = from_email.split('@')[0]  # Extract the username
    signature_patterns = [
        r"\nfrom:\s", r"\nsent:\s", r"\nto:\s", r"\ncc:\s", r"\nsubject:\s",
        r"\nregards,\s", r"\nbest,\s", r"\nthanks,\s", r"\nsincerely,\s", r"\ncheers,\s",
        re.escape(username.lower())
    ]
    
    body_lower = body.lower()
    signature_indices = []
    
    # Find all occurrences of the signature patterns
    for pattern in signature_patterns:
        matches = list(re.finditer(pattern, body_lower, re.MULTILINE))
        signature_indices.extend([match.start() for match in matches])
    
    if signature_indices:
        signature_index = min(signature_indices)
        return body[:signature_index].strip()
    return body

def clean_email_body(body):
    """
    Clean the email body by removing unwanted text based on various regex patterns.

    Args:
        body (str): The body of the email.
    
    Returns:
        str: The cleaned email body.
    """
    patterns = {
        'urls': r'https?://\S+|www\.\S+',  # URLs
        'metadata': r'(?m)^(From|Sent|To|Cc|Bcc|Subject|Date): .*$',  # Email metadata
        'greetings': r'(?i)^(Hi|Hello|Dear|Greetings|Hey)\s+[^\n]+',  # Greetings
        'signatures': r'(?i)(Best regards|Kind regards|Regards|Cheers|Thank you|Thanks|Sincerely|Yours truly|Yours sincerely|Best|Warm regards|With regards)[^\n]+',  # Signatures
        'email_headers': r'---* Forwarded message ---*|---* Original message ---*|---* Reply Above This Line ---*',  # Email forwarding/reply headers
        'email_addresses': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',  # Email addresses
        'non_ascii': r'[^\x00-\x7F]+',  # Non-ASCII characters
        'extra_lines': r'\n{2,}',  # Excessive newlines
        'html_tags': r'<[^>]*>',  # HTML tags
        'brackets_content': r'\[.*?\]|\{.*?\}|\<.*?\>',  # Content within brackets
        'extra_whitespace': r'\s{2,}',  # Excessive whitespace
        'unsubscribe_links': r'Unsubscribe\s+:.*|Click\s+here.*unsubscribe',  # Unsubscribe links
        'reply_lines': r'On\s+.*wrote:',  # Lines indicating the start of a reply
        'quoted_text': r'(?m)^\>.*$',  # Lines starting with ">"
        'repeated_chars': r'(.)\1{2,}',  # Repeated characters
        'generic_signature_lines': r'(?i)(^Sent from my \w+)|(--\n.*)|(Confidentiality Notice.*)',  # Generic email signatures or legal disclaimers
        'timestamps': r'\d{1,2}:\d{2}\s*(AM|PM|am|pm)?',  # Time stamps
        'dates': r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # Dates
        'signature_blocks': r'--\s*\n[\s\S]*',  # Signature blocks that start with -- followed by any text
        'headers_and_footers': r'(?m)^\s*-\s*$\n[\s\S]*?^\s*-\s*$',  # Headers and footers denoted by lines consisting of dashes
        'mobile_numbers': r'\b(\+?(\d{1,3})?[-. ]?)?((\(\d{1,4}\))|\d{1,4})[-. ]?\d{1,4}[-. ]?\d{1,9}\b'  # Mobile numbers
    }
    
    for key, pattern in patterns.items():
        body = re.sub(pattern, ' ', body)
    
    # Remove excess whitespace
    body = re.sub(r'\s{2,}', ' ', body)
    return body.strip()


def extract_most_recent_email_part(email_body, from_email):
    """
    Combine recent email extraction and signature pattern extraction for robust extraction.

    Args:
        email_body (str): The body of the email.
        from_email (str): The sender's email address.
        
    Returns:
        str: The extracted relevant part of the email above signatures and prior threads.
    """
    # Remove disclaimer and notice sections
    cleaned_body = remove_disclaimer_text(email_body)
    
    # Extract the most recent email section ignoring previous emails
    recent_email_content = extract_recent_email(cleaned_body)
    
    # Extract text above signatures and replies
    filtered_content_above_signature = extract_text_above_signature(recent_email_content, from_email)

    # Clean the email body
    cleaned_body = clean_email_body(filtered_content_above_signature)
    
    return filtered_content_above_signature

In [6]:
# Apply the extraction function
df['clean_email'] = df.apply(lambda row: extract_most_recent_email_part(row['body'], row['from_email']), axis=1)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [50]:
# d = dtale.show(test)
# d.open_browser()

### Sentiment Analysis

### TextBlob

In [7]:
from textblob import TextBlob

In [8]:
# Function to predict sentiment using TextBlob
def predict_sentiment_textblob(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment

In [9]:
# Apply the sentiment analysis function
df['sentiment_blob'] = df['clean_email'].apply(predict_sentiment_textblob)

# Extract the sentiment label and score into separate columns
df['blob_label'] = df['sentiment_blob'].apply(lambda x: 'POSITIVE' if x.polarity > 0 else ('NEGATIVE' if x.polarity < 0 else 'NEUTRAL'))
df['blob_score'] = df['sentiment_blob'].apply(lambda x: x.polarity)

In [10]:
print(df.blob_label.value_counts())

blob_label
POSITIVE    6156
NEUTRAL     2664
NEGATIVE    1542
Name: count, dtype: int64


In [11]:
# Apply the sentiment analysis function
df['sentiment_blob'] = df['body'].apply(predict_sentiment_textblob)

# Extract the sentiment label and score into separate columns
df['blob_label_test'] = df['sentiment_blob'].apply(lambda x: 'POSITIVE' if x.polarity > 0 else ('NEGATIVE' if x.polarity < 0 else 'NEUTRAL'))

In [12]:
print(df.blob_label_test.value_counts())

blob_label_test
POSITIVE    8516
NEGATIVE    1623
NEUTRAL      223
Name: count, dtype: int64


In [23]:
# d = dtale.show(df)
# d.open_browser()

### NLTK

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [14]:
# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pis05408.PINNACLE\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package vader_lexicon is already up-to-date!


In [15]:
# Function to predict sentiment using VADER
def predict_sentiment_nltk(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment

In [16]:
# Apply the sentiment analysis function
df['sentiment_NLTK'] = df['clean_email'].apply(predict_sentiment_nltk)

# Extract the sentiment label and score into separate columns
df['nltk_label'] = df['sentiment_NLTK'].apply(lambda x: 'POSITIVE' if x['compound'] >= 0.05 else ('NEGATIVE' if x['compound'] <= -0.05 else 'NEUTRAL'))
df['nltk_score'] = df['sentiment_NLTK'].apply(lambda x: x['compound'])

In [17]:
# Apply the sentiment analysis function
df['sentiment_NLTK'] = df['body'].apply(predict_sentiment_nltk)

# Extract the sentiment label and score into separate columns
df['nltk_label_test'] = df['sentiment_NLTK'].apply(lambda x: 'POSITIVE' if x['compound'] >= 0.05 else ('NEGATIVE' if x['compound'] <= -0.05 else 'NEUTRAL'))

In [18]:
print(df['nltk_label'].value_counts())

nltk_label
POSITIVE    8483
NEUTRAL     1238
NEGATIVE     641
Name: count, dtype: int64


In [19]:
print(df['nltk_label_test'].value_counts())

nltk_label_test
POSITIVE    9879
NEGATIVE     374
NEUTRAL      109
Name: count, dtype: int64


In [19]:
# d = dtale.show(df)
# d.open_browser()

### Email Summarization

In [22]:
from transformers import pipeline

In [42]:
# # Initialize the summarization pipeline with a specific model
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summarizer = pipeline("summarization", model="t5-large")

Downloading: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]
Downloading: 100%|██████████| 2.75G/2.75G [07:32<00:00, 6.53MB/s]
Downloading: 100%|██████████| 773k/773k [00:00<00:00, 885kB/s] 
Downloading: 100%|██████████| 1.32M/1.32M [00:01<00:00, 1.23MB/s]

This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



In [43]:
# Function to summarize a single email text
def summarize_email(email_text):
    try:
        summary = summarizer(email_text, max_length=60, min_length=25, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return "Summary not available"

In [44]:
# test_df = df.head(10).copy()

In [48]:
# Apply the summarization function to each email text and create a new column
test_df['summary_1'] = test_df['body'].apply(summarize_email)

Token indices sequence length is longer than the specified maximum sequence length for this model (5610 > 512). Running this sequence through the model will result in indexing errors


2024-06-06 16:54:05,434 - INFO     - Executing shutdown due to inactivity...
2024-06-06 16:54:09,736 - INFO     - Executing shutdown...
2024-06-06 16:54:09,742 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer

torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead



Error summarizing text: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 33944377600 bytes.


In [47]:
test_df.head()

Unnamed: 0,id,from_email,to_email,cc_email,subject,body,date_ist,time_ist,projectid,clean_email,sentiment_blob,blob_label,blob_score,blob_label_test,sentiment_NLTK,nltk_label,nltk_score,nltk_label_test,summary_2
0,6d30273a-6d16-4c9a-b207-31261df388ba,a.cao@forell.com,samiranm@pinnacleinfotech.com,"csutradhar@pinnaclecad.com, a.lin@forell.com",RE: 23-044 Drafting,"Hi Sam,\n\nI've uploaded more markups here:\nh...",2024-05-16,05:13:05,76461,"Hi Sam,\n\nI've uploaded more markups here:\nh...","(0.12369146005509643, 0.43841597796143267)",POSITIVE,0.22,POSITIVE,"{'neg': 0.011, 'neu': 0.909, 'pos': 0.08, 'com...",POSITIVE,0.5574,POSITIVE,please complete drafting for 23-043 before mov...
1,6b7a51f9-201f-43fa-adc2-f23203ee00d5,a.cao@forell.com,samiranm@pinnacleinfotech.com,"csutradhar@pinnaclecad.com, a.lin@forell.com",RE: 23-043 Drafting,"Sam,\n\nI've uploaded more markups here:\nhttp...",2024-05-18,03:34:35,76461,"Sam,\n\nI've uploaded more markups here:\nhttp...","(0.07164435768730253, 0.4860708310094817)",POSITIVE,0.5,POSITIVE,"{'neg': 0.018, 'neu': 0.923, 'pos': 0.059, 'co...",NEUTRAL,0.0,POSITIVE,markups from acs-001.s3.amazonaws.com/Forell+E...
2,4c74d3e9-230c-4f6b-9e36-3a9986d162d6,a.singh@dmscorp.ca,"anupamp@pinnacleinfotech.com, a.elshafi@rmbser...","a.white@rmbservices.ca, c.ferguson@rmbservices...",RE: Exhaust Louver Size - McMaster,"Arghya,\n\nPlease respond to the discrepancy r...",2024-05-13,21:12:48,76187,"Arghya,\n\nPlease respond to the discrepancy r...","(0.30833333333333335, 0.38333333333333336)",NEUTRAL,0.0,POSITIVE,"{'neg': 0.04, 'neu': 0.893, 'pos': 0.067, 'com...",POSITIVE,0.3182,POSITIVE,"arghya, please respond to the discrepancy rais..."
3,af6c2697-de52-4bca-9454-bd9ec04286e8,aaron@constructionchannel.tv,anants@pinnacleinfotech.com,"tanveer.singh@jpr.pinnaclecad.com, lsharma@pin...",Re: [EXTERNAL] Re: ABMS Hangar,"Per our process, did you update Autodesk Const...",2024-04-24,21:55:04,77901,"Per our process, did you update Autodesk Const...","(0.11065465147097799, 0.4195857996878405)",NEUTRAL,0.0,POSITIVE,"{'neg': 0.006, 'neu': 0.91, 'pos': 0.084, 'com...",NEUTRAL,0.0,POSITIVE,did you update autodesk construction cloud too...
4,33feb1a9-3b8f-4d95-82ee-d5accabc96da,abraham.canlas@gcinc.com,dkataria@pinnacleinfotech.com,"miguel.sapao@gcinc.com, sunilk@pinnacleinfotec...",RE: Tristar Vault 3D Model,"Hi Deepak,\r\nLet me know if you received the ...",2024-04-18,05:43:17,75205,"Hi Deepak,\nLet me know if you received the em...","(0.22407407407407406, 0.5440740740740742)",POSITIVE,0.65,POSITIVE,"{'neg': 0.0, 'neu': 0.9, 'pos': 0.1, 'compound...",POSITIVE,0.4576,POSITIVE,"Abraham canlas, engineer III, guam - granite c..."


In [41]:
d = dtale.show(test_df)
d.open_browser()

In [20]:
# Writing the data 
df.to_parquet('data/sentiment_data_output_3rdjun.parquet')

In [21]:
# Reading the data for validation
data = pd.read_parquet('data\sentiment_data_output_3rdjun.parquet')

In [146]:
# data.head(5)

In [22]:
d = dtale.show(data)
d.open_browser()

2024-06-03 21:27:24,175 - INFO     - Executing shutdown due to inactivity...
2024-06-03 21:27:28,275 - INFO     - Executing shutdown...
2024-06-03 21:27:28,281 - INFO     - Not running with the Werkzeug Server, exiting by searching gc for BaseWSGIServer
