In [3]:
import os
import sys
import pandas as pd
import dtale

In [90]:
file_path = r'data\emailsentiment_2.parquet'
df = pd.read_parquet(file_path)
print(df.shape)

(3375, 9)


In [91]:
# d = dtale.show(df)
# d.open_browser()

In [92]:
df.head(2)

Unnamed: 0,id,from_email,to_email,cc_email,subject,body,date_ist,time_ist,projectid
0,af6c2697-de52-4bca-9454-bd9ec04286e8,aaron@constructionchannel.tv,anants@pinnacleinfotech.com,"tanveer.singh@jpr.pinnaclecad.com, lsharma@pin...",Re: [EXTERNAL] Re: ABMS Hangar,"Per our process, did you update Autodesk Const...",2024-04-24,21:55:04,77901
1,33feb1a9-3b8f-4d95-82ee-d5accabc96da,abraham.canlas@gcinc.com,dkataria@pinnacleinfotech.com,"miguel.sapao@gcinc.com, sunilk@pinnacleinfotec...",RE: Tristar Vault 3D Model,"Hi Deepak,\r\nLet me know if you received the ...",2024-04-18,05:43:17,75205


In [93]:
import re
from bs4 import BeautifulSoup

In [94]:
def remove_disclaimer_text(body):
    """
    Remove disclaimer and classification notices from the email body.

    Args:
        body (str): The body of the email.
        
    Returns:
        str: The email body without disclaimer and classification notices.
    """
    disclaimer_pattern = [
        r"\*This e-mail has been classified as.*",
        r".*classification notice.*",
        r"From:\s.*Sent:\s.*To:\s.*"
    ]
    
    for pattern in disclaimer_pattern:
        body = re.sub(pattern, '', body, flags=re.DOTALL)
    
    return body

def extract_recent_email(email_body):
    """
    Extract the most recent part of an email thread above the signature or prior email content.

    Args:
        email_body (str): The body of the email.
        
    Returns:
        str: The most recent part text.
    """
    # Use BeautifulSoup to handle HTML email bodies.
    soup = BeautifulSoup(email_body, "html.parser")
    text = soup.get_text()
    lines = text.splitlines()

    # Regex patterns for detecting common signature lines and previous messages
    signature_patterns = [
        re.compile(r'^(-{2,}|_{2,})'),  # Signatures, like "-----Original Message-----" or "__"
        re.compile(r'^\s*(Sent from my [\w\s]+|Sent with [\w\s]+)$', re.IGNORECASE), # Mobile signatures
        re.compile(r'^On \d{4}/[0-1]\d/[0-3]\d,.*'),  # Replies, like "On 2021/12/01, ... wrote:"
        re.compile(r'^\s*From:\s*.*', re.IGNORECASE), # From header
        re.compile(r'^\s*Sent:\s*.*', re.IGNORECASE), # Sent header
        re.compile(r'^To:\s*.*', re.IGNORECASE),      # To header
        re.compile(r'^Subject:\s*.*', re.IGNORECASE), # Subject header
    ]

    # Find the position of signature or previous message markers
    for i, line in enumerate(lines):
        for pattern in signature_patterns:
            if pattern.match(line):
                return "\n".join(lines[:i]).strip()  # Return everything before the signature marker

    return "\n".join(lines).strip()

def extract_text_above_signature(body, from_email):
    """
    Extract text above the signature in the email body.

    Args:
        body (str): The body of the email.
        from_email (str): The sender's email address.
        
    Returns:
        str: The extracted text above signature patterns.
    """
    username = from_email.split('@')[0]  # Extract the username
    signature_patterns = [
        r"\nfrom:\s", r"\nsent:\s", r"\nto:\s", r"\ncc:\s", r"\nsubject:\s",
        r"\nregards,\s", r"\nbest,\s", r"\nthanks,\s", r"\nsincerely,\s", r"\ncheers,\s",
        re.escape(username.lower())
    ]
    
    body_lower = body.lower()
    signature_indices = []
    
    # Find all occurrences of the signature patterns
    for pattern in signature_patterns:
        matches = list(re.finditer(pattern, body_lower, re.MULTILINE))
        signature_indices.extend([match.start() for match in matches])
    
    if signature_indices:
        signature_index = min(signature_indices)
        return body[:signature_index].strip()
    return body

def clean_email_body(body):
    """
    Clean the email body by removing unwanted text based on various regex patterns.

    Args:
        body (str): The body of the email.
    
    Returns:
        str: The cleaned email body.
    """
    patterns = {
        'urls': r'https?://\S+|www\.\S+',  # URLs
        'metadata': r'(?m)^(From|Sent|To|Cc|Bcc|Subject|Date): .*$',  # Email metadata
        'greetings': r'(?i)^(Hi|Hello|Dear|Greetings|Hey)\s+[^\n]+',  # Greetings
        'signatures': r'(?i)(Best regards|Kind regards|Regards|Cheers|Thank you|Thanks|Sincerely|Yours truly|Yours sincerely|Best|Warm regards|With regards)[^\n]+',  # Signatures
        'email_headers': r'---* Forwarded message ---*|---* Original message ---*|---* Reply Above This Line ---*',  # Email forwarding/reply headers
        'email_addresses': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',  # Email addresses
        'non_ascii': r'[^\x00-\x7F]+',  # Non-ASCII characters
        'extra_lines': r'\n{2,}',  # Excessive newlines
        'html_tags': r'<[^>]*>',  # HTML tags
        'brackets_content': r'\[.*?\]|\{.*?\}|\<.*?\>',  # Content within brackets
        'extra_whitespace': r'\s{2,}',  # Excessive whitespace
        'unsubscribe_links': r'Unsubscribe\s+:.*|Click\s+here.*unsubscribe',  # Unsubscribe links
        'reply_lines': r'On\s+.*wrote:',  # Lines indicating the start of a reply
        'quoted_text': r'(?m)^\>.*$',  # Lines starting with ">"
        'repeated_chars': r'(.)\1{2,}',  # Repeated characters
        'generic_signature_lines': r'(?i)(^Sent from my \w+)|(--\n.*)|(Confidentiality Notice.*)',  # Generic email signatures or legal disclaimers
        'timestamps': r'\d{1,2}:\d{2}\s*(AM|PM|am|pm)?',  # Time stamps
        'dates': r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',  # Dates
        'signature_blocks': r'--\s*\n[\s\S]*',  # Signature blocks that start with -- followed by any text
        'headers_and_footers': r'(?m)^\s*-\s*$\n[\s\S]*?^\s*-\s*$',  # Headers and footers denoted by lines consisting of dashes
        'mobile_numbers': r'\b(\+?(\d{1,3})?[-. ]?)?((\(\d{1,4}\))|\d{1,4})[-. ]?\d{1,4}[-. ]?\d{1,9}\b'  # Mobile numbers
    }
    
    for key, pattern in patterns.items():
        body = re.sub(pattern, ' ', body)
    
    # Remove excess whitespace
    body = re.sub(r'\s{2,}', ' ', body)
    return body.strip()


def extract_most_recent_email_part(email_body, from_email):
    """
    Combine recent email extraction and signature pattern extraction for robust extraction.

    Args:
        email_body (str): The body of the email.
        from_email (str): The sender's email address.
        
    Returns:
        str: The extracted relevant part of the email above signatures and prior threads.
    """
    # Remove disclaimer and notice sections
    cleaned_body = remove_disclaimer_text(email_body)
    
    # Extract the most recent email section ignoring previous emails
    recent_email_content = extract_recent_email(cleaned_body)
    
    # Extract text above signatures and replies
    filtered_content_above_signature = extract_text_above_signature(recent_email_content, from_email)

    # Clean the email body
    cleaned_body = clean_email_body(filtered_content_above_signature)
    
    return filtered_content_above_signature

In [95]:
# Apply the extraction function
df['clean_email'] = df.apply(lambda row: extract_most_recent_email_part(row['body'], row['from_email']), axis=1)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



In [50]:
# d = dtale.show(test)
# d.open_browser()

### Sentiment Analysis
#### Transformer

In [98]:
from transformers import pipeline

# Define the sentiment analysis pipeline using pretraned model 
sentiment_pipeline = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")

In [99]:

# Function to predict sentiment with text truncation
def predict_sentiment_truncated(text, max_length=512):
    truncated_text = text[:max_length]
    result = sentiment_pipeline(truncated_text)
    return result[0] if result else {'label': 'UNKNOWN', 'score': 0.0}

In [100]:
# Apply the sentiment prediction function
df['sentiment_distilbert'] = df['clean_email'].apply(predict_sentiment_truncated)

In [101]:
# Extract the sentiment label and score into separate columns
df['transformer_label'] = df['sentiment_distilbert'].apply(lambda x: x['label'])
# df['sentiment_distilbert_score'] = df['sentiment_distilbert'].apply(lambda x: x['score'])

In [104]:
print(df['transformer_label'].value_counts())

transformer_label
NEGATIVE    2153
POSITIVE    1222
Name: count, dtype: int64

In [117]:
# d = dtale.show(df)
# d.open_browser()

### TextBlob

In [105]:
from textblob import TextBlob

In [106]:
# Function to predict sentiment using TextBlob
def predict_sentiment_textblob(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment

In [108]:
# Apply the sentiment analysis function
df['sentiment_blob'] = df['clean_email'].apply(predict_sentiment_textblob)

# Extract the sentiment label and score into separate columns
df['blob_label'] = df['sentiment_blob'].apply(lambda x: 'POSITIVE' if x.polarity > 0 else ('NEGATIVE' if x.polarity < 0 else 'NEUTRAL'))
# df['sentiment_blob_score'] = df['sentiment'].apply(lambda x: x.polarity)

In [109]:
print(df.blob_label.value_counts())

blob_label
POSITIVE    2021
NEUTRAL      840
NEGATIVE     514
Name: count, dtype: int64

In [23]:
# d = dtale.show(df)
# d.open_browser()

### NLTK

In [110]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [111]:
# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\pis05408.PINNACLE\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package vader_lexicon is already up-to-date!


In [112]:
# Function to predict sentiment using VADER
def predict_sentiment_nltk(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment

In [113]:
# Apply the sentiment analysis function
df['sentiment_NLTK'] = df['clean_email'].apply(predict_sentiment_nltk)

# Extract the sentiment label and score into separate columns
df['nltk_label'] = df['sentiment_NLTK'].apply(lambda x: 'POSITIVE' if x['compound'] >= 0.05 else ('NEGATIVE' if x['compound'] <= -0.05 else 'NEUTRAL'))
#df['sentiment_nltk_score'] = df['sentiment_NLTK'].apply(lambda x: x['compound'])

In [54]:
print(df['sentiment_nltk_label'].value_counts())

sentiment_nltk_label
POSITIVE    4009
NEUTRAL      564
NEGATIVE     328
Name: count, dtype: int64

In [1]:
# d = dtale.show(df)
# d.open_browser()

### SpaCy

In [116]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [117]:
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Add the TextBlob component to the SpaCy pipeline
nlp.add_pipe("spacytextblob")

<spacytextblob.spacytextblob.SpacyTextBlob at 0x22d2e97cd60>

In [118]:
# Function to predict sentiment using SpaCy
def predict_sentiment_spacy(text):
    doc = nlp(text)
    sentiment = {
        'label': 'POSITIVE' if doc._.polarity > 0 else ('NEGATIVE' if doc._.polarity < 0 else 'NEUTRAL'),
        'score': doc._.polarity
    }
    return sentiment

In [119]:
# Apply the sentiment analysis function
df['sentiment_spacy'] = df['clean_email'].apply(predict_sentiment_spacy)

# Extract the sentiment label and score into separate columns
df['spacy_label'] = df['sentiment_spacy'].apply(lambda x: x['label'])
#df['sentiment_spacy_score'] = df['sentiment_spacy'].apply(lambda x: x['score'])

In [121]:
print(df['sentiment_spacy_label'].value_counts())

sentiment_spacy_label
POSITIVE    2021
NEUTRAL      840
NEGATIVE     514
Name: count, dtype: int64

In [143]:
## Writing the data 
# df.to_parquet('data/sentiment_data_output.parquet')

In [4]:
# # Reading the data for validation
# data = pd.read_parquet('data\sentiment_data_output.parquet')

In [146]:
# data.head(5)

In [2]:
# d = dtale.show(data)
# d.open_browser()