In [59]:
import pandas as pd
import re
import numpy as np
from langdetect import detect, LangDetectException

def text_preprocessing(col):
    # Regex pattern to remove urls and emails for better normalization
    url_email_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|'
                                   r'www\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}|'
                                   r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', re.IGNORECASE)
    # Remove any urls or emails for more normalization
    col = col.str.replace(url_email_pattern, '', regex=True)
    # Replace apostrophes with empty string
    col = col.str.replace(r"'", '', regex=False)
    # Remove all email prefixes (Fw/Re)
    col = col.str.replace(r'(?<!\w)(fw|re)(?=\s*:?)\s*:?\s*', '', case=False, regex=True)
    # Remove punctuations - replace with space instead of empty string to account for stemming
    col = col.str.replace(r'[^\w\s]|[_-]', ' ', regex=True)
    # Replace multiple spaces, tabs(\t), newlines(\n) with single space for uniformity and strip surrounding white space
    col = col.str.replace(r'\s+', ' ', regex=True).str.strip()
    # Convert all nulls to empty string for easier feature extraction
    col = col.fillna('')
    return col

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Data to challenge/test text preprocessing
data = {
    'subject': [
        'FW calibre Important fw:hellowfw:orld update on your account!',
        'Re: Meeting scheduled for tomorrow fw',
        'fw:Please confirm your attendance! :)',
        'https://www.EXAMPLE.com is our official website',
        'FW:OMG PROMO BRO Click this link: www.ABC.org and register now!',
        'Your invoice is attached john.doe@company.com',
        'No subject, but check this out: www.random-site.com',
        'Re    : Urgent action required!!!',
        'FW   : Final reminder for submission!',
        'What\'s happening with the project?',
        'Special offer: Save $$$ on your next purchase!!',
        'Visit WWW.BIGDISCOUNTS.COM for exclusive offers',
        'Random text without special cases',
        'Multiple spaces     and newlines\n\n in this one!',
        'John.Doe@Example.Com invited you to collaborate',
        'Espero que estés bien.',
        'こんにちは世界',
        'Hello Darkness my Old Friend! بات العملاء المبجلين, قررنا نحن '
    ],
    'body': [
        'Dear customer, please check the link: https://securebank.com.',
        'The meeting will be held tomorrow. Contact me at jane.doe@workmail.com',
        'Can\'t wait to see you at the event!! Best regards.',
        'Click on www.EXAMPLE.com to see details.',
        'This is an automated email, contact support@abc.com for any issues.',
        'We have sent you the invoice. Visit https://paymentportal.com for payment.',
        'Here\'s your download link: www.download-site.com/download.zip.',
        'Urgent action required! Please respond immediately.',
        'Final reminder: Submit your project by Friday!',
        'Let\'s have a quick call to discuss further. What\'s your availability?',
        'Use code "SAVE50" for discounts at www.savingshop.com.',
        'Exclusive offers just for you at WWW.SHOPONLINE.COM.',
        'Just checking in, nothing urgent!',
        'This text\nhas multiple\nline breaks\nand spacing issues.',
        'Reach out to me at john.doe@Example.Com foreal:collaboration!',
        'Espero que estés bien.',
        'こんにちは世界',
        'بات العملاء المبجلين, قررنا نحن Its So nice to see you!'
    ]
}

# Convert the data dictionary to a pandas DataFrame
df = pd.DataFrame(data)

# Apply the preprocessing function to the 'email_text' column
df['cleaned_subject'] = text_preprocessing(df['subject'])
df['cleaned_body'] = text_preprocessing(df['body'])

# Drop duplicate rows
df = df.drop_duplicates()

# Replace potential empty strings to null, then drop all nulls
df = df.replace('', np.nan).dropna()

# Exclude non-english rows
df = df[df['subject'].apply(is_english) & df['body'].apply(is_english)]

# Display the cleaned DataFrame
df = df[['subject', 'cleaned_subject', 'body', 'cleaned_body']]
df


Unnamed: 0,subject,cleaned_subject,body,cleaned_body
0,FW calibre Important fw:hellowfw:orld update o...,calibre Important hellowfw orld update on your...,"Dear customer, please check the link: https://...",Dear customer please check the link
1,Re: Meeting scheduled for tomorrow fw,Meeting scheduled for tomorrow,The meeting will be held tomorrow. Contact me ...,The meeting will be held tomorrow Contact me at
2,fw:Please confirm your attendance! :),Please confirm your attendance,Can't wait to see you at the event!! Best rega...,Cant wait to see you at the event Best gards
3,https://www.EXAMPLE.com is our official website,is our official website,Click on www.EXAMPLE.com to see details.,Click on to see details
4,FW:OMG PROMO BRO Click this link: www.ABC.org ...,OMG PROMO BRO Click this link and gister now,"This is an automated email, contact support@ab...",This is an automated email contact for any issues
5,Your invoice is attached john.doe@company.com,Your invoice is attached,We have sent you the invoice. Visit https://pa...,We have sent you the invoice Visit for payment
6,"No subject, but check this out: www.random-sit...",No subject but check this out,Here's your download link: www.download-site.c...,Heres your download link download zip
7,Re : Urgent action required!!!,Urgent action quired,Urgent action required! Please respond immedia...,Urgent action quired Please spond immediately
8,FW : Final reminder for submission!,Final minder for submission,Final reminder: Submit your project by Friday!,Final minder Submit your project by Friday
9,What's happening with the project?,Whats happening with the project,Let's have a quick call to discuss further. Wh...,Lets have a quick call to discuss further What...
