In [17]:
import pandas as pd
from bs4 import BeautifulSoup
import html
import re
import email
from email import policy
from email.parser import BytesParser
import base64
import quopri
import chardet



def decode_email_bytes(raw_bytes):
    """Parse and decode full email content from raw bytes."""
    try:
        msg = BytesParser(policy=policy.default).parsebytes(raw_bytes)
    except:
        return ""

    decoded_parts = []

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        encoding = part.get("Content-Transfer-Encoding", "").lower()
        charset = part.get_content_charset() or 'utf-8'
        payload = part.get_payload(decode=True)

        if not payload:
            continue

        try:
            # Decode encoding if needed
            if encoding == 'base64':
                payload = base64.b64decode(payload)
            elif encoding == 'quoted-printable':
                payload = quopri.decodestring(payload)

            # Detect charset if needed
            if not charset or charset == 'unknown-8bit':
                charset = chardet.detect(payload)['encoding'] or 'utf-8'

            decoded_text = payload.decode(charset, errors='replace')
            decoded_parts.append(decoded_text)

        except Exception as e:
            print(f"Decode error: {e}")
            continue

    return "\n\n".join(decoded_parts)


def extract_email_address(text):
    if pd.isna(text):
        return ""
    match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    return match.group(0) if match else text.strip()

def clean_text(text, remove_disclaimer=True):
    if pd.isna(text):
        return ""


    try:
        text = text.encode('latin1').decode('utf-8')
    except:
        pass

    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[\t\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    if remove_disclaimer:
        disclaimer_phrases = [
            "This email, including any attachments and files transmitted with it",
            "If you are not the intended recipient",
            "please contact the sender by reply email",
            "Any unauthorized review, use, disclosure or distribution is prohibited"
        ]
        for phrase in disclaimer_phrases:
            if phrase in text:
                text = text.split(phrase)[0]

    return text.strip()

def decode_and_clean_column(series):
    """Decode raw email bytes and clean the result."""
    decoded_cleaned = []
    for content in series:
        try:
            if isinstance(content, str):
                raw_bytes = content.encode('utf-8', errors='replace')
                decoded = decode_email_bytes(raw_bytes)
                cleaned = clean_text(decoded)
                decoded_cleaned.append(cleaned)
            else:
                decoded_cleaned.append("")
        except Exception as e:
            decoded_cleaned.append("")
    return decoded_cleaned


df1 = pd.read_csv("Nazario.csv")
df2 = pd.read_csv("Nigerian_Fraud.csv")
df3 = pd.read_csv("SpamAssasin.csv")
df4 = pd.read_csv("CEAS_08.csv")


df1['label'] = 1
df2['label'] = 1

# Combine phishing and ham
phishing_df = pd.concat([df1, df2], ignore_index=True)
ham_df = pd.concat([df3, df4], ignore_index=True)
ham_df = ham_df[ham_df['label'] == 0].reset_index(drop=True)

phishing_df = phishing_df.drop_duplicates().dropna()
ham_df = ham_df.drop_duplicates().dropna()


email_column = None
for col in ['subject', 'body']:
    if col in phishing_df.columns:
        email_column = col
        break

if email_column is None:
    raise ValueError("Couldn't find a valid email body column like 'raw', 'text', 'body', etc.")


phishing_df['clean_text'] = decode_and_clean_column(phishing_df[email_column])
ham_df['clean_text'] = decode_and_clean_column(ham_df[email_column])


for col in ['sender', 'receiver']:
    if col in phishing_df.columns:
        phishing_df[col] = phishing_df[col].apply(extract_email_address)
    if col in ham_df.columns:
        ham_df[col] = ham_df[col].apply(extract_email_address)

phishing_df.to_csv("phishing_dataset_final.csv", index=False)
ham_df.to_csv("ham_dataset_final.csv", index=False)


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [18]:
ham_clean = pd.read_csv("ham_dataset_final.csv")
ham_clean

Unnamed: 0,sender,receiver,date,subject,body,label,urls,clean_text
0,kre@munnari.OZ.AU,cwg-dated-1030377287.06fa6d@DeepEddy.Com,"Thu, 22 Aug 2002 18:26:25 +0700",Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,1,
1,Steve_Burt@cursor-system.com,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 12:46:18 +0100",[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,1,[zzzzteana] RE: Alexander
2,timc@2ubh.com,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 13:52:38 +0100",[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,1,[zzzzteana] Moscow bomber
3,monty@roscom.com,undisclosed-recipient: ;,"Thu, 22 Aug 2002 09:15:25 -0400",[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,1,[IRR] Klez: The Virus That Won't Die
4,Stewart.Smith@ee.ed.ac.uk,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:38:22 +0100",Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,1,
...,...,...,...,...,...,...,...,...
20761,vrcjauctt@gmail.com,pvosgpr@triptracker.net,"Fri, 08 Aug 2008 05:59:51 -0800",I want to cancel my account,How do I cancel my account. I want to erase i...,0,0,I want to cancel my account
20762,kppyozizjt@site5.com,wkilxloc@opensuse.org,"Fri, 08 Aug 2008 15:00:20 +0100",RE: [opensuse] Apache and SSL,\nI don't use virtual hosts. Here is the entry...,0,1,
20763,xpojhbz@gmail.com,fxgmqwjn@triptracker.net,"Fri, 08 Aug 2008 22:00:43 +0800",Slideshow viewer,Hello there ! \nGreat work on the slide show v...,0,0,Slideshow viewer
20764,vupzesm@columbia.edu,zqoqi@spamassassin.apache.org,"Fri, 08 Aug 2008 09:00:46 -0500",Note on 2-digit years,"\nMail from sender , coming from intuit.com\ns...",0,0,Note on 2-digit years


In [19]:
phishing_clean = pd.read_csv("phishing_dataset_final.csv")
phishing_clean

Unnamed: 0,sender,receiver,date,subject,body,urls,label,clean_text
0,service@cpanel.com,jose@monkey.org,"Fri, 30 Oct 2015 00:00:48 -0500",Verify Your Account,Business with \t\t\t\t\t\t\t\tcPanel & WHM \t...,1,1,Verify Your Account
1,AnGarcia@mcoe.org,info@maaaaa.org,"Fri, 30 Oct 2015 14:54:33 +0000",IT-Service Help Desk,Password will expire in 3 days. Click Here To ...,0,1,IT-Service Help Desk
2,usaaacctupdate@sccu4u.com,usaaacctupdate@sccu4u.com,"Fri, 30 Oct 2015 14:02:33 -0500",Final USAA Reminder - Update Your Account Now,"To ensure delivery to your inbox, please add U...",1,1,Final USAA Reminder - Update Your Account Now
3,infos@2015p.com,jose@monkey.org,"Sat, 31 Oct 2015 13:38:13 +0000",=?utf-8?Q?Dear=20Client=20=3a=20Update=20Your=...,"PayPal Secure Dear Client, ...",1,1,=?utf-8?Q?Dear=20Client=20=3a=20Update=20Your=...
4,Update,Recipients,"Sat, 31 Oct 2015 20:44:17 +0530",Update,08/10/2011 06:14:36 am +0800 - en_US.UTF-8 - O...,1,1,Update
...,...,...,...,...,...,...,...,...
3034,onlineli@phreego.com,SunocoSunOil@yahoo.co.jp,"Mon, 27 Aug 2007 09:53:04 -0400",JOB OFFER!!!,"\n\n\nSunoco Sun Oil Company,Ltd.\nTrusty Koji...",0,1,JOB OFFER!!!
3035,nafmohd101@yahoo.fr,webmaster@aclweb.org,"Wed, 29 Aug 2007 22:05:01 +0000",URGENT RESPONSE NEEDED,FROM THE MINISTRE DE LA CONSTRUCTION\n(Mohamed...,0,1,URGENT RESPONSE NEEDED
3036,jamesbongani10015@latinmail.com,jamesbongani10015@latinmail.com,"Sun, 02 Sep 2007 15:51:17 +0200",FROM JAMES BONGANI,"JAMES BONGANI\nST. EDDE, EGLISE,\nRUE 11, AVE ...",1,1,FROM JAMES BONGANI
3037,gladvince01@yahoo.co.uk,R@M,"Thu, 06 Sep 2007 15:41:26 -0400",HELLO,Vincent Cheung\nForeign Operations department\...,0,1,HELLO


In [20]:
import pandas as pd
from bs4 import BeautifulSoup
import html
import re
import email
from email import policy
from email.parser import BytesParser
import base64
import quopri
import chardet


def decode_email_bytes(raw_bytes):
    """Parse and decode full email content from raw bytes."""
    try:
        msg = BytesParser(policy=policy.default).parsebytes(raw_bytes)
    except:
        return ""

    decoded_parts = []

    for part in msg.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        encoding = part.get("Content-Transfer-Encoding", "").lower()
        charset = part.get_content_charset()
        payload = part.get_payload(decode=True)

        if not payload:
            continue

        try:
            if encoding == 'base64':
                payload = base64.b64decode(payload)
            elif encoding == 'quoted-printable':
                payload = quopri.decodestring(payload)

            if not charset or charset.lower() in ['unknown-8bit', 'x-unknown', 'ascii']:
                detected = chardet.detect(payload)
                charset = detected['encoding'] or 'utf-8'

            decoded_text = payload.decode(charset, errors='replace')
            decoded_parts.append(decoded_text)

        except Exception as e:
            print(f"Decode error: {e}")
            continue

    return "\n\n".join(decoded_parts)


def clean_text(text, remove_disclaimer=True):
    if pd.isna(text):
        return ""

    try:
        text = text.encode('latin1').decode('utf-8')
    except:
        try:
            text = text.encode('latin1').decode('gbk')
        except:
            pass

    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text()

  
    text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
    text = text.replace('\xa0', ' ')
    text = re.sub(r'[\t\r\n\f\v]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def extract_email_address(text):
    if pd.isna(text):
        return ""
    match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    return match.group(0) if match else text.strip()


def decode_and_clean_column(series):
    """Decode raw email bytes and clean the result."""
    decoded_cleaned = []
    for content in series:
        try:
            if isinstance(content, str):
                raw_bytes = content.encode('utf-8', errors='replace')
                decoded = decode_email_bytes(raw_bytes)
                cleaned = clean_text(decoded)
                decoded_cleaned.append(cleaned)
            else:
                decoded_cleaned.append("")
        except Exception as e:
            decoded_cleaned.append("")
    return decoded_cleaned


# Load all datasets
df1 = pd.read_csv("Nazario.csv")
df2 = pd.read_csv("Nigerian_Fraud.csv")
df3 = pd.read_csv("SpamAssasin.csv")
df4 = pd.read_csv("CEAS_08.csv")

# Assign labels if not already present
df1['label'] = 1
df2['label'] = 1


phishing_df = pd.concat([df1, df2], ignore_index=True)
ham_df = pd.concat([df3, df4], ignore_index=True)
ham_df = ham_df[ham_df['label'] == 0].reset_index(drop=True)


phishing_df = phishing_df.drop_duplicates().dropna()
ham_df = ham_df.drop_duplicates().dropna()

email_column = None
for col in ['body', 'subject', 'text']:
    if col in phishing_df.columns:
        email_column = col
        break

if email_column is None:
    raise ValueError("Couldn't find a valid email body column like 'raw', 'text', or 'body'.")


phishing_df['clean_text'] = decode_and_clean_column(phishing_df[email_column])
ham_df['clean_text'] = decode_and_clean_column(ham_df[email_column])

for col in ['sender', 'receiver']:
    if col in phishing_df.columns:
        phishing_df[col] = phishing_df[col].apply(extract_email_address)
    if col in ham_df.columns:
        ham_df[col] = ham_df[col].apply(extract_email_address)


phishing_df.to_csv("phishing_dataset_final.csv", index=False)
ham_df.to_csv("ham_dataset_final.csv", index=False)


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [21]:
phishing_clean = pd.read_csv("phishing_dataset_final.csv")
phishing_clean

Unnamed: 0,sender,receiver,date,subject,body,urls,label,clean_text
0,service@cpanel.com,jose@monkey.org,"Fri, 30 Oct 2015 00:00:48 -0500",Verify Your Account,Business with \t\t\t\t\t\t\t\tcPanel & WHM \t...,1,1,"Business with cPanel & WHM Dear client, Our Te..."
1,AnGarcia@mcoe.org,info@maaaaa.org,"Fri, 30 Oct 2015 14:54:33 +0000",IT-Service Help Desk,Password will expire in 3 days. Click Here To ...,0,1,Password will expire in 3 days. Click Here To ...
2,usaaacctupdate@sccu4u.com,usaaacctupdate@sccu4u.com,"Fri, 30 Oct 2015 14:02:33 -0500",Final USAA Reminder - Update Your Account Now,"To ensure delivery to your inbox, please add U...",1,1,"To ensure delivery to your inbox, please add U..."
3,infos@2015p.com,jose@monkey.org,"Sat, 31 Oct 2015 13:38:13 +0000",=?utf-8?Q?Dear=20Client=20=3a=20Update=20Your=...,"PayPal Secure Dear Client, ...",1,1,"PayPal Secure Dear Client, We have noticed tha..."
4,Update,Recipients,"Sat, 31 Oct 2015 20:44:17 +0530",Update,08/10/2011 06:14:36 am +0800 - en_US.UTF-8 - O...,1,1,08/10/2011 06:14:36 am +0800 - en_US.UTF-8 - O...
...,...,...,...,...,...,...,...,...
3034,onlineli@phreego.com,SunocoSunOil@yahoo.co.jp,"Mon, 27 Aug 2007 09:53:04 -0400",JOB OFFER!!!,"\n\n\nSunoco Sun Oil Company,Ltd.\nTrusty Koji...",0,1,"Sunoco Sun Oil Company,Ltd. Trusty Kojimachi B..."
3035,nafmohd101@yahoo.fr,webmaster@aclweb.org,"Wed, 29 Aug 2007 22:05:01 +0000",URGENT RESPONSE NEEDED,FROM THE MINISTRE DE LA CONSTRUCTION\n(Mohamed...,0,1,FROM THE MINISTRE DE LA CONSTRUCTION (Mohamed ...
3036,jamesbongani10015@latinmail.com,jamesbongani10015@latinmail.com,"Sun, 02 Sep 2007 15:51:17 +0200",FROM JAMES BONGANI,"JAMES BONGANI\nST. EDDE, EGLISE,\nRUE 11, AVE ...",1,1,"JAMES BONGANI ST. EDDE, EGLISE, RUE 11, AVE 27..."
3037,gladvince01@yahoo.co.uk,R@M,"Thu, 06 Sep 2007 15:41:26 -0400",HELLO,Vincent Cheung\nForeign Operations department\...,0,1,Vincent Cheung Foreign Operations department H...


In [22]:
ham_clean = pd.read_csv("ham_dataset_final.csv")
ham_clean

Unnamed: 0,sender,receiver,date,subject,body,label,urls,clean_text
0,kre@munnari.OZ.AU,cwg-dated-1030377287.06fa6d@DeepEddy.Com,"Thu, 22 Aug 2002 18:26:25 +0700",Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,1,
1,Steve_Burt@cursor-system.com,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 12:46:18 +0100",[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,1,"Martin A posted: Tassos Papadopoulos, the Gree..."
2,timc@2ubh.com,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 13:52:38 +0100",[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,1,Man Threatens Explosion In Moscow Thursday Aug...
3,monty@roscom.com,undisclosed-recipient: ;,"Thu, 22 Aug 2002 09:15:25 -0400",[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,1,"Already the most prolific virus ever, Klez con..."
4,Stewart.Smith@ee.ed.ac.uk,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:38:22 +0100",Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,1,"> in adding cream to spaghetti carbonara, whic..."
...,...,...,...,...,...,...,...,...
20761,vrcjauctt@gmail.com,pvosgpr@triptracker.net,"Fri, 08 Aug 2008 05:59:51 -0800",I want to cancel my account,How do I cancel my account. I want to erase i...,0,0,How do I cancel my account. I want to erase it...
20762,kppyozizjt@site5.com,wkilxloc@opensuse.org,"Fri, 08 Aug 2008 15:00:20 +0100",RE: [opensuse] Apache and SSL,\nI don't use virtual hosts. Here is the entry...,0,1,I don't use virtual hosts. Here is the entry o...
20763,xpojhbz@gmail.com,fxgmqwjn@triptracker.net,"Fri, 08 Aug 2008 22:00:43 +0800",Slideshow viewer,Hello there ! \nGreat work on the slide show v...,0,0,Hello there ! Great work on the slide show vie...
20764,vupzesm@columbia.edu,zqoqi@spamassassin.apache.org,"Fri, 08 Aug 2008 09:00:46 -0500",Note on 2-digit years,"\nMail from sender , coming from intuit.com\ns...",0,0,"Mail from sender , coming from intuit.com serv..."


In [48]:
df = pd.read_csv("final_data.csv")


In [50]:
df

Unnamed: 0.1,Unnamed: 0,id,label,sender,subject,body,date
0,0,4384,0,body.shop@enron.com,BS Anniversary Winners,---------------------- forwarded by body shop/...,2001-05-18
1,1,662,1,"""monkey.org IT MGT""",ACTION REQUIRED? Password Expiry Notice,monkey.org password expiry notice.your passwor...,
2,2,705,1,"""=?utf-8?q?Steven_Hutchinson?=""","Account Security Update 15 July, 2023","we recently detected an unusual activity, we a...",
3,3,5790,0,chris.foster@enron.com,Citizens Arizona Gas Invoicing,kim: this would be a good meeting for you to a...,2001-01-04
4,4,5376,0,janine.ponsart@bakernet.com,E-Notes: Ancillary Services - The Next Frontier,> e-notes provides regular briefings on new de...,
...,...,...,...,...,...,...,...
4995,4995,852,1,=?UTF-8?B?4oCdQWRtaW5pc3RyYXRvckA=?=monkey.org...,RE:Deactivation Request,adminnotificationwe received a mail-box delete...,
4996,4996,3325,0,judy.townsend@enron.com,Deferral Enrollment 2001,---------------------- forwarded by judy towns...,2000-11-30
4997,4997,2397,1,Help Center,We're having some trouble with your current bi...,please update your details.we're having some t...,
4998,4998,1309,1,"""American Express""",Confirming Your Online Gateway Support,"your online gateway supporthello, card membera...",


In [58]:
import pandas as pd

# Load datasets
ham_clean = pd.read_csv("ham_dataset_final.csv")
df = pd.read_csv("final_data.csv")
phishing_clean = pd.read_csv("phishing_dataset_final.csv")

# Columns to retain
cols = ['body', 'sender', 'subject', 'label']

# Prepare ham emails (2000 from ham_clean + 1000 from df with label 0)
ham_2000 = ham_clean[cols].sample(n=2000, random_state=42)
df_ham_1000 = df[df['label'] == 0][cols].sample(n=1000, random_state=42)
ham_combined = pd.concat([ham_2000, df_ham_1000], ignore_index=True)

# Prepare phishing emails (take 3000)
phishing_3000 = phishing_clean[cols].sample(n=3000, random_state=42)

# Combine both to make 6000 entries
final_data = pd.concat([ham_combined, phishing_3000], ignore_index=True)

# Shuffle final dataset
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Show result
print(final_data.head())
print(f"\nFinal dataset shape: {final_data.shape}")


                                                body  \
0  Should I expect problems if my apt server is r...   
1  Dear jose Your mailbox is almost full. 496MB  ...   
2  Rafael Garcia-Suarez skribis 2007-10-17 15:06 ...   
3  senator joe lieberman, congresswoman jane and ...   
4  Dearest in the lord,\n\nI am Mrs mary Clement ...   

                         sender  \
0                   tjb@unh.edu   
1             info@kratzkehl.de   
2          bxaqo@convolution.nl   
3  blandford@smtp.democrats.org   
4     clement_m2002@yahoo.co.in   

                                           subject  label  
0                                  Apt 0.3 and 0.5      0  
1                            Very important notice      1  
2  Re: UTF8 flag missing on stringified references      0  
3               Reception Honoring Gov. Gray Davis      0  
4                                 IN GOD WE TRUST.      1  

Final dataset shape: (6000, 4)


In [62]:
import pandas as pd
import re
import html

# Load datasets
ham_clean = pd.read_csv("ham_dataset_final.csv")
df = pd.read_csv("final_data.csv")
phishing_clean = pd.read_csv("phishing_dataset_final.csv")

# Columns to retain
cols = ['body', 'sender', 'subject', 'label']

# Prepare ham emails
ham_2000 = ham_clean[cols].sample(n=2000, random_state=42)
df_ham_1000 = df[df['label'] == 0][cols].sample(n=1000, random_state=42)
ham_combined = pd.concat([ham_2000, df_ham_1000], ignore_index=True)

# Prepare phishing emails
phishing_3000 = phishing_clean[cols].sample(n=3000, random_state=42)

# Combine both to make 6000 entries
final_data = pd.concat([ham_combined, phishing_3000], ignore_index=True)

# Function to clean email body
def clean_body(text):
    if pd.isnull(text):
        return ""
    text = html.unescape(text)                         # Decode HTML entities
    text = text.encode("ascii", "ignore").decode()     # Remove non-ascii
    text = re.sub(r"\\x[0-9A-Fa-f]{2}", "", text)       # Remove hex codes like \x9f
    text = re.sub(r"[^a-zA-Z0-9\s.,!?@:'\"$%-]", " ", text)  # Remove symbols except punctuation
    text = re.sub(r"\s+", " ", text)                   # Collapse multiple spaces
    return text.strip()

# Apply cleaning to 'body' column
final_data['body'] = final_data['body'].apply(clean_body)

# Shuffle and reset index
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to new CSV
final_data.to_csv("newdata.csv", index=False)

print("✅ Final cleaned dataset saved as 'newdata.csv'")
print(final_data.head())


✅ Final cleaned dataset saved as 'newdata.csv'
                                                body  \
0  Should I expect problems if my apt server is r...   
1  Dear jose Your mailbox is almost full. 496MB 5...   
2  Rafael Garcia-Suarez skribis 2007-10-17 15:06 ...   
3  senator joe lieberman, congresswoman jane and ...   
4  Dearest in the lord, I am Mrs mary Clement fro...   

                         sender  \
0                   tjb@unh.edu   
1             info@kratzkehl.de   
2          bxaqo@convolution.nl   
3  blandford@smtp.democrats.org   
4     clement_m2002@yahoo.co.in   

                                           subject  label  
0                                  Apt 0.3 and 0.5      0  
1                            Very important notice      1  
2  Re: UTF8 flag missing on stringified references      0  
3               Reception Honoring Gov. Gray Davis      0  
4                                 IN GOD WE TRUST.      1  


In [64]:
import pandas as pd
import re
import html

# Safe load function to handle bad bytes
def safe_read_csv(path):
    try:
        return pd.read_csv(path, encoding='utf-8', on_bad_lines='skip')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='ISO-8859-1', on_bad_lines='skip')

# Load all datasets with fallback encodings
ham_clean = safe_read_csv("ham_dataset_final.csv")
df = safe_read_csv("final_data.csv")
phishing_clean = safe_read_csv("phishing_dataset_final.csv")

# Columns to keep
cols = ['body', 'sender', 'subject', 'label']

# Take 2000 ham emails from ham_clean
ham_2000 = ham_clean[cols].dropna().sample(n=2000, random_state=42)

# Take 1000 ham emails from df
df_ham_1000 = df[df['label'] == 0][cols].dropna().sample(n=1000, random_state=42)

# Take 3000 phishing emails
phishing_3000 = phishing_clean[cols].dropna().sample(n=3000, random_state=42)

# Merge all
combined = pd.concat([ham_2000, df_ham_1000, phishing_3000], ignore_index=True)

# Text cleaning function
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = html.unescape(text)                             # Decode HTML entities
    text = text.encode("ascii", "ignore").decode()         # Remove non-ascii
    text = re.sub(r"\\x[0-9A-Fa-f]{2}", "", text)           # Remove hex codes
    text = re.sub(r"[^a-zA-Z0-9\s.,!?@:'\"$%+-]", " ", text) # Remove weird symbols
    text = re.sub(r"\s+", " ", text)                       # Collapse multiple spaces
    return text.strip()

# Apply cleaning to body, subject, sender
for col in ['body', 'subject', 'sender']:
    combined[col] = combined[col].apply(clean_text)

# Shuffle final dataset
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to new CSV
combined.to_csv("newdata_cleaned.csv", index=False)

print("✅ Cleaned dataset saved as 'newdata_cleaned.csv'")
print(combined.head())


✅ Cleaned dataset saved as 'newdata_cleaned.csv'
                                                body  \
0  Should I expect problems if my apt server is r...   
1  Dear jose Your mailbox is almost full. 496MB 5...   
2  Rafael Garcia-Suarez skribis 2007-10-17 15:06 ...   
3  good morning: the electronic copy is too large...   
4  Dearest in the lord, I am Mrs mary Clement fro...   

                      sender  \
0                tjb@unh.edu   
1          info@kratzkehl.de   
2       bxaqo@convolution.nl   
3   cara.semperger@enron.com   
4  clement m2002@yahoo.co.in   

                                             subject  label  
0                                    Apt 0.3 and 0.5      0  
1                              Very important notice      1  
2    Re: UTF8 flag missing on stringified references      0  
3  Path confirmation and Preschedule Workspace in...      0  
4                                   IN GOD WE TRUST.      1  


In [68]:
new= pd.read_csv("newdata_cleaned.csv")

In [76]:
new['body'][50]

"Paul Moore wrote: That looks like it. I'll work up a patch and submit it to the Mercurial developers. I've already got one going. Cheers, Dirkjan Python-Dev mailing list zvllln-eum@python.org http: mail.python.org mailman listinfo python-dev Unsubscribe: http: mail.python.org mailman options python-dev python-dev%40tangomu.com"

In [78]:
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   body     5997 non-null   object
 1   sender   6000 non-null   object
 2   subject  6000 non-null   object
 3   label    6000 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 187.6+ KB


In [86]:
new.head(10)

Unnamed: 0,body,sender,subject,label
0,Should I expect problems if my apt server is r...,tjb@unh.edu,Apt 0.3 and 0.5,0
1,Dear jose Your mailbox is almost full. 496MB 5...,info@kratzkehl.de,Very important notice,1
2,Rafael Garcia-Suarez skribis 2007-10-17 15:06 ...,bxaqo@convolution.nl,Re: UTF8 flag missing on stringified references,0
3,good morning: the electronic copy is too large...,cara.semperger@enron.com,Path confirmation and Preschedule Workspace in...,0
4,"Dearest in the lord, I am Mrs mary Clement fro...",clement m2002@yahoo.co.in,IN GOD WE TRUST.,1
5,"On Monday 11 February 2008, Magnus Bck wrote: ...",xyn@beeb.net,Re: Configuration query,0
6,"""J"" Jon O writes: J So, my theory is that if t...",garym@canada.com,Re: Asteroids anyone ?,0
7,Apologies for cross-posting ------------------...,jcjttva@unisannio.it,UAI CfP: FUZZ-IEEE 2008. Special session on AG...,0
8,"On Sat, Aug 10, 2002 at 12:00:15AM +0200, Paul...",colmmacc@redbrick.dcu.ie,Re: ILUG ILUG newsgroup s ?,0
9,This code is now open source! Browse it here: ...,hoauf@python.org,Re: Python-3000 Invitation to try out open sou...,0
