<a href="https://colab.research.google.com/github/Octaxx/DLI-Assignment/blob/main/DatasetCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells shown here read from or write to Drive (the
# dataset is fetched from a GitHub URL) — presumably a later cell saves outputs
# there; confirm, or remove this cell.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.utils import resample

# STEP 1: Load the dataset
# The CSV is read directly from the project's GitHub repository, so this cell
# needs network access.
url = "https://raw.githubusercontent.com/Octaxx/DLI-Assignment/refs/heads/main/Phishing_Email.csv"
df = pd.read_csv(url)

# ------------------------------------------------------------------------------------------------------------------------- #

# STEP 2: Data Cleaning
# Mapping from the textual class name to the numeric label (1 = phishing, 0 = safe).
LABEL_MAP = {'Phishing Email': 1, 'Safe Email': 0}

df = df.drop(columns=['Unnamed: 0'])                       # Drop unnecessary column
df = df.dropna(subset=['Email Text'])                      # Remove rows with missing email text
df['Email Type'] = df['Email Type'].str.strip()            # Strip whitespace
df['Label'] = df['Email Type'].map(LABEL_MAP)              # Encode labels

# Series.map yields NaN for any value that is not a key of LABEL_MAP. Such rows
# would match neither `Label == 0` nor `Label == 1` and silently vanish during
# balancing, so fail loudly here instead.
unmapped_types = df.loc[df['Label'].isna(), 'Email Type']
if not unmapped_types.empty:
    raise ValueError(
        f"{len(unmapped_types)} row(s) have an unrecognised 'Email Type': "
        f"{unmapped_types.unique().tolist()}"
    )
df['Label'] = df['Label'].astype('int64')                  # Every label is now known to be 0 or 1
# ------------------------------------------------------------------------------------------------------------------------- #

# STEP 3: Show original total rows and class balance
print("="*50)
print("📊 BEFORE BALANCING")
print("-" * 50)
print(f"Total rows before balancing: {len(df)}")
print("Class balance before balancing:")
print(df['Email Type'].value_counts())
print("="*50)
df.head(10)

📊 BEFORE BALANCING
--------------------------------------------------
Total rows before balancing: 18634
Class balance before balancing:
Email Type
Safe Email        11322
Phishing Email     7312
Name: count, dtype: int64


Unnamed: 0,Email Text,Email Type,Label
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email,0
1,the other side of * galicismos * * galicismo *...,Safe Email,0
2,re : equistar deal tickets are you still avail...,Safe Email,0
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email,1
4,software at incredibly low prices ( 86 % lower...,Phishing Email,1
5,global risk management operations sally congra...,Safe Email,0
6,"On Sun, Aug 11, 2002 at 11:17:47AM +0100, wint...",Safe Email,0
7,"entourage , stockmogul newsletter ralph velez ...",Phishing Email,1
8,"we owe you lots of money dear applicant , afte...",Phishing Email,1
9,re : coastal deal - with exxon participation u...,Safe Email,0


In [4]:
# STEP 4: Oversample the minority class (Phishing Email)
# Split the cleaned frame by numeric label, then draw phishing rows with
# replacement until both classes have the same number of rows.
# NOTE(review): if a train/test split is done after this step, the duplicated
# phishing rows can land on both sides of the split — confirm where the split happens.
safe_mask = df['Label'].eq(0)
phishing_mask = df['Label'].eq(1)

df_majority = df.loc[safe_mask]      # Safe Email
df_minority = df.loc[phishing_mask]  # Phishing Email

df_minority_upsampled = resample(
    df_minority,
    replace=True,                      # sample with replacement
    n_samples=df_majority.shape[0],    # match the majority class size
    random_state=42,
)

# Stack both classes, shuffle the rows reproducibly, and renumber the index.
df_balanced = (
    pd.concat([df_majority, df_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# ------------------------------------------------------------------------------------------------------------------------- #

# STEP 5: Show balanced total rows and class balance
heavy_rule = "=" * 50
light_rule = "-" * 50

print("\n" + heavy_rule)
print("📊 AFTER BALANCING (Oversampling)")
print(light_rule)
print(f"Total rows after balancing: {len(df_balanced)}")
print("Class balance after balancing:")
print(df_balanced['Email Type'].value_counts())
print(heavy_rule)


📊 AFTER BALANCING (Oversampling)
--------------------------------------------------
Total rows after balancing: 22644
Class balance after balancing:
Email Type
Phishing Email    11322
Safe Email        11322
Name: count, dtype: int64


In [5]:
# STEP 6: Display sample data
columns_to_show = ['Email Text', 'Email Type', 'Label']

# Per-class row subsets, reused for both the counts and the previews below
phishing_rows = df_balanced[df_balanced['Email Type'] == 'Phishing Email']
safe_rows = df_balanced[df_balanced['Email Type'] == 'Safe Email']

# Count values
phishing_count = len(phishing_rows)
safe_count = len(safe_rows)
total_rows = len(df_balanced)

summary_rule = "=" * 60
print("\n" + summary_rule)
print("🧾 SAMPLE OF CLEANED & BALANCED DATAFRAME")
print(summary_rule)
print(f"Total Rows        : {total_rows}")
print(f"Phishing Emails   : {phishing_count}")
print(f"Safe Emails       : {safe_count}")
print(summary_rule)

# Display samples: a heading followed by the first five rows of each frame
previews = [
    ("\n🧪 Cleaned & Balanced DataFrame (First 5 Rows):", df_balanced),
    ("\n🎯 Phishing Emails (First 5):", phishing_rows),
    ("\n✅ Safe Emails (First 5):", safe_rows),
]
for heading, frame in previews:
    print(heading)
    display(frame[columns_to_show].head(5))




🧾 SAMPLE OF CLEANED & BALANCED DATAFRAME
Total Rows        : 22644
Phishing Emails   : 11322
Safe Emails       : 11322

🧪 Cleaned & Balanced DataFrame (First 5 Rows):


Unnamed: 0,Email Text,Email Type,Label
0,INVESTMENT SCHOLARS CLUB- bringing you the lat...,Phishing Email,1
1,semantics : il dominio tempo-aspettuale il dom...,Safe Email,0
2,mature mom and her young horny lover ! . . woo...,Phishing Email,1
3,do you own a car ; starting december 7 th ford...,Phishing Email,1
4,rescue you from highprice medicaments and badp...,Phishing Email,1



🎯 Phishing Emails (First 5):


Unnamed: 0,Email Text,Email Type,Label
0,INVESTMENT SCHOLARS CLUB- bringing you the lat...,Phishing Email,1
2,mature mom and her young horny lover ! . . woo...,Phishing Email,1
3,do you own a car ; starting december 7 th ford...,Phishing Email,1
4,rescue you from highprice medicaments and badp...,Phishing Email,1
11,New Web Technology\nUNLIMITED WEB CONFERENCING...,Phishing Email,1



✅ Safe Emails (First 5):


Unnamed: 0,Email Text,Email Type,Label
1,semantics : il dominio tempo-aspettuale il dom...,Safe Email,0
5,http://www.bbc.co.uk/radio1/alt/nireland/ni_te...,Safe Email,0
6,Hi Damian.SuSe has a Sparc version I previousl...,Safe Email,0
7,"hpl nom for may 25 , 2001 ( see attached file ...",Safe Email,0
8,iatl 14 : final cfp the 14th annual meeting - ...,Safe Email,0


In [9]:
# STEP 7: Extract Document Features
import re

def _uppercase_ratio(text):
    """Return the fraction of characters in `text` that are uppercase, rounded to 3 decimals.

    Empty text yields 0.0 rather than raising ZeroDivisionError.
    """
    if not text:
        return 0.0
    return round(sum(1 for ch in text if ch.isupper()) / len(text), 3)


# All features below are derived from the raw email body.
email_text = df_balanced['Email Text']

# Trimmed preview column for readability (first 50 characters)
df_balanced['Email Preview'] = email_text.str.slice(0, 50)

# Feature 1: Character count
df_balanced['char_count'] = email_text.str.len()

# Feature 2: Word count (whitespace-separated tokens)
df_balanced['word_count'] = email_text.str.split().str.len()

# Feature 3: Exclamation mark count
df_balanced['exclamation_count'] = email_text.str.count('!')

# Feature 4: Uppercase letter ratio
df_balanced['uppercase_ratio'] = email_text.apply(_uppercase_ratio)

# Feature 5: Contains a link (http/https or www)
# Matched case-insensitively, like the login-keyword feature below, so that
# "HTTP://..." and "WWW...." are also detected.
df_balanced['has_link'] = email_text.str.contains(
    r"http[s]?://|www\.", case=False, regex=True).astype(int)

# Feature 6: Contains login-related keywords
df_balanced['has_login_word'] = email_text.str.contains(
    r"login|password|verify|account", case=False, regex=True).astype(int)

# Feature 7: Contains HTML tags ("<" followed by at least one non-">" character and ">")
df_balanced['has_html'] = email_text.str.contains(
    r"<[^>]+>", regex=True).astype(int)

# Keep only the first occurrence of each column name. Assigning to an existing
# column overwrites it, so re-running this cell does not create duplicates;
# this line is a safeguard only.
df_balanced = df_balanced.loc[:, ~df_balanced.columns.duplicated()]

# Confirm structure
print("\n🧠 Sample Extracted Features (First 5):")
feature_cols = ['Email Preview', 'char_count', 'word_count', 'exclamation_count',
                'uppercase_ratio', 'has_link', 'has_login_word', 'has_html']
display(df_balanced[feature_cols].head(5))

print("\n✅ Final Columns:")
print(df_balanced.columns.tolist())



🧠 Sample Extracted Features (First 5):


Unnamed: 0,Email Preview,char_count,word_count,exclamation_count,uppercase_ratio,has_link,has_login_word,has_html
0,INVESTMENT SCHOLARS CLUB- bringing you the lat...,3893,586,2,0.034,0,1,0
1,semantics : il dominio tempo-aspettuale il dom...,904,154,0,0.0,0,0,0
2,mature mom and her young horny lover ! . . woo...,664,141,1,0.0,0,0,0
3,do you own a car ; starting december 7 th ford...,792,170,0,0.0,0,0,0
4,rescue you from highprice medicaments and badp...,796,163,0,0.0,0,0,0



✅ Final Columns:
['Email Text', 'Email Type', 'Label', 'Email Preview', 'char_count', 'word_count', 'exclamation_count', 'uppercase_ratio', 'has_link', 'has_login_word', 'has_html']
