In [1]:
from spam_email_patterns import adult_content_patterns, patterns
import pandas as pd
import re

In [2]:
# Load the merged dataset into `df`. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("./Datasets/merged_spam_dataset.csv")

In [3]:
# Function to determine data type
def get_type(value):
    """Report what kind of scalar ``value`` looks like.

    Missing values are recognised first; real floats are reported as floats;
    anything else is probed by attempting numeric conversion, which is how
    numeric-looking strings such as ``"123"`` or ``"1.5"`` are detected.

    Args:
        value: A single scalar cell value (string, number, ``None``/NaN).

    Returns:
        One of ``"NoneType"``, ``"int"``, ``"float"`` or ``"str"``.
    """
    if pd.isna(value):
        return "NoneType"

    # A genuine float must not go through int(): int(3.7) silently truncates
    # (so it would be reported as "int") and int(float("inf")) raises
    # OverflowError.
    if isinstance(value, float):
        return "float"

    try:
        int(value)
        return "int"
    except (ValueError, OverflowError):
        pass

    try:
        float(value)
        return "float"
    except (ValueError, OverflowError):
        return "str"

In [4]:
# Tag every row with the kind of value found in its `text` cell, using the
# label returned by get_type ("str", "int", "float" or "NoneType").
df["text_type"] = df['text'].apply(get_type)

# Tally how many rows fall into each kind.
type_counts = df["text_type"].value_counts()

# Bare last expression: shown as the cell's output.
type_counts

text_type
str         95729
NoneType       17
int             3
Name: count, dtype: int64

In [5]:
# Keep only the rows whose `text` was tagged "str". `.copy()` makes the result
# an independent frame, so the column assignments in later cells do not write
# into a slice of the original frame (pandas' SettingWithCopyWarning).
df = df[df["text_type"] == "str"].copy()

In [6]:
def classify_scam(text):
    """Flag a message that matches any adult-content pattern.

    Each regex in ``adult_content_patterns`` is searched in ``text``
    case-insensitively; scanning stops at the first match.

    Args:
        text: The message body to scan.

    Returns:
        1 if at least one pattern matches, otherwise 0.

    Raises:
        TypeError: If searching with a pattern fails (for example a malformed
            regex or a non-string ``text``). The underlying exception is
            attached as ``__cause__``.
    """
    for pattern in adult_content_patterns:
        try:
            hit = re.search(pattern, text, re.IGNORECASE)
        except Exception as exc:
            # Catch Exception rather than using a bare ``except``: the latter
            # would also convert KeyboardInterrupt/SystemExit into TypeError,
            # making a long-running apply impossible to interrupt cleanly.
            print(f"Error in pattern {pattern}")
            raise TypeError(
                f"classify_scam failed on pattern {pattern!r}: {exc}"
            ) from exc
        if hit:
            return 1  # Spam
    return 0  # Not spam

In [7]:
# Flag each row with classify_scam's result for its text (1 = matched, 0 = not).
df["is_scam"] = df["text"].apply(classify_scam)
# Bare last expression: display only the flagged rows as the cell's output.
df[df["is_scam"] == 1]

Unnamed: 0.1,Unnamed: 0,source,text,label,subject,text_type,is_scam
7,7,enron.csv,there is this costless dating website packed w...,1,find sex - addicts in your area,str,1
12,12,enron.csv,this is a great dating site ! ! !\r\nplease re...,1,looking for love ? tonight ?,str,1
16,16,enron.csv,the only fix to penis growth\r\nlimited time o...,1,,str,1
19,19,enron.csv,here ' s a special offer for you . . .\r\nwant...,1,"attract the opposite sex , the ultimate guide ...",str,1
20,20,enron.csv,"how ' re you doing there , paliourg\r\ncialis ...",1,health order for paliourg,str,1
...,...,...,...,...,...,...,...
95499,95499,fraud_detect.csv,You are being contacted by our Dating Service ...,spam,,str,1
95527,95527,fraud_detect.csv,Someone U know has asked our dating service 2 ...,spam,,str,1
95647,95647,fraud_detect.csv,Want explicit SEX in 30 secs? Ring 02073162414...,spam,,str,1
95702,95702,fraud_detect.csv,"Mila, age23, blonde, new in UK. I look sex wit...",spam,,str,1


In [8]:
# Show the full text of a single row. NOTE: `iloc` is positional, so this is
# the 8th row of the filtered frame; it coincides with index label 7 only as
# long as no earlier row was removed by the filter.
df.iloc[7]["text"]

'there is this costless dating website packed with countless sexoholics\r\nno gifts , no walks on the beach , just meetup to get laid : )\r\nthere are also some who want something more serious though\r\nso if you want a one night stand , or a long term relationship , you got it ; )\r\nwhatever floats your boat pretty much !\r\nhttp : / / www . cumherecomic . net\r\n'

In [9]:
def classify_scam_test(text):
    """Probe one adult-content pattern in isolation.

    Only the pattern at index 2 of ``adult_content_patterns`` is searched in
    ``text``, and no regex flags are passed to the search.

    Returns:
        1 if that pattern matches, otherwise 0.
    """
    probe_pattern = adult_content_patterns[2]
    return 1 if re.search(probe_pattern, text) else 0

In [10]:
# Show the rows hit by the single-pattern probe. The mask is computed on the
# fly instead of being stored in (and then dropped from) a scratch column, so
# `df` is never mutated and the cell can be re-run or interrupted safely.
print(df[df["text"].apply(classify_scam_test) == 1])

       Unnamed: 0            source  \
19             19         enron.csv   
20             20         enron.csv   
22             22         enron.csv   
74             74         enron.csv   
83             83         enron.csv   
...           ...               ...   
95386       95386  fraud_detect.csv   
95479       95479  fraud_detect.csv   
95647       95647  fraud_detect.csv   
95702       95702  fraud_detect.csv   
95725       95725  fraud_detect.csv   

                                                    text label  \
19     here ' s a special offer for you . . .\r\nwant...     1   
20     how ' re you doing there , paliourg\r\ncialis ...     1   
22     good morning sir ,\r\ncheck out the discounts ...     1   
74     extra - time - last 5 - 10 times longer !\r\nt...     1   
83     how does viagra professional work ?\r\nviagra ...     1   
...                                                  ...   ...   
95386  Want explicit SEX in 30 secs? Ring 02073162414...  spam   
954

In [11]:
# Preview the zero-initialised score table: one entry per key of `patterns`.
print(dict.fromkeys(patterns, 0))

{'Adult Content': 0, 'Lottery Scam': 0, 'Financial Fraud': 0, 'Advertisement': 0, 'Phishing': 0, 'Malware': 0}


In [12]:
def classify_spam_single_label(text):
    """Assign a message to the single category whose patterns match it most.

    The input is stringified and lower-cased, then every regex listed under
    each category of ``patterns`` is searched in it. A category's score is the
    number of its regexes that match.

    Returns:
        The name of the highest-scoring category. On a tie the category that
        comes first in ``patterns`` wins; if nothing matches at all the result
        is ``'Other Spam'``.
    """
    lowered = str(text).lower()

    # Score = how many of the category's regexes are found in the text.
    hits_per_category = {
        name: sum(1 for regex in regexes if re.search(regex, lowered))
        for name, regexes in patterns.items()
    }

    # max() returns the first maximal key, which preserves the
    # earliest-category-wins tie-break.
    winner = max(hits_per_category, key=hits_per_category.get, default=None)
    if winner is None or hits_per_category[winner] <= 0:
        return 'Other Spam'
    return winner

In [13]:
# Label every row of `df` with the category returned by
# classify_spam_single_label and store it in a new `spam_type` column.
# NOTE(review): no filter on `label` is applied before this point, so every
# remaining row is classified, not only the ones labelled as spam; confirm
# that is intended.
df['spam_type'] = df['text'].apply(classify_spam_single_label)

# Print how many rows landed in each category.
print(df['spam_type'].value_counts())

spam_type
Financial Fraud    49039
Other Spam         24596
Advertisement      13351
Adult Content       4983
Phishing            1337
Malware             1222
Lottery Scam        1201
Name: count, dtype: int64


In [14]:
# Build one filtered frame per category in a single statement; the order of
# the category labels below matches the order of the target names.
(
    malware_df,
    adult_content_df,
    lottery_scam_df,
    financial_fraud_df,
    advertisement_df,
    phishing_df,
) = [
    df[df["spam_type"] == category_label]
    for category_label in (
        "Malware",
        "Adult Content",
        "Lottery Scam",
        "Financial Fraud",
        "Advertisement",
        "Phishing",
    )
]

In [15]:
# Write the rows whose spam_type is "Other Spam" to a CSV in the working
# directory, leaving the frame's index out of the file.
df.loc[df["spam_type"] == "Other Spam"].to_csv(
    "./other_spam_emails.csv",
    index=False,
)