In [1]:
import pandas as pd
import numpy as np

# Load the data

In [2]:
spam_data = pd.read_csv('C:/Users/tonba/OneDrive/Desktop/Machine Learning Projects/datasets/spam filter dataset/emails.csv')

# Drops rows that have missing values
spam_data.dropna(axis=0, inplace=True)

# Converts the target variable data type to int type
spam_data['target'] = spam_data['target'].astype(int)

spam_data.head(10)

Unnamed: 0,text,target
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


# Let's look at some of the comments

In [3]:
spam_data.iloc[2]['text']

'Subject: unbelievable new homes made easy  im wanting to show you this  homeowner  you have been pre - approved for a $ 454 , 169 home loan at a 3 . 72 fixed rate .  this offer is being extended to you unconditionally and your credit is in no way a factor .  to take advantage of this limited time opportunity  all we ask is that you visit our website and complete  the 1 minute post approval form  look foward to hearing from you ,  dorcas pittman'

# Count the number of spam and non-spam emails

In [4]:
spam_data['target'].value_counts()

0    4358
1    1368
Name: target, dtype: int64

This is an imbalanced dataset. We use the AUC metric for model evaluation.

# Comparison of average lengths for spam and non-spam emails

In [5]:
spam_data['length'] = spam_data['text'].apply(len)
spam_data.head()

Unnamed: 0,text,target,length
0,Subject: naturally irresistible your corporate...,1,1484
1,Subject: the stock trading gunslinger fanny i...,1,598
2,Subject: unbelievable new homes made easy im ...,1,448
3,Subject: 4 color printing special request add...,1,500
4,"Subject: do not have money , get software cds ...",1,235


In [6]:
spam_data.groupby(by='target').mean()

  spam_data.groupby(by='target').mean()


Unnamed: 0_level_0,length
target,Unnamed: 1_level_1
0,1612.499771
1,1317.25731


# Split the data into test and train sets

In [7]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)

# Gaussian Naive Bayes with CountVectorizer feature extraction

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score

def count_vectorizer_method():
    # Apply count vectorizer to each email
    vect = CountVectorizer(stop_words={'english'}).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    
    # Calls the Gaussian Naive Bayes 
    model = MultinomialNB(alpha=.1)
    model.fit(X_train_vectorized.toarray(), y_train)
    
    # Make predictions
    predictions = model.predict(vect.transform(X_test).toarray())
    
    # AUC score
    model_auc_score = roc_auc_score(y_test, predictions)

    return model_auc_score

In [9]:
count_vectorizer_method()

0.9865338577291382

# TF-IDF feature extraction

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def TF_IDF_vectorizer_method():
    # TF-IDF vecotrizer. Min_df is a threshold value for ignoring words that have a count of less than the given min_df value.
    vect = TfidfVectorizer(min_df=3, stop_words={'english'}).fit(X_train)
    X_train_vectorized = vect.transform(X_train)
    
    # Multinomial Naive Bayes
    model = MultinomialNB(alpha=.01)
    model.fit(X_train_vectorized, y_train)
    
    # Make predictions
    predictions = model.predict(vect.transform(X_test))
    
    # AUC score
    auc_score = roc_auc_score(y_test, predictions)
    
    return auc_score

In [11]:
TF_IDF_vectorizer_method()

0.9772785567715458