In [7]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
# Read Data for the Fraudulent Email Kaggle Challenge
df = pd.read_csv("kg_train.csv", encoding='latin-1')

# Reduce the training set to speed up development
df = df.head(1000)
print(df.shape)
df.fillna("", inplace=True)

(1000, 2)


In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack


In [21]:
# Divide the dataset into training and test sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)


In [30]:
# Download the stopwords from NLTK
nltk.download('stopwords', quiet=True)

# Ensure the stopwords resource is available
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


In [31]:
# Initialize stemmer
snowball = SnowballStemmer('english')

# Function for preprocessing text
def preprocess_text(text):
    # Remove inline JavaScript/CSS
    text = re.sub(r'<(script|style).*?>.*?</\1>', '', text, flags=re.DOTALL)
    # Remove HTML comments
    text = re.sub(r'<!--.*?-->', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove single characters
    text = re.sub(r'\b[a-zA-Z]\b', '', text)
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    # Stemming
    text = ' '.join(snowball.stem(word) for word in text.split())
    return text

# Apply preprocessing
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)
test_df['preprocessed_text'] = test_df['text'].apply(preprocess_text)


In [None]:
# Feature Extraction Adding Extra Features

In [34]:
money_simbol_list = "|".join(["euro", "dollar", "pound", "€", "$"])
suspicious_words = "|".join(["free", "cheap", "sex", "money", "account", "bank", "fund", "transfer", "transaction", "win", "deposit", "password"])

train_df['money_mark'] = train_df['preprocessed_text'].str.contains(money_simbol_list)*1
train_df['suspicious_words'] = train_df['preprocessed_text'].str.contains(suspicious_words)*1
train_df['text_len'] = train_df['preprocessed_text'].apply(lambda x: len(x))

test_df['money_mark'] = test_df['preprocessed_text'].str.contains(money_simbol_list)*1
test_df['suspicious_words'] = test_df['preprocessed_text'].str.contains(suspicious_words)*1
test_df['text_len'] = test_df['preprocessed_text'].apply(lambda x: len(x))

train_df.head()


Unnamed: 0,text,label,preprocessed_text,money_mark,suspicious_words,text_len
29,"----------- REGARDS, MR NELSON SMITH.KINDLY RE...",1,regard mr nelson smithkind repli privat email ...,1,0,72
535,I have not been able to reach oscar this am. W...,0,abl reach oscar suppos send pdb receiv,1,0,38
695,; Huma Abedin B6I'm checking with Pat on the 5...,0,huma abedin bim check pat work jack jake resta...,1,0,76
557,I can have it announced here on Monday - can't...,0,announc monday cant today,1,0,25
836,BANK OF AFRICAAGENCE SAN PEDRO14 BP 1210 S...,1,bank africaag san pedro bp san pedro cote divo...,1,1,959


In [36]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_df['preprocessed_text'])
X_test_counts = count_vect.transform(test_df['preprocessed_text'])

print(X_train_counts.shape)
print(X_test_counts.shape)


(800, 14939)
(200, 14939)


In [38]:
# Initialize TFIDF Vectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(train_df['preprocessed_text'])
X_test_tfidf = tfidf_vect.transform(test_df['preprocessed_text'])

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)


(800, 14939)
(200, 14939)


In [39]:
# Train a MultinomialNB classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, train_df['label'])

# Predict on test data
predictions = clf.predict(X_test_tfidf)

# Evaluate the classifier
print(classification_report(test_df['label'], predictions))
print('Accuracy:', accuracy_score(test_df['label'], predictions))


              precision    recall  f1-score   support

           0       0.99      0.87      0.93       125
           1       0.82      0.99      0.90        75

    accuracy                           0.92       200
   macro avg       0.91      0.93      0.91       200
weighted avg       0.93      0.92      0.92       200

Accuracy: 0.915


In [None]:
#Extra Task - Implement a SPAM/HAM Classifier

In [43]:
# Since we have to use MultinomialNB with default parameters, we focus on improving the feature representation.
# Combining features with TFIDF

# Combine the additional features with the TFIDF vectorized features
from scipy.sparse import hstack

extra_features_train = train_df[['money_mark', 'suspicious_words', 'text_len']].values
extra_features_test = test_df[['money_mark', 'suspicious_words', 'text_len']].values

X_train_combined = hstack([X_train_tfidf, extra_features_train])
X_test_combined = hstack([X_test_tfidf, extra_features_test])

# Train and evaluate the classifier with the combined features
clf = MultinomialNB()
clf.fit(X_train_combined, train_df['label'])

# Predict on test data
predictions_combined = clf.predict(X_test_combined)

# Evaluate the classifier
print(classification_report(test_df['label'], predictions_combined))
print('Accuracy:', accuracy_score(test_df['label'], predictions_combined))


              precision    recall  f1-score   support

           0       0.99      0.64      0.78       125
           1       0.62      0.99      0.76        75

    accuracy                           0.77       200
   macro avg       0.80      0.81      0.77       200
weighted avg       0.85      0.77      0.77       200

Accuracy: 0.77
