In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import string
from unidecode import unidecode
import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data loading

In [5]:
file_train = 'train_spam.csv'

data_train_all = pd.read_csv(file_train)

In [6]:
texts_train_all = data_train_all['text']
labels_all = data_train_all['text_type']
target_all = pd.Categorical(labels_all, categories=['ham', 'spam']).codes

In [8]:
print(texts_train_all[300:320].to_numpy())

['development of a program in econo physics hello shirley i d understood from yannis that he d proposed a brown bag lunch where i d give a talk and then the discussion about various possibilities would follow i m here until about mid may and then will go on leave in europe for a year i could be available to make trips back and forth from time to time to get things started best regards joe mccauley'
 'at 1635465 1635465 pm 1635465 on 1635465 1635465 1635465 mr fork wrote i realize now that after reviewing the past several years of work and career i have been in the wrong business the wrong business this is what i should have been doing url a new manual has been published url or the one i read http http search barnesandnoble com booksearch isbninquiry asp userid 1635465t1635465jtegkqp isbn 1635465 and of course the open source versions url url r a hettinga mailto rah ibuc com the internet bearer underwriting corporation url 1635465 farquhar street boston ma 1635465 usa however it may des

## Preprocessing

In [9]:
def remove_hyperlink(word):
    return  re.sub(r"http\S+", "", word)

def to_lower(word):
    result = word.lower()
    return result

def remove_punctuation(word):
    # result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    punctuation_pattern = r'[^\w\s]'
    return re.sub(punctuation_pattern, ' ', word)

def emoji_to_text(word):
    return emoji.demojize(word)

def remove_whitespace(word):
    result = word.strip()
    return result

def replace_newline(word):
    return word.replace('\n', '')

def remove_number(word, keyword=' NUMBER '):
    result = re.sub(r'\b\w*\d\w*\b', keyword, word)
    return result

def remove_currency(word, keyword=' CURRENCY '):
    currency_pattern = r'[£$€₹]'
    return re.sub(currency_pattern, keyword, word)

In [30]:
def preprocess_pipeline(sentence):
    preprocess_utils = [
                        remove_hyperlink,
                        replace_newline,
                        to_lower,
                        emoji_to_text,
                        remove_currency,
                        remove_number,
                        unidecode,
                        remove_punctuation,
                        remove_whitespace
                    ]
    for func in preprocess_utils:
        sentence = func(sentence)
    return sentence

In [31]:
data_train_preprocess = [
    preprocess_pipeline(sent) for sent in texts_train_all.to_numpy()
]

In [32]:
data_train_preprocess[:20]

['make sure alex knows his birthday is over in fifteen minutes as far as youre concerned',
 'a resume for john lavorato thanks vince i will get moving on it right away molly vince j kaminski  NUMBER   NUMBER   NUMBER   NUMBER   NUMBER  pm to molly magee hou ect ect cc vince j kaminski hou ect ect subject a resume for john lavorato molly please make arrangements for the interview with this candidate for a trading position interviews with john lavorato jeff shankman gary hickerson stinson gibner i talked to him in new york and he is considering other opportunities so we have to act fast i think john will like him more than punit thanks',
 'plzz visit my website moviesgodml to get all movies for free and also i provide direct download links no redirect and ads smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  beaming_face_with_smiling_eyes',
 'urgent your mobile number has been awarded with a  CURRENCY  NUMBER  

In [33]:
lemmatizer = WordNetLemmatizer()

tokenizer = TweetTokenizer()

ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [34]:
def remove_stop_words(words):
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return result

def word_lemmatizer(words):
    return [lemmatizer.lemmatize(s) for s in words]

In [35]:
def tokenize_pipeline(sentence):
    words = tokenizer.tokenize(sentence)
    tokenize_utils = [remove_stop_words, word_lemmatizer]
    for func in tokenize_utils:
        words = func(words)
    return words

In [36]:
data_train_tokenized = [
    tokenize_pipeline(sent) for sent in data_train_preprocess
]

In [37]:
print(data_train_tokenized[2])

['plzz', 'visit', 'website', 'moviesgodml', 'get', 'movie', 'free', 'also', 'provide', 'direct', 'download', 'link', 'redirect', 'ad', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'beaming_face_with_smiling_eyes']


In [42]:
text = '𝑰𝒕𝒔 𝒕𝒉𝒆 𝒔𝒆𝒄𝒐𝒏𝒅 𝒕𝒊𝒎𝒆 𝒐𝒇 𝒓𝒆𝒄𝒆𝒊𝒗𝒊𝒏𝒈 𝒎𝒚 𝒑𝒓𝒐𝒇𝒊𝒕 𝒇𝒓𝒐𝒎 𝒕𝒉𝒊𝒔 𝒊𝒏𝒗𝒆𝒔𝒕𝒎𝒆𝒏𝒕 𝒑𝒍𝒂𝒕𝒇𝒐𝒓𝒎 𝒂𝒏𝒅 𝑰 𝒏𝒆𝒆𝒅 𝒕𝒐 𝒕𝒆𝒔𝒕𝒊𝒇𝒚 𝒕𝒐 𝒚𝒐𝒖 𝒕𝒉𝒂𝒕 𝑰 𝒋𝒖𝒔𝒕 𝒓𝒆𝒄𝒆𝒊𝒗𝒆𝒅'
text = preprocess_pipeline(text)
text = tokenize_pipeline(text)
print(text)

['Its', 'second', 'time', 'receiving', 'profit', 'investment', 'platform', 'I', 'need', 'testify', 'I', 'received']


In [43]:
texts_train, texts_val, y_train, y_val = train_test_split(
                                        data_train_tokenized,
                                        target_all,
                                        shuffle=True,
                                        random_state=42, 
                                        stratify=target_all)

## Feature extraction

In [61]:
texts_train_strings = [" ".join(sentence) for sentence in texts_train]

vectorizer = TfidfVectorizer(lowercase=False, max_features=5000)
vectorizer.fit(texts_train_strings)

In [62]:
def convert_to_feature(data):
    data_strings = [" ".join(sentence) for sentence in data]
    return vectorizer.transform(data_strings)

In [63]:
X_train = convert_to_feature(texts_train)
X_val = convert_to_feature(texts_val)
print(X_train.shape, X_val.shape)

(12208, 5000) (4070, 5000)


## Training naive bayes model

In [64]:
clf = GaussianNB()

In [65]:
clf.fit(X_train.toarray(), y_train)

In [66]:
y_pred = clf.predict(X_val.toarray())

In [67]:
def calc_metrics(y_true, y_pred):
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, y_pred)
    )

In [68]:
print(calc_metrics(y_val, y_pred))

(0.7616707616707616, 0.5591233435270132, 0.9126455906821963, 0.6934260429835651, 0.8055208427609031)


In [60]:
print(calc_metrics(y_val, y_pred))

(0.7619164619164619, 0.5618037135278514, 0.8810316139767055, 0.6861030126336248, 0.7965130175880738)


In [41]:
print(calc_metrics(y_val, y_pred))

(0.7788697788697788, 0.5875870069605569, 0.8427620632279534, 0.6924128503075873, 0.7974270567185793)
