In [172]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import string
from unidecode import unidecode
import emoji

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on: 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
# NOTE(review): 'punkt' is presumably for word_tokenize, which is imported but
# not called in the cells shown here — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\OlegKashurin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Data loading

In [236]:
# Training CSV, resolved relative to the notebook's working directory.
file_train = 'train_spam.csv'

# Full labelled dataset; the cells below read its 'text' and 'text_type' columns.
data_train_all = pd.read_csv(file_train)

In [237]:
# Message bodies and their string labels, as pandas Series.
texts_train_all = data_train_all['text']
labels_all = data_train_all['text_type']
# Integer-encode the labels using the explicit category order: 'ham' -> 0, 'spam' -> 1.
# NOTE(review): a label outside these two categories would be encoded as -1 —
# confirm the column only ever holds 'ham' / 'spam'.
target_all = pd.Categorical(labels_all, categories=['ham', 'spam']).codes

In [238]:
# Eyeball a slice of 100 raw messages (slice 60:160) before any preprocessing.
print(texts_train_all[60:160].to_numpy())

['gud gudk chikku tke care sleep well gud nyt'
 'sure but since my parents will be working on tuesday i dont really need a cover story'
 'url url date 1635465 1635465 1635465t1635465 1635465 1635465 1635465 1635465 the observer profile brazil goes to the polls today and all the indications are that its 1635465 million voters will elect a man who once sold peanuts on the street luis lula da silva'
 'that said can you text him one more time'
 '🎉🎊 channel for sell 🎉🎊 ━━━━━━━━━━━━━━━━━ 🎉🎊 100 sale 🎉🎊 ━━━━━━━━━━━━━━━━━ group link members78kmembers price 500 ━━━━━━━━━━━━━━━━━ contact me @lakshyaytm if u are limited @lakshyasbot ━━━━━━━━━━━━━━━━━ first payment then owner'
 'hello i need to sell my op id ani one need dm me fast fast'
 'message was flagged as spam devs will use this to improve spam protection algorithm'
 'lsu seminar visit jim i can send you copies of the reprints of some papers i wrote or co authored please let me know how many copies do you need i shall prepare power point pr

## Preprocessing 1

In [217]:
def remove_hyperlink(word):
    """Strip ``http``/``https`` URLs from ``word``.

    A URL is taken to be ``http`` followed by any run of non-whitespace
    characters; it is removed without inserting a replacement.

    The match is case-insensitive because this step runs before lower-casing
    in the preprocessing pipeline, so links written as ``HTTP://...`` or
    ``Https://...`` would otherwise survive.

    Parameters
    ----------
    word : str
        Text to clean (a whole message, despite the parameter name).

    Returns
    -------
    str
        The text with every matched URL deleted.
    """
    return re.sub(r"http\S+", "", word, flags=re.IGNORECASE)

def to_lower(word):
    """Return ``word`` converted to lower case."""
    return word.lower()

def remove_punctuation(word):
    """Replace each character that is neither a word character nor whitespace with a space.

    Underscores count as word characters for ``\\w`` and are therefore kept.
    """
    non_word_char = re.compile(r'[^\w\s]')
    return non_word_char.sub(' ', word)

def emoji_to_text(word):
    """Return ``word`` with emoji converted to text by ``emoji.demojize``."""
    demojized = emoji.demojize(word)
    return demojized

def remove_whitespace(word):
    """Return ``word`` without leading and trailing whitespace (inner whitespace is kept)."""
    return word.strip()

def replace_newline(word):
    """Replace line breaks in ``word`` with a single space.

    Each run of ``\\r`` / ``\\n`` characters becomes one space. Deleting the
    line break outright would glue the last word of one line to the first
    word of the next (``"free\\nprize"`` -> ``"freeprize"``) and create
    tokens that never occurred in the message.

    Parameters
    ----------
    word : str
        Text to clean (a whole message, despite the parameter name).

    Returns
    -------
    str
        The text with every run of line-break characters collapsed to a space.
    """
    return re.sub(r'[\r\n]+', ' ', word)

def remove_number(word, keyword=' NUMBER '):
    """Replace every word-like token that contains at least one digit with ``keyword``.

    The whole token is replaced, not just its digits, so ``"abc123"`` is
    swapped out entirely.
    """
    digit_token = re.compile(r'\b\w*\d\w*\b')
    return digit_token.sub(keyword, word)

def remove_currency(word, keyword=' CURRENCY '):
    """Replace each pound, dollar, euro or rupee sign in ``word`` with ``keyword``."""
    currency_sign = re.compile(r'[£$€₹]')
    return currency_sign.sub(keyword, word)

In [218]:
def preprocess_pipeline(sentence):
    preprocess_utils = [
                        remove_hyperlink,
                        replace_newline,
                        to_lower,
                        emoji_to_text,
                        remove_currency,
                        remove_number,
                        remove_punctuation,
                        remove_whitespace
                    ]
    for func in preprocess_utils:
        sentence = func(sentence)
    return sentence

In [219]:
# Clean every raw message; the result is a plain list of strings in the same
# order as `texts_train_all`.
data_train_preprocess = list(map(preprocess_pipeline, texts_train_all.to_numpy()))

In [220]:
# Preview the first 30 cleaned messages.
data_train_preprocess[:30]

['make sure alex knows his birthday is over in fifteen minutes as far as youre concerned',
 'a resume for john lavorato thanks vince i will get moving on it right away molly vince j kaminski  NUMBER   NUMBER   NUMBER   NUMBER   NUMBER  pm to molly magee hou ect ect cc vince j kaminski hou ect ect subject a resume for john lavorato molly please make arrangements for the interview with this candidate for a trading position interviews with john lavorato jeff shankman gary hickerson stinson gibner i talked to him in new york and he is considering other opportunities so we have to act fast i think john will like him more than punit thanks',
 'plzz visit my website moviesgodml to get all movies for free and also i provide direct download links no redirect and ads smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  smiling_face_with_smiling_eyes  beaming_face_with_smiling_eyes',
 'urgent your mobile number has been awarded with a  CURRENCY  NUMBER  

In [221]:
# Shared lemmatizer instance used by word_lemmatizer().
lemmatizer = WordNetLemmatizer()

# Shared tokenizer instance used by tokenize_pipeline().
tokenizer = TweetTokenizer()

# English stop words held in a set so the membership test in
# remove_stop_words() is a hash lookup.
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [222]:
def remove_stop_words(words):
    """Return the tokens of ``words`` that are not in ``ENGLISH_STOP_WORDS``, in their original order."""
    return list(filter(lambda token: token not in ENGLISH_STOP_WORDS, words))

def word_lemmatizer(words):
    """Return a list with each token of ``words`` passed through ``lemmatizer.lemmatize``."""
    return list(map(lemmatizer.lemmatize, words))

In [223]:
def tokenize_pipeline(sentence):
    """Turn one cleaned message into a list of lemmatized, stop-word-free tokens.

    The message is tokenized with the shared ``tokenizer``, then the tokens
    are passed through ``remove_stop_words`` and ``word_lemmatizer`` in that
    order.
    """
    tokens = tokenizer.tokenize(sentence)
    tokens = remove_stop_words(tokens)
    tokens = word_lemmatizer(tokens)
    return tokens

In [224]:
# Tokenize every cleaned message; yields one token list per message, in the
# same order as `data_train_preprocess`.
data_train_tokenized = list(map(tokenize_pipeline, data_train_preprocess))

In [225]:
# Inspect the token list produced for the message at index 2.
print(data_train_tokenized[2])

['plzz', 'visit', 'website', 'moviesgodml', 'get', 'movie', 'free', 'also', 'provide', 'direct', 'download', 'link', 'redirect', 'ad', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'smiling_face_with_smiling_eyes', 'beaming_face_with_smiling_eyes']


In [226]:
# Sanity check: run a message written in stylized Unicode letters through both
# pipelines and print the resulting tokens.
# NOTE(review): `text` starts as a str and ends as a list of tokens; separate
# names would make the cell easier to follow.
text = '𝑰𝒕𝒔 𝒕𝒉𝒆 𝒔𝒆𝒄𝒐𝒏𝒅 𝒕𝒊𝒎𝒆 𝒐𝒇 𝒓𝒆𝒄𝒆𝒊𝒗𝒊𝒏𝒈 𝒎𝒚 𝒑𝒓𝒐𝒇𝒊𝒕 𝒇𝒓𝒐𝒎 𝒕𝒉𝒊𝒔 𝒊𝒏𝒗𝒆𝒔𝒕𝒎𝒆𝒏𝒕 𝒑𝒍𝒂𝒕𝒇𝒐𝒓𝒎 𝒂𝒏𝒅 𝑰 𝒏𝒆𝒆𝒅 𝒕𝒐 𝒕𝒆𝒔𝒕𝒊𝒇𝒚 𝒕𝒐 𝒚𝒐𝒖 𝒕𝒉𝒂𝒕 𝑰 𝒋𝒖𝒔𝒕 𝒓𝒆𝒄𝒆𝒊𝒗𝒆𝒅'
text = preprocess_pipeline(text)
text = tokenize_pipeline(text)
print(text)

['𝑰𝒕𝒔', '𝒕𝒉𝒆', '𝒔𝒆𝒄𝒐𝒏𝒅', '𝒕𝒊𝒎𝒆', '𝒐𝒇', '𝒓𝒆𝒄𝒆𝒊𝒗𝒊𝒏𝒈', '𝒎𝒚', '𝒑𝒓𝒐𝒇𝒊𝒕', '𝒇𝒓𝒐𝒎', '𝒕𝒉𝒊𝒔', '𝒊𝒏𝒗𝒆𝒔𝒕𝒎𝒆𝒏𝒕', '𝒑𝒍𝒂𝒕𝒇𝒐𝒓𝒎', '𝒂𝒏𝒅', '𝑰', '𝒏𝒆𝒆𝒅', '𝒕𝒐', '𝒕𝒆𝒔𝒕𝒊𝒇𝒚', '𝒕𝒐', '𝒚𝒐𝒖', '𝒕𝒉𝒂𝒕', '𝑰', '𝒋𝒖𝒔𝒕', '𝒓𝒆𝒄𝒆𝒊𝒗𝒆𝒅']


In [227]:
# Split the tokenized messages and their labels into training and validation
# sets. No test_size is passed, so scikit-learn's default applies (the shapes
# printed further down are 12208 / 4070 rows). `stratify` preserves the class
# ratio in both splits and `random_state` makes the shuffle reproducible.
texts_train, texts_val, y_train, y_val = train_test_split(
                                        data_train_tokenized,
                                        target_all,
                                        shuffle=True,
                                        random_state=42, 
                                        stratify=target_all)

## Feature extraction

In [228]:
# TfidfVectorizer expects strings, so join each token list back into one
# space-separated document.
texts_train_strings = [" ".join(sentence) for sentence in texts_train]

# lowercase=False: the text was already lower-cased during preprocessing, and
# the upper-case NUMBER / CURRENCY placeholders are kept as they are.
# The vocabulary and IDF weights are fitted on the training split only.
vectorizer = TfidfVectorizer(lowercase=False)
vectorizer.fit(texts_train_strings)

In [229]:
def convert_to_feature(data):
    """Vectorize tokenized documents with the already-fitted ``vectorizer``.

    Each element of ``data`` is a sequence of tokens; the tokens are joined
    with single spaces and the resulting strings are passed to
    ``vectorizer.transform``, whose result is returned.
    """
    joined_docs = []
    for tokens in data:
        joined_docs.append(" ".join(tokens))
    return vectorizer.transform(joined_docs)

In [230]:
# Feature matrices for both splits, produced by the same fitted vectorizer so
# their column spaces match.
X_train = convert_to_feature(texts_train)
X_val = convert_to_feature(texts_val)
print(X_train.shape, X_val.shape)

(12208, 38594) (4070, 38594)


## Training a Naive Bayes model

In [231]:
# Gaussian Naive Bayes classifier with default settings.
clf = GaussianNB()

In [232]:
# .toarray() converts the feature matrix to a dense array before fitting.
# NOTE(review): with the printed shape (12208, 38594) the dense copy needs
# roughly 3.5 GiB if the dtype is float64 — confirm this fits in memory.
clf.fit(X_train.toarray(), y_train)

In [233]:
# Class predictions for the validation split (densified in the same way as the
# training matrix).
y_pred = clf.predict(X_val.toarray())

In [234]:
def calc_metrics(y_true, y_pred, y_score=None):
    """Compute the binary-classification metrics for one evaluation run.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions; used for accuracy, precision, recall and F1.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``clf.predict_proba(X)[:, 1]``. When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to ``y_pred`` so
        existing two-argument calls keep returning the same values.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``, in that order.
    """
    # ROC AUC is a ranking metric: fed hard 0/1 predictions it collapses to a
    # single operating point and equals balanced accuracy, so prefer real
    # scores whenever the caller can supply them.
    auc_input = y_pred if y_score is None else y_score
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, auc_input),
    )

In [235]:
# Validation metrics, printed as (accuracy, precision, recall, f1, roc_auc).
print(calc_metrics(y_val, y_pred))

(0.7788697788697788, 0.5875870069605569, 0.8427620632279534, 0.6924128503075873, 0.7974270567185793)


In [213]:
# NOTE(review): duplicate of the evaluation cell above, with an older
# execution count (In [213]). Its saved output differs from the latest run, so
# it presumably reflects an earlier preprocessing variant and will not be
# reproduced by Restart & Run All. Record what changed in markdown or delete
# the cell.
print(calc_metrics(y_val, y_pred))

(0.8012285012285012, 0.6183022275737508, 0.8544093178036606, 0.7174292699965072, 0.8166746728488318)


In [175]:
# NOTE(review): another duplicate evaluation cell left over from an earlier
# run (execution count In [175]). Its saved output will not be reproduced by
# Restart & Run All. Record what it measured in markdown or delete the cell.
print(calc_metrics(y_val, y_pred))

(0.7791154791154791, 0.5878260869565217, 0.8435940099833611, 0.6928595831909805, 0.797843030096283)
