In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import MaxAbsScaler
from utils.get_embeddings import load_df
from utils.text_preprocessing import full_preprocess

In [2]:
# Read the labelled ham/spam training set from the local data directory.
train_csv_path = os.path.join('data', 'train_spam.csv')
df = load_df(train_csv_path)
df

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...
...,...,...
16273,spam,if you are interested in binary options tradin...
16274,spam,dirty pictureblyk on aircel thanks you for bei...
16275,ham,or you could do this g on mon 1635465 sep 1635...
16276,ham,insta reels par 80 ‡§ó‡§Ç‡§¶ bhara pada hai üëÄ kuch b...


In [3]:
# Run the project's preprocessing helper; the rest of the notebook reads the
# `preprocessed_text` column from its result.
preprocessed_df = full_preprocess(df)
df = preprocessed_df
df

Unnamed: 0,text_type,text,lemmatized_tokens,preprocessed_text
0,ham,"[make, sure, alex, knows, birthday, fifteen, m...","[make, sure, alex, know, birthday, fifteen, mi...",make sure alex know birthday fifteen minute fa...
1,ham,"[resume, john, lavorato, thanks, vince, get, m...","[resume, john, lavorato, thanks, vince, get, m...",resume john lavorato thanks vince get moving r...
2,spam,"[plzz, visit, website, moviesgodml, get, movie...","[plzz, visit, website, moviesgodml, get, movie...",plzz visit website moviesgodml get movie free ...
3,spam,"[urgent, mobile, number, awarded, ¬£2000, prize...","[urgent, mobile, number, awarded, ¬£2000, prize...",urgent mobile number awarded ¬£2000 prize guara...
4,ham,"[overview, hr, associates, analyst, project, p...","[overview, hr, associate, analyst, project, pe...",overview hr associate analyst project per davi...
...,...,...,...,...
16273,spam,"[interested, binary, options, trading, may, co...","[interested, binary, option, trading, may, con...",interested binary option trading may continue ...
16274,spam,"[dirty, pictureblyk, aircel, thanks, valued, m...","[dirty, pictureblyk, aircel, thanks, valued, m...",dirty pictureblyk aircel thanks valued member ...
16275,ham,"[could, g, mon, 1635465, sep, 1635465, david, ...","[could, g, mon, 1635465, sep, 1635465, david, ...",could g mon 1635465 sep 1635465 david rees wro...
16276,ham,"[insta, reels, par, 80, ‡§ó‡§Ç‡§¶, bhara, pada, hai,...","[insta, reel, par, 80, ‡§ó‡§Ç‡§¶, bhara, pada, hai, ...",insta reel par 80 ‡§ó‡§Ç‡§¶ bhara pada hai üëÄ kuch bh...


In [4]:
# Encode the target as integers: ham -> 0, spam -> 1.
# A plain `.map(dict)` turns every value that is missing from the dict into
# NaN, so re-running this cell on already-encoded labels would silently wipe
# the target column. Falling back to the current value keeps the cell
# idempotent.
label_to_id = {'ham': 0, 'spam': 1}
df['text_type'] = df['text_type'].map(lambda label: label_to_id.get(label, label))

# Fail here, next to the cause, rather than deep inside model fitting if an
# unexpected or missing label is present.
assert df['text_type'].isin([0, 1]).all(), "text_type must contain only 'ham'/'spam' labels"

In [5]:
def print_metrics(model, X_test, y_test):
    """Print ROC-AUC, F1, precision, recall and accuracy of ``model`` on a test set.

    ROC-AUC is computed from column 1 of ``model.predict_proba(X_test)``;
    the other metrics use the labels returned by ``model.predict(X_test)``.
    The report is written to stdout, one metric per line; nothing is returned.
    """
    proba = model.predict_proba(X_test)
    predicted = model.predict(X_test)
    # (label, value) pairs in print order; the trailing space in 'F1 ' is part
    # of the established output format.
    report = [
        ('ROC-AUC', roc_auc_score(y_test, proba[:, 1])),
        ('F1 ', f1_score(y_test, predicted)),
        ('Precision', precision_score(y_test, predicted)),
        ('Recall', recall_score(y_test, predicted)),
        ('Accuracy', accuracy_score(y_test, predicted)),
    ]
    print('\n'.join(f'{label}: {value}' for label, value in report))

In [6]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
texts = df['preprocessed_text']
labels = df['text_type']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.33,
    random_state=42,
)

In [7]:
# Bag-of-words over word 1- to 5-grams, tokenised with NLTK's TweetTokenizer.
tknzr = TweetTokenizer()
vect = CountVectorizer(
    ngram_range=(1, 5),
    tokenizer=tknzr.tokenize,
    # The default regex token_pattern is ignored once a custom tokenizer is
    # supplied; passing None states that explicitly and stops scikit-learn
    # from warning about the unused parameter on every fit.
    token_pattern=None,
)
# Learn the vocabulary on the training texts only, then reuse it for the test texts.
bow = vect.fit_transform(X_train)
bow_test = vect.transform(X_test)
bow.shape



(10906, 1191804)

In [8]:
# Scale every feature by its maximum absolute value in the training matrix;
# the test matrix is transformed with the training scale (no refit).
scaler = MaxAbsScaler().fit(bow)
bow = scaler.transform(bow)
bow_test = scaler.transform(bow_test)

In [9]:
# Logistic regression on the scaled bag-of-words features.
logreg_params = {'max_iter': 10000, 'random_state': 42}
clf = LogisticRegression(**logreg_params)
clf.fit(X=bow, y=y_train)

In [10]:
# Held-out metrics for the bag-of-words model.
print_metrics(model=clf, X_test=bow_test, y_test=y_test)

ROC-AUC: 0.9880864880108773
F1 : 0.6991333058192324
Precision: 0.994131455399061
Recall: 0.539147040101846
Accuracy: 0.8642963514519731


In [11]:
# TF-IDF over word 1- to 3-grams, reusing the TweetTokenizer instance `tknzr`
# created in the bag-of-words cell. Terms found in more than 90% of the
# training documents, or in fewer than 0.01% of them, are dropped.
vect2 = TfidfVectorizer(
    ngram_range=(1, 3),
    tokenizer=tknzr.tokenize,
    # The default regex token_pattern is ignored once a custom tokenizer is
    # supplied; passing None states that explicitly and stops scikit-learn
    # from warning about the unused parameter on every fit.
    token_pattern=None,
    max_df=0.9,
    min_df=1e-4,
)
# Learn vocabulary and IDF weights on the training texts only.
bow2 = vect2.fit_transform(X_train)
bow2_test = vect2.transform(X_test)
bow2.shape



(10906, 95428)

In [12]:
# Fresh scaler for the TF-IDF features: fitted on the training matrix, then
# applied unchanged to the test matrix.
scaler = MaxAbsScaler().fit(bow2)
bow2 = scaler.transform(bow2)
bow2_test = scaler.transform(bow2_test)

In [13]:
# Logistic regression on the scaled TF-IDF features (same hyper-parameters as
# the bag-of-words model; `clf` now refers to this second model).
logreg_params = {'max_iter': 10000, 'random_state': 42}
clf = LogisticRegression(**logreg_params)
clf.fit(X=bow2, y=y_train)

In [14]:
# Held-out metrics for the TF-IDF model.
print_metrics(model=clf, X_test=bow2_test, y_test=y_test)

ROC-AUC: 0.9892021447001033
F1 : 0.9076305220883534
Precision: 0.9569513055751588
Recall: 0.8631444939528963
Accuracy: 0.9486224869694714


С TF-IDF получились вполне себе годные метрики