In [1]:
import numpy as np
import pandas as pd

#import SKLEARN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

#import JCOPML
from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

from luwiji.text_proc import illustration

In [2]:
#import nltk token for punc and Indonesian StopWords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

Stopwords_indonesia = stopwords.words('indonesian') + list(punctuation)

# Import Data 

In [3]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


Source Data: Rahmi, F. and Wibisono, Y. (2016). Aplikasi SMS Spam Filtering pada Android menggunakan Naive Bayes, Unpublished manuscript.

#  Dataset Splitting

In [4]:
# split X y for training and testing to avoid data leakage
X = df.Teks
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914,), (229,), (914,), (229,))

# Training

In [5]:
#Train Model with XGBoost and Cross Validation with Randomized Search
#Randomized Search Hyperparameters with JCOPML
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [7]:
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=Stopwords_indonesia)),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])

model = RandomizedSearchCV(pipeline, rsp.xgb_params, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   52.2s finished
  'stop_words.' % sorted(inconsistent))


{'algo__colsample_bytree': 0.6918516030703091, 'algo__gamma': 1, 'algo__learning_rate': 0.1216411342539395, 'algo__max_depth': 3, 'algo__n_estimators': 185, 'algo__reg_alpha': 0.011502956321912733, 'algo__reg_lambda': 0.009461469059966103, 'algo__subsample': 0.7865052773762229}
0.9715536105032823 0.9124784296807592 0.9606986899563319


# Predict

In [8]:
#input your text
text = ['Is This Fake?']

In [9]:
model.predict(text)
if model.predict(text) == 1:
    print(f'FAKE! {model.predict_proba(text)}')
else:
    print(f'REAL! {model.predict_proba(text)}')

REAL! [[0.8804764  0.11952356]]
