In [200]:
import numpy as np 
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier

In [201]:
# Load the train and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/data-worthcheck-nlp/data_worthcheck/train.csv",
        "../input/data-worthcheck-nlp/data_worthcheck/test.csv",
    )
)

<h1> Preprocess </h1>

In [202]:
# Encode the string labels as integers (no -> 0, yes -> 1) in both splits.
# NOTE: Series.map turns any value missing from the dict into NaN, so running
# this cell a second time on already-encoded labels would wipe them out.
label_to_int = {'no': 0, 'yes': 1}
for split in (train_data, test_data):
    split["label"] = split["label"].map(label_to_int)

In [203]:
# Class balance of the training split; the two class counts should add up to
# the total number of rows.
print("TRAIN DATA")
len_train_0 = int(train_data["label"].eq(0).sum())
len_train_1 = int(train_data["label"].eq(1).sum())
len_train = train_data.shape[0]
for caption, count in (
    ("NO: ", len_train_0),
    ("YES: ", len_train_1),
    ("NO + YES = ", len_train_0 + len_train_1),
    ("TOTAL: ", len_train),
):
    print(caption, count)

TRAIN DATA
NO:  15512
YES:  6089
NO + YES =  21601
TOTAL:  21601


In [204]:
# Class balance of the test split; the two class counts should add up to the
# total number of rows.
print("TEST DATA")
len_test_0 = int(test_data["label"].eq(0).sum())
len_test_1 = int(test_data["label"].eq(1).sum())
len_test = test_data.shape[0]
for caption, count in (
    ("NO: ", len_test_0),
    ("YES: ", len_test_1),
    ("NO + YES = ", len_test_0 + len_test_1),
    ("TOTAL: ", len_test),
):
    print(caption, count)

TEST DATA
NO:  2093
YES:  707
NO + YES =  2800
TOTAL:  2800


In [205]:
# Preview the first five rows of the test split.
test_data.head(n=5)

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,0
1,detikcom untung depok masuk wilayah nya ridwan...,0
2,df dom jakarta depok yg gunain vc cabang nya c...,0
3,your2rl depok jkt,0
4,doakan indonesia selamat virus corona pkb depo...,1


In [206]:
# Keep only the text and label columns of the training split, selected by name.
# The previous positional slice (`iloc[:, 1:]`) removed one more column every
# time the cell was executed, so a second run silently dropped `text_a`.
# Selecting by name is idempotent; `text_a` and `label` are the only columns
# the later cells of this notebook read.
train_data = train_data[["text_a", "label"]].copy()

In [207]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,0
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,0
2,e100ss gini buka informasi sejelas nya identit...,1
3,neng solo wes ono terduga corona cobo neng ati...,0
4,midiahn nii akun gak takut takut nya isu coron...,0


In [208]:
print("START")
indonesian_stopwords = stopwords.words('indonesian')
# Membership tests against a set are O(1); testing against the raw list meant
# scanning the whole stopword list once per token of every sentence.
indonesian_stopword_set = set(indonesian_stopwords)


def remove_stopwords(sentence):
    """Return `sentence` with every Indonesian stopword token removed.

    The sentence is split on single spaces and the surviving tokens are
    re-joined with single spaces; no other normalisation is applied.
    """
    return " ".join(
        word for word in sentence.split(" ") if word not in indonesian_stopword_set
    )


# The same cleaning is applied to both splits so their vocabularies stay comparable.
train_stop_removed = [remove_stopwords(sentence) for sentence in train_data["text_a"]]
test_stop_removed = [remove_stopwords(sentence) for sentence in test_data["text_a"]]

print("FINISHED")

START
FINISHED


In [209]:
# Overwrite the text column of each split with its stopword-free version.
train_data["text_a"], test_data["text_a"] = train_stop_removed, test_stop_removed

<h3> CountVectorizer </h3>

In [210]:
# Bag-of-words features. The vocabulary is learned from the training text only
# and then reused to encode the test text.
cv = CountVectorizer(lowercase=True)
# Alternative configurations that were tried:
# cv = CountVectorizer(lowercase=True, min_df=0.01)
# cv = CountVectorizer(lowercase=True, ngram_range=(2,3))
# cv = CountVectorizer(lowercase=True, binary=True, ngram_range=(2,3))

# fit_transform learns the vocabulary and encodes the training text in a single
# pass instead of tokenizing the training corpus twice (fit, then transform).
train_data_vector_cv = cv.fit_transform(train_data["text_a"])
test_data_vector_cv = cv.transform(test_data["text_a"])

<h3> TF-IDF </h3>

In [211]:

# TF-IDF features. Vocabulary and IDF weights are learned from the training
# text only and then reused to encode the test text.
tfidf = TfidfVectorizer(lowercase=True)
# Alternative configurations that were tried:
# tfidf = TfidfVectorizer(lowercase=True, min_df=0.01)
# tfidf = TfidfVectorizer(lowercase=True, ngram_range=(2,3))
# tfidf = TfidfVectorizer(lowercase=True, binary=True, ngram_range=(2,3))

# fit_transform fits and encodes the training text in a single pass instead of
# tokenizing the training corpus twice (fit, then transform).
train_data_vector_tfidf = tfidf.fit_transform(train_data["text_a"])
test_data_vector_tfidf = tfidf.transform(test_data["text_a"])

<h1> Training and Evaluation </h1>

In [212]:
# One default-parameter XGBoost classifier per feature representation.
xgb_cv, xgb_tfidf = XGBClassifier(), XGBClassifier()

In [213]:
# Train on the bag-of-words features; the fitted estimator is the cell output.
xgb_cv.fit(X=train_data_vector_cv, y=train_data["label"])

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [214]:
# Train on the TF-IDF features; the fitted estimator is the cell output.
xgb_tfidf.fit(X=train_data_vector_tfidf, y=train_data["label"])

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [215]:
# Predict test labels with the model trained on bag-of-words features.
y_pred_cv = xgb_cv.predict(X=test_data_vector_cv)

In [216]:
# Predict test labels with the model trained on TF-IDF features.
y_pred_tfidf = xgb_tfidf.predict(X=test_data_vector_tfidf)

In [217]:
# Test-split metrics for the CountVectorizer model: accuracy, F1 and the
# per-class report.
print("---Default XGBoost with CountVectorizer---")
for metric_label, metric_fn in (
    ("Accuracy: ", accuracy_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(test_data["label"], y_pred_cv))
print("\nClassification Report\n")
cv_report = classification_report(test_data["label"], y_pred_cv)
print(cv_report)

---Default XGBoost with CountVectorizer---
Accuracy:  0.835
F1 Score:  0.6188118811881188

Classification Report

              precision    recall  f1-score   support

           0       0.86      0.94      0.89      2093
           1       0.74      0.53      0.62       707

    accuracy                           0.83      2800
   macro avg       0.80      0.73      0.76      2800
weighted avg       0.83      0.83      0.83      2800



In [218]:
# Test-split metrics for the TF-IDF model: accuracy, F1 and the per-class
# report.
print("---Default XGBoost with TF-IDF Vectorizer---")
for metric_label, metric_fn in (
    ("Accuracy: ", accuracy_score),
    ("F1 Score: ", f1_score),
):
    print(metric_label, metric_fn(test_data["label"], y_pred_tfidf))
print("\nClassification Report\n")
tfidf_report = classification_report(test_data["label"], y_pred_tfidf)
print(tfidf_report)

---Default XGBoost with TF-IDF Vectorizer---
Accuracy:  0.8414285714285714
F1 Score:  0.6487341772151899

Classification Report

              precision    recall  f1-score   support

           0       0.87      0.93      0.90      2093
           1       0.74      0.58      0.65       707

    accuracy                           0.84      2800
   macro avg       0.80      0.75      0.77      2800
weighted avg       0.83      0.84      0.83      2800



<h4> References </h4>
<ol>
    <li>https://suatatan.com/posts/sklearn_xgboost_tc/ </li>
    <li>https://towardsdatascience.com/text-classification-with-nlp-tf-idf-vs-word2vec-vs-bert-41ff868d1794 </li>
    <li>https://www.youtube.com/watch?v=lBO1L8pgR9s</li>
    <li>https://kavita-ganesan.com/how-to-use-countvectorizer/#.Yy6JpOxByw4 </li>
</ol>
