In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
root_path = "drive/MyDrive/IF/Sem7/NLP/2/"

## Libraries

In [41]:
! pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
import nltk
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import sys
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval

In [43]:
with open(f"{root_path}slang.txt") as f:
    data = f.read()
slang_words = json.loads(data)

## Reading the Data

In [44]:
def tokenize(sent):
    tokens = sent.split()
    tokens = list(filter(lambda token: len(token) > 1, tokens))
    return tokens

def netralize_token(sent):
    for i, word in enumerate(sent):
      if word in slang_words:
        sent[i] = slang_words[word]
    return sent

In [45]:
# Split label and features
def split_dataframe(df):
    df_features = df.loc[:, "text_a"].str.lower()

    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()

    df_features = df_features.apply(lambda x: stopword.remove(x))
    df_features = df_features.apply(lambda x: tokenize(x))
    df_features = df_features.apply(lambda x: netralize_token(x))

    return df_features, df.loc[:, "label"]

## Function

In [46]:
def join_array_to_sentence(df):
  for i, list_words in enumerate(df):
    s = ' '.join(map(str, list_words))
    df[i] = s
  return df

### Training Data

In [47]:
df_train = pd.read_csv(f"{root_path}train.csv")
df_train.drop("Unnamed: 0", axis=1, inplace=True)
df_train

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,no
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,no
2,e100ss gini buka informasi sejelas nya identit...,yes
3,neng solo wes ono terduga corona cobo neng ati...,no
4,midiahn nii akun gak takut takut nya isu coron...,no
...,...,...
21596,depok panas ga karuan kereta sampe pasming huj...,no
21597,oxfara arie kriting yg lebi goblo nya orang ke...,no
21598,virus corona menyaba depok cuci tangan makan n...,no
21599,mata sipit tinggal depok udah abis dah bahan c...,no


In [48]:
X_train, y_train = split_dataframe(df_train)

### Validation Data

In [49]:
df_val = pd.read_csv(f"{root_path}dev.csv")
df_val

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [50]:
X_val, y_val = split_dataframe(df_val)

### Test Data

In [51]:
df_test = pd.read_csv(f"{root_path}test.csv")
df_test

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,no
1,detikcom untung depok masuk wilayah nya ridwan...,no
2,df dom jakarta depok yg gunain vc cabang nya c...,no
3,your2rl depok jkt,no
4,doakan indonesia selamat virus corona pkb depo...,yes
...,...,...
2795,ku tenang2 bae ku sih ya corona nya ga depok k...,no
2796,guru hati hati ya virus corona uda indonesia t...,yes
2797,4 terawan menyebut virus corona indonesia terd...,yes
2798,realffk buhari can t pronounce corona virus,no


In [52]:
X_test, y_test = split_dataframe(df_test)

In [53]:
X_train = join_array_to_sentence(X_train)
X_test = join_array_to_sentence(X_test)

## Modelling

In [54]:
# fit model no training data

cv = CountVectorizer(
      max_features=5000, 
      encoding="utf-8", 
      ngram_range = (1,3), 
      token_pattern = "[A-Za-z_][A-Za-z\d_]*"
)
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.fit_transform(X_test).toarray()

y_train = df_train['label']
y_train = y_train.replace({"no": 0, "yes": 1})
y_test = df_test['label']
y_test = y_test.replace({"no": 0, "yes": 1})

In [59]:
# Define the search space
param_grid = { 
    # Percentage of columns to be randomly samples for each tree.
    "colsample_bytree": [ 0.3, 0.5 , 0.8 ],
    # reg_alpha provides l1 regularization to the weight, higher values result in more conservative models
    "reg_alpha": [0, 0.5, 1, 5],
    # reg_lambda provides l2 regularization to the weight, higher values result in more conservative models
    "reg_lambda": [0, 0.5, 1, 5]
    }
# Set up score
scoring = ['recall', 'accuracy', 'precision']
# Set up the k-fold cross-validation
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [60]:
grid_search = GridSearchCV(estimator=XGBClassifier(), 
                           param_grid=param_grid, 
                           scoring=scoring, 
                           refit='accuracy', 
                           n_jobs=-1, 
                           cv=kfold, 
                           verbose=0)
# Fit grid search
grid_result = grid_search.fit(X_train, y_train)
# Print grid search summary
grid_result
# Print the best score and the corresponding hyperparameters
print(f'The best score is {grid_result.best_score_:.4f}')
print('The best score standard deviation is', round(grid_result.cv_results_['std_test_recall'][grid_result.best_index_], 4))
print(f'The best hyperparameters are {grid_result.best_params_}')



The best score is 0.8136
The best score standard deviation is 0.0112
The best hyperparameters are {'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1}


### Objective Function

In [63]:
clf = XGBClassifier(colsample_bytree=grid_result.best_params_['colsample_bytree'], reg_alpha=grid_result.best_params_['reg_alpha'], reg_lambda=grid_result.best_params_['reg_lambda'])
clf.fit(X_train, y_train)

# make predictions for test data
y_pred = clf.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


precision = precision_score(y_test, predictions,  average='macro')
print("Precision: %.2f%%" % (precision * 100.0))

recall = recall_score(y_test, predictions,  average='macro')
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 74.57%
Precision: 37.35%
Recall: 49.88%
