In [2]:
import csv
import re
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [4]:
chat_opening = []
with open("opening_chat_for_fasttext.csv") as file:
    readCSV = csv.reader(file)
    for row in readCSV:
        chat = row[0].lower()
        chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
        chat_opening.append(chat.split(' '))

In [5]:
from gensim.models import FastText
from gensim.test.utils import common_texts

size_output = 256
model = FastText(chat_opening, size= size_output, window= 8)

In [21]:
import pandas as pd
data = pd.read_csv("complaint_data_2k.csv")

In [22]:
chat_data = np.array(data['Chat'].values.tolist())
chat_label = np.array(data['Label'].values.tolist())

In [23]:
for i, chat in enumerate(chat_data):
    chat = chat.lower()
    chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
    chat = re.sub(r"[A-Z]*[0-9]+[A-Z]*", "", chat)
    chat_data[i] = chat

In [24]:
chat_encoded = []
for chat in chat_data:
    encoded_words = []
    for w in chat.split(' '):
        if len(w) > 1:
            try:
                encoded_words.append(model.wv[w])
            except:
                continue
    if len(encoded_words) > 0:
        chat_encoded.append(np.mean(encoded_words, axis= 0))
    else:
        chat_encoded.append(np.zeros(size_output))

chat_encoded = np.array(chat_encoded)

In [25]:
kf = KFold(n_splits= 10, shuffle= True)

## XGBoost

In [26]:
## raw-data, using TF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                        ("cls", XGBClassifier())])
    pipeline.fit(train_x, train_y)
    
    prediction = pipeline.predict(test_x)
#     for i, pred in enumerate(prediction):
#         if pred != 1:
#             prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9292587064676617


In [27]:
## raw-data, using TF-IDF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                        ("cls", XGBClassifier())])
    pipeline.fit(train_x, train_y)
    
    prediction = pipeline.predict(test_x)
#     for i, pred in enumerate(prediction):
#         if pred != 1:
#             prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9212860696517412


## Voting Classifier

In [13]:
def semantic_hard_complaint_rule(chat):
    chat = str(chat)
    disappointed_words = ["kecewa", "tidak puas", "kurang puas", "brengsek", "bangsat", "brgsk", "bgst", "anjing"]
    for dw in disappointed_words:
        if dw in chat:
            return 1
    return 0

In [32]:
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_enc_x, test_enc_x = chat_encoded[train_idx], chat_encoded[test_idx]
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    svc_ft = LinearSVC()
    svc_ft.fit(train_enc_x, train_y)
    
    svc = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                        ("cls", LinearSVC())])
    svc.fit(train_x, train_y)

    prediction = []
    for idx in test_idx:
#         pred = semantic_hard_complaint_rule(chat_data[idx])
#         if pred != 1:
        pred = [svc_ft.predict([chat_encoded[idx]])[0],
               svc.predict([chat_data[idx]])[0],
               semantic_hard_complaint_rule(chat_data[idx])]
        
        ensemble_pred = max(set(pred), key=pred.count)
        prediction.append(ensemble_pred)
#     print(prediction)
#     prediction = cls.predict(test_x)
    print(classification_report(y_pred= prediction, y_true= test_y))
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       110
           1       0.98      0.93      0.96        91

   micro avg       0.96      0.96      0.96       201
   macro avg       0.96      0.96      0.96       201
weighted avg       0.96      0.96      0.96       201

              precision    recall  f1-score   support

           0       0.95      0.98      0.96       111
           1       0.98      0.93      0.95        90

   micro avg       0.96      0.96      0.96       201
   macro avg       0.96      0.96      0.96       201
weighted avg       0.96      0.96      0.96       201

              precision    recall  f1-score   support

           0       0.89      0.98      0.94       104
           1       0.98      0.88      0.92        97

   micro avg       0.93      0.93      0.93       201
   macro avg       0.94      0.93      0.93       201
weighted avg       0.93      0.93      0.93       201





              precision    recall  f1-score   support

           0       0.94      0.99      0.96       117
           1       0.99      0.90      0.94        84

   micro avg       0.96      0.96      0.96       201
   macro avg       0.96      0.95      0.95       201
weighted avg       0.96      0.96      0.95       201





              precision    recall  f1-score   support

           0       0.91      0.99      0.95       104
           1       0.99      0.90      0.94        97

   micro avg       0.95      0.95      0.95       201
   macro avg       0.95      0.94      0.94       201
weighted avg       0.95      0.95      0.95       201

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       103
           1       0.99      0.86      0.92        98

   micro avg       0.93      0.93      0.93       201
   macro avg       0.93      0.92      0.92       201
weighted avg       0.93      0.93      0.92       201

              precision    recall  f1-score   support

           0       0.91      1.00      0.96        96
           1       1.00      0.91      0.96       105

   micro avg       0.96      0.96      0.96       201
   macro avg       0.96      0.96      0.96       201
weighted avg       0.96      0.96      0.96       201

              preci