In [12]:
import csv
import re
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
def semantic_hard_complaint_rule(chat):
    chat = str(chat)
    disappointed_words = ["kecewa", "tidak puas", "kurang puas", "brengsek", "bangsat", "brgsk", "bgst", "anjing"]
    for dw in disappointed_words:
        if dw in chat:
            return 1
    return 0

In [3]:
chat_opening = []
with open("opening_chat_for_fasttext.csv") as file:
    readCSV = csv.reader(file)
    for row in readCSV:
        chat = row[0].lower()
        chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
        chat_opening.append(chat.split(' '))

In [15]:
from gensim.models import FastText
from gensim.test.utils import common_texts

size_output = 256
model = FastText(chat_opening, size= size_output, window= 8)

In [86]:
import pandas as pd
data = pd.read_csv("complaint_data_2k.csv")

In [87]:
chat_data = np.array(data['Chat'].values.tolist())
chat_label = np.array(data['Label'].values.tolist())

In [88]:
for i, chat in enumerate(chat_data):
    chat = chat.lower()
    chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
    chat = re.sub(r"[A-Z]*[0-9]+[A-Z]*", "", chat)
    chat_data[i] = chat
    
vect = TfidfVectorizer()
vect.fit(chat_data)
feature_names = vect.get_feature_names()
feature_map = {feat:idx for idx, feat in enumerate(feature_names)}

In [89]:
def splitter(sentence):
    return re.findall(r"[\w']+", sentence)

def oov_handler(word_list, fasttext, word_dict):
    return_list = []
    for w in word_list:
        if w not in word_dict:
            try:
                most_similar = fasttext.wv.most_similar(positive=[w])[0]
                if most_similar[1] > 0.9:
                    return_list.append(most_similar[0])
            except:
                continue
        else:
            return_list.append(w)
    return return_list

def sentence_maker(word_list):
    sentence = ""
    for w in word_list:
        sentence += w + " "
    return sentence

In [90]:
chat_encoded = []
for chat in chat_data:
    encoded_words = []
    for w in chat.split(' '):
        if len(w) > 1:
            try:
                encoded_words.append(model.wv[w])
            except:
                continue
    if len(encoded_words) > 0:
        chat_encoded.append(np.mean(encoded_words, axis= 0))
    else:
        chat_encoded.append(np.zeros(size_output))

chat_encoded = np.array(chat_encoded)

In [91]:
chat_tf_idf = []
for chat in chat_data:
    encoded_words = []
    weights = []
    
    word_list = oov_handler(splitter(chat), model, feature_map.keys())
    tf_idf_vect = vect.transform([sentence_maker(word_list)])
    for w in set(word_list):
        try:
            weight, enc = tf_idf_vect[0, feature_map[w]], model.wv[w] 
            weights.append(weight)
            encoded_words.append(enc)
        except:
            continue
            
    if len(encoded_words) > 0:
        chat_tf_idf.append(np.average(encoded_words, axis= 0, weights= weights))
    else:
        chat_tf_idf.append(np.zeros(size_output))

chat_tf_idf = np.array(chat_tf_idf)

  if np.issubdtype(vec.dtype, np.int):


In [92]:
kf = KFold(n_splits= 10, shuffle= True)

## Mean (not using weight)

In [93]:
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_encoded[train_idx], chat_encoded[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    cls = LinearSVC()
#     print(train_x.shape)
    cls.fit(train_x, train_y)
    
    prediction = []
    for idx in test_idx:
#         pred = semantic_hard_complaint_rule(chat_data[idx])
#         if pred != 1:
        pred = cls.predict([chat_encoded[idx]])[0]
        prediction.append(pred)
#     print(prediction)
#     prediction = cls.predict(test_x)
    print(classification_report(y_pred= prediction, y_true= test_y))
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

              precision    recall  f1-score   support

           0       0.87      0.80      0.83       112
           1       0.77      0.84      0.81        89

   micro avg       0.82      0.82      0.82       201
   macro avg       0.82      0.82      0.82       201
weighted avg       0.82      0.82      0.82       201

              precision    recall  f1-score   support

           0       0.89      0.88      0.88        91
           1       0.90      0.91      0.90       110

   micro avg       0.90      0.90      0.90       201
   macro avg       0.89      0.89      0.89       201
weighted avg       0.90      0.90      0.90       201

              precision    recall  f1-score   support

           0       0.91      0.90      0.91       115
           1       0.87      0.88      0.88        86

   micro avg       0.90      0.90      0.90       201
   macro avg       0.89      0.89      0.89       201
weighted avg       0.90      0.90      0.90       201

              preci

## Average using weight from tf-idf

In [94]:
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_tf_idf[train_idx], chat_tf_idf[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    cls = LinearSVC()
#     print(train_x.shape)
    cls.fit(train_x, train_y)
    
    prediction = []
    for idx in test_idx:
#         pred = semantic_hard_complaint_rule(chat_data[idx])
#         if pred != 1:
        pred = cls.predict([chat_tf_idf[idx]])[0]
        prediction.append(pred)
#     print(prediction)
#     prediction = cls.predict(test_x)
    print(classification_report(y_pred= prediction, y_true= test_y))
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83       102
           1       0.80      0.90      0.85        99

   micro avg       0.84      0.84      0.84       201
   macro avg       0.85      0.84      0.84       201
weighted avg       0.85      0.84      0.84       201

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       102
           1       0.89      0.92      0.91        99

   micro avg       0.91      0.91      0.91       201
   macro avg       0.91      0.91      0.91       201
weighted avg       0.91      0.91      0.91       201

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       102
           1       0.88      0.90      0.89        99

   micro avg       0.89      0.89      0.89       201
   macro avg       0.89      0.89      0.89       201
weighted avg       0.89      0.89      0.89       201

              preci



              precision    recall  f1-score   support

           0       0.86      0.81      0.84       100
           1       0.82      0.87      0.85       101

   micro avg       0.84      0.84      0.84       201
   macro avg       0.84      0.84      0.84       201
weighted avg       0.84      0.84      0.84       201

              precision    recall  f1-score   support

           0       0.90      0.83      0.87       112
           1       0.80      0.89      0.84        88

   micro avg       0.85      0.85      0.85       200
   macro avg       0.85      0.86      0.85       200
weighted avg       0.86      0.85      0.86       200

              precision    recall  f1-score   support

           0       0.86      0.79      0.82       113
           1       0.75      0.84      0.79        87

   micro avg       0.81      0.81      0.81       200
   macro avg       0.81      0.81      0.81       200
weighted avg       0.82      0.81      0.81       200

              preci