In [1]:
import csv
import json

import numpy as np
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score as sk_f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC

In [2]:
def semantic_hard_complaint_rule(chat):
    """Flag a chat as a hard complaint when it contains a trigger keyword.

    The input is converted with ``str()`` and lower-cased before matching, so
    capitalised variants such as "Kecewa" or "KECEWA" are caught as well.
    Matching is plain substring containment, not whole-word matching.

    Parameters
    ----------
    chat : any
        The chat message; non-string values are stringified first.

    Returns
    -------
    int
        1 if any keyword occurs in the text, otherwise 0.
    """
    disappointed_words = ("kecewa", "tidak puas", "kurang puas", "brengsek",
                          "bangsat", "brgsk", "bgst", "anjing")
    # All keywords are lower-case, so compare against a lower-cased copy.
    text = str(chat).lower()
    return int(any(word in text for word in disappointed_words))

def formalizer(string, url="http://127.0.0.1:9000/formalizer", timeout=30):
    """Send a string to the formalizer HTTP service and return its result.

    Parameters
    ----------
    string : str
        Text to submit; posted as the JSON body ``{"string": string}``.
    url : str, optional
        Endpoint of the formalizer service. Defaults to the local instance.
    timeout : float or None, optional
        Seconds to wait for the service before ``requests`` raises a timeout
        error. Pass ``None`` to wait indefinitely.

    Returns
    -------
    The ``'data'`` field of the JSON reply when its ``'status'`` is
    ``'success'``; otherwise ``None`` (the reply and input are printed).

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    # Without a timeout a stalled service would block the notebook forever.
    response = requests.post(url, json={"string": string}, timeout=timeout)
    payload = response.json()
    # .get() routes a reply lacking 'status' to the failure branch below
    # instead of raising KeyError.
    if payload.get('status') == 'success':
        return payload['data']
    print('formalizer ' + str(payload) + str(string))
    return None

In [23]:
# NOTE(review): pandas is imported in this cell rather than in the first
# import cell; consider moving it there so all dependencies are visible up front.
import pandas as pd
# Load the dataset from the working directory; the following cell reads its
# 'Chat' and 'Label' columns.
data = pd.read_csv("complaint_data.csv")

In [24]:
# Extract the text and label columns as NumPy arrays so the evaluation cells
# can index them with the fold index arrays produced by kf.split().
chat_data, chat_label = (
    np.array(data[column].values.tolist()) for column in ("Chat", "Label")
)

In [25]:
# Run every chat through the formalizer service, one request per chat.
normalized_chats = []
for chat in chat_data:
    normalized = formalizer(chat)
    # formalizer() returns None when the service reports a failure; keep the
    # raw chat in that case so no None entries reach the vectorizers later on.
    normalized_chats.append(chat if normalized is None else normalized)

chat_data_normalized = np.array(normalized_chats)

In [6]:
# Fixed random_state: every kf.split() call below yields the same 10 shuffled
# folds, so the experiment cells are reproducible and scored on identical splits.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

## Only Using Rule-Based

In [30]:
## raw chats, keyword rule only (no model is trained, so only the test split is used)
f1_score = []
for _, test_idx in kf.split(chat_data):
    test_x, test_y = chat_data[test_idx], chat_label[test_idx]

    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.7656577193984389


In [31]:
## normalized chats, keyword rule only (no model is trained, so only the test split is used)
f1_score = []
for _, test_idx in kf.split(chat_data_normalized):
    test_x, test_y = chat_data_normalized[test_idx], chat_label[test_idx]

    # Iterate the array directly; np.nditer is unnecessary for a 1-D array and
    # rejects object-dtype arrays unless given extra flags.
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.7656101275461641


## Using LinearSVC

In [35]:
## raw chats, TF (term counts, uni+bigrams), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2))),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9591471540072339


In [42]:
## normalized chats, TF (term counts, uni+bigrams), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2))),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9562821245002855


In [43]:
## normalized chats, TF with minimum occurrence (min_df=2), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2), min_df=2)),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9513516086046069


In [35]:
## raw chats, TF-IDF (uni+bigrams), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2))),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9620692937369123


In [39]:
## normalized chats, TF-IDF with minimum occurrence (min_df=2), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9649628783552255


In [40]:
## normalized chats, TF-IDF (uni+bigrams), LinearSVC with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2))),
                         ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9640491147915476


## Using RandomForest

In [37]:
## raw chats, TF (term counts, uni+bigrams), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2))),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.947449076718066


In [44]:
## normalized chats, TF (term counts, uni+bigrams), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2))),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9464972396725682


In [45]:
## normalized chats, TF with minimum occurrence (min_df=2), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1, 2), min_df=2)),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9484009137635636


In [38]:
## raw chats, TF-IDF (uni+bigrams), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2))),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9474585950885208


In [46]:
## normalized chats, TF-IDF (uni+bigrams), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2))),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9474776318294309


In [47]:
## normalized chats, TF-IDF with minimum occurrence (min_df=2), RandomForest with keyword-rule override
f1_score = []
for train_idx, test_idx in kf.split(chat_data_normalized):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]

    # random_state pins the forest's randomness so reruns give the same score.
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1, 2), min_df=2)),
                         ("cls", RandomForestClassifier(n_estimators=50, random_state=42))])
    pipeline.fit(train_x, train_y)

    # Predict the whole fold in one call, then force class 1 wherever the
    # keyword rule fires; every other row keeps the model's prediction.
    rule_hits = np.array([semantic_hard_complaint_rule(x) for x in test_x])
    prediction = np.where(rule_hits == 1, 1, pipeline.predict(test_x))
    # Micro-averaged F1 computed directly: recent scikit-learn releases do not
    # include a 'micro avg' entry in classification_report for single-label data.
    f1_score.append(sk_f1_score(test_y, prediction, average="micro"))

print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9513420902341518
