In [1]:
import csv
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import requests
import json

In [2]:
def semantic_hard_complaint_rule(chat):
    chat = str(chat)
    disappointed_words = ["kecewa", "tidak puas", "kurang puas", "brengsek", "bangsat", "brgsk", "bgst", "anjing"]
    for dw in disappointed_words:
        if dw in chat:
            return 1
    return 0

def formalizer(string):
    req = requests.post("http://127.0.0.1:9000/formalizer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('formalizer ' + str(response) + str(string))
        return None
    
def stemmer(string):
    req = requests.post("http://127.0.0.1:9000/stemmer", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('stemmer ' + str(response))
        return None
    
def stopwords_removal(string):
    req = requests.post("http://127.0.0.1:9000/stopwords", json= {"string": string})
    response = req.json()
    if response['status'] == 'success':
        return response['data']
    else:
        print('stopwords ' + str(response))
        return None

In [3]:
import pandas as pd
data = pd.read_csv("complaint_data_2k.csv")

In [4]:
chat_data = np.array(data['Chat'].values.tolist())
chat_label = np.array(data['Label'].values.tolist())

In [7]:
chat_data_normalized = []
for chat in chat_data:
    normalized = formalizer(chat)
    chat_data_normalized.append(stemmer(normalized))

chat_data_normalized = np.array(chat_data_normalized)

In [8]:
kf = KFold(n_splits= 10, shuffle= True)

## Only Using Rule-Based

In [9]:
## raw-data
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.7399378109452737


In [10]:
## raw-data
f1_score = []
for train_idx, test_idx in kf.split(chat_label):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    prediction = [semantic_hard_complaint_rule(x) for x in np.nditer(test_x)]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.8290796019900499


## Using LinearSVC

In [11]:
## raw-data, using TF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9541517412935324


In [12]:
## raw-data, using TF with normalizer
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9566417910447761


In [13]:
## raw-data, using TF with minimum occurence and normalizer
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2), min_df= 2)),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9481716417910449


In [14]:
## raw-data, using TF-IDF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9686069651741296


In [15]:
## raw-data, using TF-IDF and minimum occurence and normalizer
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2), min_df= 2)),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9700945273631841


In [16]:
## raw-data, using TF-IDF with normalizer 
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                        ("cls", LinearSVC())])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9721094527363185


## Using RandomForest

In [17]:
## raw-data, using TF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9416840796019901


In [18]:
## raw-data, using TF and normalizer
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9531691542288556


In [19]:
## raw-data, using TF and normalizer with minimum occurence
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2), min_df= 2)),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9546567164179104


In [20]:
## raw-data, using TF-IDF
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data[train_idx], chat_data[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9447014925373134


In [21]:
## raw-data, using TF-IDF with normalizer
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2))),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9546716417910449


In [22]:
## raw-data, using TF-IDF with normalizer and minimum occurence
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_data_normalized[train_idx], chat_data_normalized[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    
    pipeline = Pipeline([("enc", TfidfVectorizer(ngram_range=(1,2), min_df= 2)),
                    ("cls", RandomForestClassifier(n_estimators= 50))])
    pipeline.fit(train_x, train_y)
    
    prediction = [semantic_hard_complaint_rule(x) for x in test_x]
    for i, pred in enumerate(prediction):
        if pred != 1:
            prediction[i] = pipeline.predict([test_x[i]])[0]
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

Average f1-score: 0.9546517412935323
