In [20]:
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
def semantic_hard_complaint_rule(chat):
    disappointed_words = ["kecewa", "tidak puas", "kurang puas", "brengsek", "bangsat", "brgsk", "bgst", "anjing"]
    for dw in disappointed_words:
        if dw in chat:
            return 1
    return 0

In [3]:
import pandas as pd
data = pd.read_csv("complaint_data.csv")

In [4]:
chat_data = data['Chat'].values.tolist()
chat_label = data['Label'].values.tolist()

In [36]:
train_x, test_x, train_y, test_y = train_test_split(chat_data, chat_label, test_size= 0.1)

## Using LinearSVC

In [37]:
pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                    ("cls", LinearSVC())])

pipeline.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('enc', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [38]:
prediction = [semantic_hard_complaint_rule(x) for x in test_x]
for i, pred in enumerate(prediction):
    if pred != 1:
        prediction[i] = pipeline.predict([test_x[i]])[0]
print(classification_report(y_pred= prediction, y_true= test_y))
# prediction

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        45
           1       0.97      1.00      0.98        58

   micro avg       0.98      0.98      0.98       103
   macro avg       0.98      0.98      0.98       103
weighted avg       0.98      0.98      0.98       103



## Using RandomForest

In [45]:
pipeline = Pipeline([("enc", CountVectorizer(ngram_range=(1,2))),
                    ("cls", RandomForestClassifier(n_estimators= 50))])

pipeline.fit(train_x, train_y)

Pipeline(memory=None,
     steps=[('enc', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [46]:
prediction = [semantic_hard_complaint_rule(x) for x in test_x]
for i, pred in enumerate(prediction):
    if pred != 1:
        prediction[i] = pipeline.predict([test_x[i]])[0]
print(classification_report(y_pred= prediction, y_true= test_y))
# prediction

              precision    recall  f1-score   support

           0       1.00      0.93      0.97        45
           1       0.95      1.00      0.97        58

   micro avg       0.97      0.97      0.97       103
   macro avg       0.98      0.97      0.97       103
weighted avg       0.97      0.97      0.97       103

