In [1]:
import csv
import re
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
def semantic_hard_complaint_rule(chat):
    chat = str(chat)
    disappointed_words = ["kecewa", "tidak puas", "kurang puas", "brengsek", "bangsat", "brgsk", "bgst", "anjing"]
    for dw in disappointed_words:
        if dw in chat:
            return 1
    return 0

In [3]:
chat_opening = []
with open("opening_chat_for_fasttext.csv") as file:
    readCSV = csv.reader(file)
    for row in readCSV:
        chat = row[0].lower()
        chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
        chat_opening.append(chat.split(' '))

In [10]:
from gensim.models import FastText
from gensim.test.utils import common_texts

size_output = 128
model = FastText(chat_opening, size= size_output, window= 5)

In [11]:
import pandas as pd
data = pd.read_csv("complaint_data.csv")

In [12]:
chat_data = np.array(data['Chat'].values.tolist())
chat_label = np.array(data['Label'].values.tolist())

In [25]:
for i, chat in enumerate(chat_data):
    chat = chat.lower()
    chat = re.sub(r"\.|\?|,|#|!|\n", "", chat)
    chat = re.sub(r"[A-Z]*[0-9]+[A-Z]*", "", chat)
    chat_data[i] = chat
    
vect = TfidfVectorizer()
transformed = vect.fit_transform(chat_data)
index_value={i[1]:i[0] for i in vect.vocabulary_.items()}

fully_indexed = []
for row in transformed:
    fully_indexed.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

In [19]:
chat_encoded = []
for chat in chat_data:
    encoded_words = []
    for w in chat.split(' '):
        if len(w) > 1:
            try:
                encoded_words.append(model.wv[w])
            except:
                continue
    if len(encoded_words) > 0:
        chat_encoded.append(np.mean(encoded_words, axis= 0))
    else:
        chat_encoded.append(np.zeros(size_output))

chat_encoded = np.array(chat_encoded)

In [35]:
chat_tf_idf = []
for chat in chat_data:
    encoded_words = []
    weights = []
    sum = 0
    for w in chat.split(' '):
        if len(w) > 1:
            try:
                encoded_words.append(model.wv[w])
                if w in fully_indexed:
                    weights.append(fully_indexed[w])
                    sum += fully_indexed[w]
                else:
                    most_similar = model.wv.most_similar(positive=[w])[0][0]
                    print(fully_indexed[most_similar])
                    weights.append(fully_indexed[most_similar])
                    sum += fully_indexed[most_similar]
            except:
                continue
    if len(encoded_words) > 0:
        print((weights))
        print(len(encoded_words))
        print("-----------")
        chat_tf_idf.append(np.average(encoded_words, axis= 0, weights= weights))
    else:
        chat_tf_idf.append(np.zeros(size_output))

chat_tf_idf = np.array(chat_tf_idf)

[]
14
-----------


  if np.issubdtype(vec.dtype, np.int):


ValueError: Length of weights not compatible with specified axis.

In [14]:
kf = KFold(n_splits= 10, shuffle= True)

In [20]:
f1_score = []
for train_idx, test_idx in kf.split(chat_data):
    train_x, test_x = chat_encoded[train_idx], chat_encoded[test_idx]
    train_y, test_y = chat_label[train_idx], chat_label[test_idx]
    cls = LinearSVC()
#     print(train_x.shape)
    cls.fit(train_x, train_y)
    
    prediction = []
    for idx in test_idx:
        pred = semantic_hard_complaint_rule(chat_data[idx])
        if pred != 1:
            pred = cls.predict([chat_encoded[idx]])[0]
        prediction.append(pred)
#     print(prediction)
#     prediction = cls.predict(test_x)
#     print(classification_report(y_pred= prediction, y_true= test_y))
    f1_score.append(classification_report(y_pred= prediction, y_true= test_y, output_dict= True)['micro avg']['f1-score'])
    
print("Average f1-score: {}".format(np.mean(f1_score)))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        57
           1       0.94      1.00      0.97        46

   micro avg       0.97      0.97      0.97       103
   macro avg       0.97      0.97      0.97       103
weighted avg       0.97      0.97      0.97       103

              precision    recall  f1-score   support

           0       0.93      0.82      0.87        49
           1       0.85      0.94      0.89        54

   micro avg       0.88      0.88      0.88       103
   macro avg       0.89      0.88      0.88       103
weighted avg       0.89      0.88      0.88       103

              precision    recall  f1-score   support

           0       0.89      0.81      0.85        52
           1       0.82      0.90      0.86        51

   micro avg       0.85      0.85      0.85       103
   macro avg       0.86      0.85      0.85       103
weighted avg       0.86      0.85      0.85       103

              preci