In [179]:
import json
from collections import defaultdict, OrderedDict

import numpy as np
import pandas as pd
import seaborn as sns
from pathlib2 import Path
from bert_serving.client import BertClient
import fasttext
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize

from vocab import Language, HelpDeskType
from data import parse_labels

In [120]:
# Shared BERT client used by get_bert_embedding; built with the client's default settings.
# NOTE(review): presumably requires a bert-serving server to be reachable — confirm before Run All.
bc = BertClient()

In [22]:
def read_raw_data(data_dir, lang):
    """Load the raw train and test dialogue JSON files for every help desk.

    Args:
        data_dir: Directory (an object supporting the ``/`` path operator)
            containing ``train_data_cn_<desk>.json`` and
            ``test_data_cn_<desk>.json`` for each help-desk code.
        lang: Dataset language; only ``Language.chinese`` is supported.

    Returns:
        A tuple ``(help_desks_train, help_desks_test)``; each is a dict that
        maps a help-desk code ("CZ", "DX", "LT", "OT") to the parsed JSON
        content of the corresponding file.

    Raises:
        ValueError: If ``lang`` is not ``Language.chinese``. Previously the
            function fell through and returned ``None``, which only failed
            later when the caller unpacked the result.
    """
    help_desk_codes = ("CZ", "DX", "LT", "OT")
    help_desks_train = dict.fromkeys(help_desk_codes)
    help_desks_test = dict.fromkeys(help_desk_codes)

    if lang != Language.chinese:
        raise ValueError("Unsupported language: {!r}".format(lang))

    # Iterate over the local code list; the original looped over an undefined
    # global named `help_desks`, which only worked with leftover kernel state.
    for h in help_desk_codes:
        with open(data_dir / "train_data_cn_{}.json".format(h), 'r', encoding="utf-8") as train_fp:
            help_desks_train[h] = json.load(train_fp)
        with open(data_dir / "test_data_cn_{}.json".format(h), 'r', encoding="utf-8") as test_fp:
            help_desks_test[h] = json.load(test_fp)
    return help_desks_train, help_desks_test

In [29]:
def process_raw_data(raw_data, is_train):
    """Flatten raw dialogue dicts into per-dialogue tuples.

    Args:
        raw_data: Iterable of dialogue dicts with an "id", a list of "turns"
            (each having "sender" and "utterances") and, for training data,
            "annotations".
        is_train: When true, labels are parsed from the annotations and
            appended to each tuple.

    Returns:
        A list with one tuple per dialogue:
        ``(id, senders, texts)`` for test data, or
        ``(id, senders, texts, customer_nugget_label, helpdesk_nugget_label,
        quality_label)`` for training data.
    """
    def to_record(dialogue):
        sender_flags = []
        turn_texts = []
        for turn in dialogue["turns"]:
            # 1 when the sender string starts with "c", otherwise 0.
            sender_flags.append(int(turn["sender"].startswith("c")))
            # A turn may hold several utterances; join them with spaces.
            turn_texts.append(" ".join(turn["utterances"]))

        if not is_train:
            return (dialogue["id"], sender_flags, turn_texts)

        customer_nugget_label, helpdesk_nugget_label, quality_label = parse_labels(
            dialogue["annotations"], sender_flags)
        return (dialogue["id"],
                sender_flags,
                turn_texts,
                customer_nugget_label,
                helpdesk_nugget_label,
                quality_label)

    return [to_record(dialogue) for dialogue in raw_data]

In [259]:
# Notebook configuration: dataset directory, output directory and dataset language.
data_dir = Path("stc3dataset/data")
output_dir = Path("output")
lang = Language.chinese
# Raw (unprocessed) train and test dialogues as returned by read_raw_data.
help_desks_train, help_desks_test = read_raw_data(data_dir, lang)

In [37]:
# Convert every help desk's raw dialogues into tuples (labels included for training data only).
# NOTE(review): both names are rebound to the processed data, so this cell is not idempotent —
# re-running it would pass already-processed tuples back into process_raw_data.
help_desks_train = {k: process_raw_data(v, True) for k, v in help_desks_train.items()}
help_desks_test = {k: process_raw_data(v, False) for k, v in help_desks_test.items()}

### Not distinguishing help desks + BERT embeddings + no context + XGBoost

In [298]:
def split_to_c_and_h(df):
    """Split a per-turn frame by sender.

    Returns a pair ``(customer_rows, other_rows)``: rows whose "sender" equals
    "C", and all remaining rows. The original index is kept on both parts.
    """
    customer_mask = df["sender"] == "C"
    other_mask = df["sender"] != "C"
    customer_rows = df.loc[customer_mask]
    other_rows = df.loc[other_mask]
    return customer_rows, other_rows

def get_bert_embedding(df):
    """Encode the "text" column of ``df`` with the module-level BERT client ``bc``.

    Blank (empty or whitespace-only) texts are sent to the encoder as the
    placeholder "EMPTY" and their resulting rows are then overwritten with
    zeros, so a blank turn is represented by an all-zero vector.

    Args:
        df: DataFrame with a string "text" column.

    Returns:
        A float64 array of shape ``(len(df), embedding_dim)``. The embedding
        width is taken from the encoder output instead of being fixed at 768.
    """
    texts = df["text"].to_list()
    empty_indices = [i for i, t in enumerate(texts) if not t.strip()]

    # Fill empty strings with a placeholder before encoding
    # (presumably the encoder cannot take empty input — confirm).
    for i in empty_indices:
        texts[i] = "EMPTY"

    # Copy into a float64 array: matches the dtype the previous list round-trip
    # produced and avoids mutating whatever buffer the client returned.
    embeddings = np.array(bc.encode(texts), dtype=np.float64)

    print("Empty indices:", empty_indices)

    # Replace the placeholder embeddings with zero vectors of the right width.
    if empty_indices:
        embeddings[empty_indices] = 0.0

    return embeddings

# Output keys for customer ("C") turns and for all other turns, in prediction-column order.
_CUSTOMER_NUGGET_LABELS = ("CNUG0", "CNUG", "CNUG*", "CNaN")
_HELPDESK_NUGGET_LABELS = ("HNUG", "HNUG*", "HNaN")


def _to_distribution(scores):
    """Clip scores to [0, 1] and rescale them to sum to 1 (uniform if the sum is not positive)."""
    probs = np.clip(np.asarray(scores), 0, 1)
    total = probs.sum()
    if total > 0:
        return (probs / total).tolist()
    # Every score was clipped to 0 (or the sum is NaN): dividing would emit NaN,
    # which is not valid JSON, so fall back to a uniform distribution.
    count = probs.size
    return [1.0 / count] * count if count else []


def _merge_turn_predictions(df_test, customer_probs_at, helpdesk_probs_at):
    """Group turns by dialogue id and attach each turn's label distribution.

    ``customer_probs_at(k)`` / ``helpdesk_probs_at(k)`` must return the list of
    probabilities for the k-th customer / non-customer row of ``df_test``.
    """
    is_customer = np.asarray(df_test["sender"] == "C", dtype=bool)
    # Position of every row within its own sender subset, computed from the
    # row order of df_test itself. The predictions were produced from those
    # subsets, so this stays aligned even when groupby reorders dialogues.
    positions = np.where(is_customer,
                         np.cumsum(is_customer) - 1,
                         np.cumsum(~is_customer) - 1)
    turns = pd.DataFrame({
        "id": df_test["id"].to_numpy(),
        "sender": df_test["sender"].to_numpy(),
        "is_customer": is_customer,
        "position": positions,
    })

    merged = []
    # groupby sorts dialogues by id and keeps the row order inside each one.
    for dialogue_id, group in turns.groupby("id"):
        nuggets = []
        for from_customer, position in zip(group["is_customer"].tolist(),
                                           group["position"].tolist()):
            if from_customer:
                probs, labels = customer_probs_at(position), _CUSTOMER_NUGGET_LABELS
            else:
                probs, labels = helpdesk_probs_at(position), _HELPDESK_NUGGET_LABELS
            nuggets.append({label: probs[k] for k, label in enumerate(labels)})
        merged.append({
            "id": str(dialogue_id),
            "senders": group["sender"].tolist(),
            "nugget": nuggets,
        })
    return merged


def merge_c_and_h(df_test, c_test, h_test):
    """Merge per-target regression outputs into per-dialogue nugget distributions.

    Args:
        df_test: Per-turn frame with "id" and "sender" columns.
        c_test: Sequence with one prediction array per customer label
            (CNUG0, CNUG, CNUG*, CNaN), each indexed by customer-turn position.
        h_test: Same layout for the non-customer labels (HNUG, HNUG*, HNaN).

    Returns:
        A list of dicts, one per dialogue (sorted by id), with keys "id"
        (string), "senders" and "nugget" (one label->probability dict per
        turn). Scores are clipped to [0, 1] and normalised to sum to 1.
    """
    def customer_probs_at(position):
        return _to_distribution([target[position] for target in c_test])

    def helpdesk_probs_at(position):
        return _to_distribution([target[position] for target in h_test])

    return _merge_turn_predictions(df_test, customer_probs_at, helpdesk_probs_at)


def merge_c_and_h_cat(df_test, c_test, h_test):
    """Merge classifier probability matrices into per-dialogue nugget distributions.

    Args:
        df_test: Per-turn frame with "id" and "sender" columns.
        c_test: Array-like of shape (customer turns, 4) holding probabilities
            in the order CNUG0, CNUG, CNUG*, CNaN.
        h_test: Array-like of shape (non-customer turns, 3) holding
            probabilities in the order HNUG, HNUG*, HNaN.

    Returns:
        Same structure as ``merge_c_and_h``; the probabilities are used as
        given, without clipping or renormalisation.
    """
    def customer_probs_at(position):
        return np.asarray(c_test[position]).tolist()

    def helpdesk_probs_at(position):
        return np.asarray(h_test[position]).tolist()

    return _merge_turn_predictions(df_test, customer_probs_at, helpdesk_probs_at)

In [93]:
# Load the per-turn train/test tables and force the "text" column to str.
# NOTE(review): astype(str) turns missing text into the literal string "nan" — confirm this is intended.
df_train = pd.read_csv(data_dir / "df_train_cn.csv", encoding="utf-8").astype({'text': 'str'})
df_test = pd.read_csv(data_dir / "df_test_cn.csv", encoding="utf-8").astype({'text': 'str'})

In [130]:
# Separate rows whose sender is "C" from all other rows, for both train and test.
df_c_train, df_h_train = split_to_c_and_h(df_train)
df_c_test, df_h_test = split_to_c_and_h(df_test)

In [95]:
# Target matrices: one column per nugget label, one row per turn.
# The column order here fixes the label order used when predictions are merged later.
Y_h_train = df_h_train[["HNUG", "HNUG*", "HNaN"]].values
Y_c_train = df_c_train[["CNUG0", "CNUG", "CNUG*", "CNaN"]].values

[{'id': 11792090486, 'sender': ['C', 'H', 'C', 'H']},
 {'id': 3413387629431837, 'sender': ['C', 'H', 'C']},
 {'id': 3418165109705159, 'sender': ['C', 'H']},
 {'id': 3453944166480384, 'sender': ['C', 'H', 'C', 'H', 'C', 'H', 'C']},
 {'id': 3470299011884270, 'sender': ['C', 'H', 'C']},
 {'id': 3476768847913440, 'sender': ['C', 'H', 'C', 'H']},
 {'id': 3476892810205149, 'sender': ['C', 'H', 'C', 'H', 'C', 'H', 'C']},
 {'id': 3481107607724044, 'sender': ['C', 'H', 'C', 'H']},
 {'id': 3512239052883466, 'sender': ['C', 'H', 'C', 'H', 'C', 'H']},
 {'id': 3517638892555263, 'sender': ['C', 'H', 'C', 'H', 'C', 'H']},
 {'id': 3522021549603724, 'sender': ['C', 'H', 'C', 'H', 'C', 'H']},
 {'id': 3525416431833226, 'sender': ['C', 'H', 'C']},
 {'id': 3526760035640177, 'sender': ['C', 'H', 'C', 'H']},
 {'id': 3532553799247835, 'sender': ['C', 'H', 'C']},
 {'id': 3536694176921874, 'sender': ['C', 'H', 'C', 'H']},
 {'id': 3539168950594478, 'sender': ['C', 'H', 'C']},
 {'id': 3542788094283808, 'sender': 

In [124]:
# BERT features for the training turns (one embedding row per turn).
# NOTE(review): the recorded output shows a server "max_seq_len" warning — long texts may be truncated; confirm.
X_h_train = get_bert_embedding(df_h_train)
X_c_train = get_bert_embedding(df_c_train)

Empty indices: [6860]


here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


Empty indices: [5758]


In [131]:
# BERT features for the test turns, built the same way as the training features.
X_h_test = get_bert_embedding(df_h_test)
X_c_test = get_bert_embedding(df_c_test)

Empty indices: []
Empty indices: []


In [137]:
# Hyper-parameter grid with a single candidate per parameter.
# NOTE(review): in the visible cells this is only referenced by the commented-out GridSearchCV code.
param_grid = {
    "n_estimators": [100],
    "learning_rate": [0.1],
    "subsample": [1],
    "max_depth": [3]
}

In [211]:
# Sanity check: (rows in df_c_train, number of customer label columns).
Y_c_train.shape

(8500, 4)

In [212]:
# c_xgb = XGBClassifier(objective="multi:softprob")
# c_xgb.fit(X_c_train, Y_c_train)
# Y_c_test = c_xgb.predict_proba(X_c_test)

In [147]:
class XGBManager:
    """Fits one independent ``XGBRegressor`` per target column.

    Args:
        param: Optional mapping of keyword arguments forwarded to every
            ``XGBRegressor``. ``None`` (or an empty mapping) uses the library
            defaults, which is what the previous implementation always did
            because it ignored this argument.
        num_target: Number of target columns, i.e. number of regressors.
    """

    def __init__(self, param, num_target):
        self.num_target = num_target
        # Copy so later changes to the caller's mapping do not leak in.
        self.param = dict(param) if param else {}
        self.xgbs = [XGBRegressor(**self.param) for _ in range(self.num_target)]

    def train(self, X, Y):
        """Fit regressor ``i`` on column ``i`` of ``Y`` (shape: samples x targets)."""
        targets = np.asarray(Y)
        for i in range(self.num_target):
            self.xgbs[i].fit(X, targets[:, i])

    def predict(self, X):
        """Return a list with one prediction array per target, in target order."""
        return [self.xgbs[i].predict(X) for i in range(self.num_target)]

In [148]:
# One regressor per non-customer label column; fit on the training turns, then predict the test turns.
xgb_h = XGBManager(None, Y_h_train.shape[1])
xgb_h.train(X_h_train, Y_h_train)
# Y_h_test is a list with one prediction array per label column.
Y_h_test = xgb_h.predict(X_h_test)



In [154]:
# One regressor per customer label column, fit on the customer training turns.
xgb_c = XGBManager(None, Y_c_train.shape[1])
xgb_c.train(X_c_train, Y_c_train)



In [157]:
# Predict every customer label for the customer test turns (list with one array per label).
Y_c_test = xgb_c.predict(X_c_test)

In [151]:
# Stack the per-label prediction arrays to inspect the overall shape (labels, turns).
Y_h_test_new = np.asarray(Y_h_test)
Y_h_test_new.shape

(3, 780)

In [158]:
# Stack the per-label prediction arrays to inspect the overall shape (labels, turns).
Y_c_test_new = np.asarray(Y_c_test)
Y_c_test_new.shape

(4, 975)

In [262]:
# Combine the regression outputs of both sender groups into one record per dialogue.
Y = merge_c_and_h(df_test, Y_c_test, Y_h_test)

{'jsd': 0.20680088636048105, 'rnss': 0.32577494692649511}

In [264]:
# Write the regression-based predictions as UTF-8 JSON (non-ASCII characters are kept as-is).
json_name = "test_data_cn_XGB.json"
# Create the output directory first so a fresh checkout does not fail with FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / json_name, 'w', encoding="utf-8") as fp:
    json.dump(Y, fp, ensure_ascii=False)

In [None]:
# model = XGBClassifier()
# kfold = KFold(n_splits=5, random_state=RANDOM_SEED)
# cross_validate
# grid = GridSearchCV(estimator=model, 
#                     param_grid=param_grid, 
#                     n_jobs=-1, 
#                     cv=kfold, 
#                     scoring=["f1", "accuracy"], 
#                     refit="f1"
#                    )
# grid_result = grid.fit(Y_h_train, Y_h_train)

In [266]:
# Collapse each customer label row to the index of its largest value, giving hard class labels.
Y_c_train_cat = np.argmax(Y_c_train, axis=1)

In [284]:
# Multi-class classifier on the customer turns; predict_proba yields one probability row per test turn.
model_c = XGBClassifier(objective="multi:softprob")
model_c.fit(X_c_train, Y_c_train_cat)
Y_c_test_cat = model_c.predict_proba(X_c_test)

In [287]:
# Collapse each non-customer label row to the index of its largest value, giving hard class labels.
Y_h_train_cat = np.argmax(Y_h_train, axis=1)

In [288]:
# Multi-class classifier on the non-customer turns; predict_proba yields one probability row per test turn.
model_h = XGBClassifier(objective="multi:softprob")
model_h.fit(X_h_train, Y_h_train_cat)
Y_h_test_cat = model_h.predict_proba(X_h_test)

In [299]:
# Combine the classifier probabilities of both sender groups into one record per dialogue.
Y_cat = merge_c_and_h_cat(df_test, Y_c_test_cat, Y_h_test_cat)

{'jsd': 0.28531068912190655, 'rnss': 0.40621648041542596}

In [300]:
# Write the classifier-based predictions as UTF-8 JSON (non-ASCII characters are kept as-is).
json_name = "test_data_cn_XGB_cat.json"
# Create the output directory first so a fresh checkout does not fail with FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / json_name, 'w', encoding="utf-8") as fp:
    json.dump(Y_cat, fp, ensure_ascii=False)