In [1]:
import pretrait_tools as pt
import feature_engineering as fe  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import pandas as pd
import nltk
from nltk.corpus import stopwords

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier

# every positive label is a separate column
onehot_cols = ['CQ','FD','FQ','GG','IR','JK','NF','O','OQ','PA','PF','RQ']
mlb = MultiLabelBinarizer(classes=onehot_cols)

def load_dataset(path):
    labels = []
    features = []
    with open(path, "r") as f:
        for line in f:
            if line.strip():
                parts = line.strip().split()
                raw_labels = parts[0].split('_')  
                feat = list(map(float, parts[1:]))
                labels.append(raw_labels)
                features.append(feat)
    # transform labels to one-hot encoding
    y = mlb.fit_transform(labels)
    X = np.array(features)
    return X, y


X_train, y_train = load_dataset("../data/msdialog/train_features.tsv")
X_valid, y_valid = load_dataset("../data/msdialog/valid_features.tsv")
X_test, y_test   = load_dataset("../data/msdialog/test_features.tsv")

# construct a multi-label random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
clf = MultiOutputClassifier(rf)

# train
clf.fit(X_train, y_train)

# validset prediction and results
y_valid_pred = clf.predict(X_valid)
print("Validation Classification Report:")
print(classification_report(y_valid, y_valid_pred, target_names=onehot_cols, zero_division=0))

# testset prediction and results
y_test_pred = clf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=onehot_cols, zero_division=0))


Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.58      0.35      0.44       193
          FQ       0.36      0.08      0.13        65
          GG       0.63      0.46      0.54        41
          IR       0.67      0.24      0.35        84
          JK       1.00      0.08      0.15        12
          NF       0.50      0.02      0.04        47
           O       0.00      0.00      0.00         2
          OQ       0.98      0.97      0.97       221
          PA       0.82      0.86      0.84       365
          PF       0.71      0.41      0.52        97
          RQ       0.29      0.04      0.07        49

   micro avg       0.80      0.55      0.65      1242
   macro avg       0.54      0.29      0.34      1242
weighted avg       0.69      0.55      0.58      1242
 samples avg       0.68      0.61      0.63      1242

Test Classification Report:
              pre

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

# ada boost classifier
base_estimator = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.8,
    random_state=42
)
clf = MultiOutputClassifier(base_estimator)

clf.fit(X_train, y_train)

y_valid_pred = clf.predict(X_valid)
print("Validation Classification Report:")
print(classification_report(y_valid, y_valid_pred, target_names=onehot_cols, zero_division=0))

y_test_pred = clf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=onehot_cols, zero_division=0))




Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.53      0.34      0.41       193
          FQ       0.50      0.12      0.20        65
          GG       0.71      0.49      0.58        41
          IR       0.54      0.15      0.24        84
          JK       0.33      0.08      0.13        12
          NF       0.27      0.06      0.10        47
           O       0.00      0.00      0.00         2
          OQ       0.98      0.97      0.97       221
          PA       0.79      0.86      0.82       365
          PF       0.69      0.49      0.57        97
          RQ       0.57      0.08      0.14        49

   micro avg       0.76      0.56      0.64      1242
   macro avg       0.49      0.30      0.35      1242
weighted avg       0.66      0.56      0.58      1242
 samples avg       0.67      0.62      0.63      1242

Test Classification Report:
              pre

In [3]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# SVM
clf = OneVsRestClassifier(LinearSVC(random_state=42, max_iter=5000))

clf.fit(X_train, y_train)

y_valid_pred = clf.predict(X_valid)
print("Validation Classification Report:")
print(classification_report(y_valid, y_valid_pred, target_names=onehot_cols, zero_division=0))

y_test_pred = clf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=onehot_cols, zero_division=0))


Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.56      0.31      0.39       193
          FQ       0.00      0.00      0.00        65
          GG       0.58      0.27      0.37        41
          IR       0.43      0.04      0.07        84
          JK       0.00      0.00      0.00        12
          NF       1.00      0.02      0.04        47
           O       0.00      0.00      0.00         2
          OQ       0.97      0.97      0.97       221
          PA       0.76      0.87      0.81       365
          PF       0.82      0.46      0.59        97
          RQ       0.00      0.00      0.00        49

   micro avg       0.79      0.52      0.63      1242
   macro avg       0.43      0.24      0.27      1242
weighted avg       0.63      0.52      0.54      1242
 samples avg       0.65      0.58      0.60      1242

Test Classification Report:
              pre

In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# GaussianNB
clf = OneVsRestClassifier(GaussianNB())

clf.fit(X_train, y_train)

y_valid_pred = clf.predict(X_valid)
print("Validation Classification Report:")
print(classification_report(y_valid, y_valid_pred, target_names=onehot_cols, zero_division=0))

y_test_pred = clf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=onehot_cols, zero_division=0))


Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.22      0.33      0.27        66
          FD       0.41      0.32      0.36       193
          FQ       0.26      0.28      0.27        65
          GG       0.15      0.95      0.25        41
          IR       0.21      0.74      0.32        84
          JK       0.03      1.00      0.05        12
          NF       0.16      0.32      0.21        47
           O       0.01      1.00      0.01         2
          OQ       0.93      0.97      0.95       221
          PA       0.69      0.87      0.77       365
          PF       0.32      0.81      0.46        97
          RQ       0.24      0.59      0.35        49

   micro avg       0.31      0.70      0.43      1242
   macro avg       0.30      0.68      0.36      1242
weighted avg       0.52      0.70      0.57      1242
 samples avg       0.43      0.74      0.50      1242

Test Classification Report:
              pre

In [6]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.neighbors import NearestNeighbors

class MyMultiLabelKNN:
    def __init__(self, k=10, threshold=0.5):
        self.k = k
        self.threshold = threshold

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.nn = NearestNeighbors(n_neighbors=self.k)
        self.nn.fit(X)

    def predict(self, X_test):
        distances, indices = self.nn.kneighbors(X_test)
        y_pred = []

        for neighbors in indices:
            neighbor_labels = self.y_train[neighbors]
            label_counts = np.sum(neighbor_labels, axis=0)
            label_pred = (label_counts / self.k) >= self.threshold
            y_pred.append(label_pred.astype(int))

        return np.array(y_pred)
    
clf = MyMultiLabelKNN(k=10, threshold=0.5)
clf.fit(X_train, y_train)

y_valid_pred = clf.predict(X_valid)
print("Validation Classification Report:")
print(classification_report(y_valid, y_valid_pred, target_names=onehot_cols, zero_division=0))

y_test_pred = clf.predict(X_test)
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=onehot_cols, zero_division=0))

Validation Classification Report:
              precision    recall  f1-score   support

          CQ       0.00      0.00      0.00        66
          FD       0.43      0.21      0.28       193
          FQ       0.60      0.05      0.09        65
          GG       0.62      0.24      0.35        41
          IR       0.41      0.11      0.17        84
          JK       0.20      0.08      0.12        12
          NF       0.00      0.00      0.00        47
           O       0.00      0.00      0.00         2
          OQ       0.96      0.87      0.91       221
          PA       0.66      0.85      0.74       365
          PF       0.71      0.51      0.59        97
          RQ       0.00      0.00      0.00        49

   micro avg       0.69      0.50      0.58      1242
   macro avg       0.38      0.24      0.27      1242
weighted avg       0.57      0.50      0.50      1242
 samples avg       0.60      0.55      0.56      1242

Test Classification Report:
              pre

In [None]:
# Feature Group Evaluation
def evaluate_feature_group(X_train, y_train, X_valid, y_valid, indices, group_name):
    X_train_sub = X_train[:, indices]
    X_valid_sub = X_valid[:, indices]

    clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
    clf.fit(X_train_sub, y_train)
    y_pred = clf.predict(X_valid_sub)

    micro = f1_score(y_valid, y_pred, average='micro')
    macro = f1_score(y_valid, y_pred, average='macro')
    print(f"{group_name:<15s} → Micro-F1: {micro:.4f}, Macro-F1: {macro:.4f}")

In [None]:
# group features
idx_content = list(range(1, 10))           # init_sim to 5W1H
idx_struct = list(range(10, 16))           # abs_pos to is_starter
idx_sentiment = list(range(16, 24))       # thx to lexicon count
idx_all = idx_content + idx_struct + idx_sentiment
idx_con_str = idx_content + idx_struct
idx_con_sent = idx_content + idx_sentiment
idx_str_sent = idx_struct + idx_sentiment

In [None]:
# evaluate feature groups
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_content, "Content")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_struct, "Structural")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_sentiment, "Sentiment")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_con_str, "Content+Struct")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_con_sent, "Content+Sent")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_str_sent, "Struct+Sent")
evaluate_feature_group(X_train, y_train, X_valid, y_valid, idx_all, "All")

Content         → Micro-F1: 0.2898, Macro-F1: 0.1666
Structural      → Micro-F1: 0.5696, Macro-F1: 0.2557
Sentiment       → Micro-F1: 0.2721, Macro-F1: 0.1484
Content+Struct  → Micro-F1: 0.6187, Macro-F1: 0.2927
Content+Sent    → Micro-F1: 0.3723, Macro-F1: 0.2019
Struct+Sent     → Micro-F1: 0.6278, Macro-F1: 0.3035
All             → Micro-F1: 0.6495, Macro-F1: 0.3331


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.preprocessing import MultiLabelBinarizer

# feature names list
feature_names = [
    "init_sim", "thread_sim", "question_mark", "duplicate",
    "who", "what", "when", "where", "why", "how",
    "abs_pos", "norm_pos", "length", "unique_len", "stemmed_len", "is_starter",
    "has_thanks", "exclam_mark", "verbal_feedback",
    "sent_neg", "sent_neu", "sent_pos",
    "lexicon_pos", "lexicon_neg"
]

# use label powerset for multi-label classification
rf = RandomForestClassifier(n_estimators=100, random_state=42)
lp = LabelPowerset(classifier=rf)
lp.fit(X_train, y_train)

# classifier and normalized feature importances
importances = lp.classifier.feature_importances_
normalized = importances / np.max(importances)
sorted_idx = np.argsort(normalized)[::-1]

print("🔍 Normalized Feature Importances (max=1.0):")
for rank, idx in enumerate(sorted_idx, 1):
    print(f"{rank:>2}. {feature_names[idx]:<15} → {normalized[idx]:.4f}")


🔍 Normalized Feature Importances (max=1.0):
 1. init_sim        → 1.0000
 2. abs_pos         → 0.9399
 3. thread_sim      → 0.7594
 4. norm_pos        → 0.7006
 5. sent_pos        → 0.6578
 6. sent_neu        → 0.6480
 7. length          → 0.5818
 8. unique_len      → 0.5225
 9. is_starter      → 0.5221
10. stemmed_len     → 0.5098
11. sent_neg        → 0.4044
12. question_mark   → 0.2206
13. has_thanks      → 0.1446
14. who             → 0.1158
15. what            → 0.1044
16. exclam_mark     → 0.1039
17. duplicate       → 0.0916
18. how             → 0.0887
19. lexicon_pos     → 0.0773
20. why             → 0.0534
21. verbal_feedback → 0.0508
22. when            → 0.0464
23. lexicon_neg     → 0.0390
24. where           → 0.0349
