# Domain specific classification

In [10]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from skopt import BayesSearchCV
from skopt.callbacks import VerboseCallback
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import balanced_accuracy_score, f1_score, make_scorer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import numpy as np

import pickle
import scipy

from scipy.sparse import save_npz, load_npz

SEED = 2608


In [6]:
data1 = []
with open('../data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open('../data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

data_test = []
with open('../data/raw/comp90051-2024s1-project-1/test_data.json', 'r') as f:
    for line in f:
        data_test.append(json.loads(line))
        
# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2
    
# Train Val Test Split

# get labels for stratification
label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = label1)
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = label2)
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = [data1[i] for i in train_ix_1]
val_data_1 = [data1[i] for i in val_ix_1]
test_data_1 = [data1[i] for i in test_ix_1]
train_data_2 = [data2[i] for i in train_ix_2]
val_data_2 = [data2[i] for i in val_ix_2]
test_data_2 = [data2[i] for i in test_ix_2]

# combine the data
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

# get text, label, domain and id for each split
train_text = [' '.join([str(encode) for encode in instance['text']]) for instance in train_data]
val_text = [' '.join([str(encode) for encode in instance['text']]) for instance in val_data]
test_text = [' '.join([str(encode) for encode in instance['text']]) for instance in test_data]
future_text = [' '.join([str(encode) for encode in instance['text']]) for instance in data_test]

train_label = [instance['label'] for instance in train_data]
val_label = [instance['label'] for instance in val_data]
test_label = [instance['label'] for instance in test_data]

train_domain = [instance['domain'] for instance in train_data]
val_domain = [instance['domain'] for instance in val_data]
test_domain = [instance['domain'] for instance in test_data]

train_id = list(range(len(train_data)))
val_id = list(range(len(val_data)))
test_id = list(range(len(test_data)))

In [48]:
train_df = pd.DataFrame({'text': train_text, 'label': train_label, 'domain': train_domain})
val_df = pd.DataFrame({'text': val_text, 'label': val_label, 'domain': val_domain})

# model_param_grids = {
#     'XGBClassifier': {
#         'model__n_estimators': [50, 100, 200],
#         'model__max_depth': [3, 5, 7],
#         'model__learning_rate': [0.01, 0.05, 0.1],
#         'model__subsample': [0.8, 1.0],
#         'model__colsample_bytree': [0.8, 1.0],
#         'model__random_state': [SEED], 
#         'model__use_label_encoder': [False], 
#         'model__eval_metric': ['logloss']
#     },
#     'CatBoostClassifier': {
#         'model__iterations': [50, 100, 200],
#         'model__depth': [4, 5, 6],
#         'model__learning_rate': [0.01, 0.05, 0.1],
#         'model__l2_leaf_reg': [1, 3, 5],
#         'model__border_count': [32, 64, 128],
#         'model__verbose': [False],
#         'model__task_type': ['CPU'], 
#         'model__random_seed': [SEED]
#     },
#     'LGBMClassifier': {
#         'model__n_estimators': [50, 100, 200],
#         'model__num_leaves': [31, 63, 127],
#         'model__learning_rate': [0.01, 0.05, 0.1],
#         'model__max_depth': [-1, 5, 10],
#         'model__subsample': [0.8, 1.0],
#         'model__colsample_bytree': [0.8, 1.0],
#         'model__random_state': [SEED],
#         'model__verbose': [-1]
#     }
# }

model_search_spaces = {
    'XGBClassifier': {
        'model__n_estimators': Integer(50, 200),
        'model__max_depth': Integer(3, 7),
        'model__learning_rate': Real(0.01, 0.1, prior='log-uniform'),
        'model__subsample': Real(0.8, 1.0),
        'model__colsample_bytree': Real(0.8, 1.0),
        'model__eval_metric': Categorical(['logloss']),
        'model__use_label_encoder': Categorical([False]),
        'model__random_state': Categorical([SEED])
    },
    'CatBoostClassifier': {
        'model__iterations': Integer(50, 200),
        'model__depth': Integer(4, 6),
        'model__learning_rate': Real(0.01, 0.1, prior='log-uniform'),
        'model__l2_leaf_reg': Integer(1, 5),
        'model__border_count': Integer(32, 128),
        'model__verbose': Categorical([False]),
        'model__task_type': Categorical(['CPU']),
        'model__random_seed': Categorical([SEED])
    },
    'LGBMClassifier': {
        'model__n_estimators': Integer(50, 200),
        'model__num_leaves': Integer(31, 127),
        'model__learning_rate': Real(0.01, 0.1, prior='log-uniform'),
        'model__max_depth': Integer(-1, 10),
        'model__subsample': Real(0.8, 1.0),
        'model__colsample_bytree': Real(0.8, 1.0),
        'model__random_state': Categorical([SEED]),
        'model__verbose': Categorical([-1])
    }
}


# Custom scoring function that considers both F1 score and balanced accuracy
def custom_score(y_true, y_pred):
    return (f1_score(y_true, y_pred, average='binary') + balanced_accuracy_score(y_true, y_pred)) / 2

custom_scorer = make_scorer(custom_score, greater_is_better=True)


def train_and_evaluate_bayesian(model_name, search_spaces, train_texts, train_labels, val_texts, val_labels):
    if model_name == 'XGBClassifier':
        model = XGBClassifier(random_state=SEED, use_label_encoder=False, eval_metric='logloss')
    elif model_name == 'CatBoostClassifier':
        model = CatBoostClassifier(task_type='CPU', verbose=False, random_seed=SEED)
    elif model_name == 'LGBMClassifier':
        model = LGBMClassifier(random_state=SEED, verbose=-1)
    else:
        raise ValueError("Model name not recognized.")

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('model', model)
    ])

    verbose_callback = VerboseCallback(n_total=30)

    opt = BayesSearchCV(
        estimator=pipeline,
        search_spaces=search_spaces,
        n_iter=30,
        scoring=custom_scorer,
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=SEED
    )

    opt.fit(train_texts, train_labels, callback=[verbose_callback])

    best_model = opt.best_estimator_
    
    y_pred = best_model.predict(val_texts)

    bal_acc = balanced_accuracy_score(val_labels, y_pred)
    f1 = f1_score(val_labels, y_pred, average='binary')
    custom_score_val = (f1 + bal_acc) / 2

    print(f"\nBest Model for {model_name}: {opt.best_params_}")
    print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
    print(f"F1 Score: {f1}")
    print(f"Balanced Accuracy: {bal_acc}")




In [14]:
# Iterate over domains and global data
domains = [1, 2]
for domain in domains:
    print(f"\n--- Training for Domain {domain} ---")
    domain_train_df = train_df[train_df['domain'] == domain]
    domain_val_df = val_df[val_df['domain'] == domain]
    
    for model_name, params in model_search_spaces.items():
        print(f"\nTraining {model_name} for Domain {domain}:")
        train_and_evaluate_bayesian(model_name, params, domain_train_df['text'], domain_train_df['label'], domain_val_df['text'], domain_val_df['label'])




--- Training for Domain 1 ---

Training XGBClassifier for Domain 1:
Iteration No: 1 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 2.6963
Function value obtained: -0.7015
Current minimum: -0.7015
Iteration No: 2 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 7.6739
Function value obtained: -0.7278
Current minimum: -0.7278
Iteration No: 3 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 4.0386
Function value obtained: -0.6870
Current minimum: -0.7278
Iteration No: 4 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration N

In [16]:
# Domain 1
# Best Model for XGBClassifier: OrderedDict([('model__colsample_bytree', 0.8), ('model__eval_metric', 'logloss'), ('model__learning_rate', 0.1), ('model__max_depth', 7), ('model__n_estimators', 200), ('model__random_state', 2608), ('model__subsample', 0.9389123852661309), ('model__use_label_encoder', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.7566926070038911
# F1 Score: 0.7600518806744487
# Balanced Accuracy: 0.7533333333333334

# Best Model for CatBoostClassifier: OrderedDict([('model__border_count', 128), ('model__depth', 6), ('model__iterations', 200), ('model__l2_leaf_reg', 5), ('model__learning_rate', 0.1), ('model__random_seed', 2608), ('model__task_type', 'CPU'), ('model__verbose', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.7493124456048739
# F1 Score: 0.751958224543081
# Balanced Accuracy: 0.7466666666666667

# Best Model for LGBMClassifier: OrderedDict([('model__colsample_bytree', 0.8), ('model__learning_rate', 0.1), ('model__max_depth', 0), ('model__n_estimators', 200), ('model__num_leaves', 96), ('model__random_state', 2608), ('model__subsample', 0.9614722243365308), ('model__verbose', -1)])
# Custom Score (Avg of F1 and Balanced Acc): 0.768954248366013
# F1 Score: 0.7712418300653594
# Balanced Accuracy: 0.7666666666666666

# Domain 2
# Best Model for XGBClassifier: OrderedDict([('model__colsample_bytree', 0.9509656934433964), ('model__eval_metric', 'logloss'), ('model__learning_rate', 0.09977125026745362), ('model__max_depth', 7), ('model__n_estimators', 188), ('model__random_state', 2608), ('model__subsample', 0.8176104457553566), ('model__use_label_encoder', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.4801831922322667
# F1 Score: 0.3498349834983498
# Balanced Accuracy: 0.6105314009661836

# Best Model for CatBoostClassifier: OrderedDict([('model__border_count', 32), ('model__depth', 6), ('model__iterations', 200), ('model__l2_leaf_reg', 1), ('model__learning_rate', 0.1), ('model__random_seed', 2608), ('model__task_type', 'CPU'), ('model__verbose', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.412109500805153
# F1 Score: 0.2518518518518519
# Balanced Accuracy: 0.5723671497584542


# Best Model for LGBMClassifier: OrderedDict([('model__colsample_bytree', 0.8362026127124959), ('model__learning_rate', 0.061215231693238024), ('model__max_depth', 0), ('model__n_estimators', 200), ('model__num_leaves', 31), ('model__random_state', 2608), ('model__subsample', 1.0), ('model__verbose', -1)])
# Custom Score (Avg of F1 and Balanced Acc): 0.5102960684716591
# F1 Score: 0.3922829581993569
# Balanced Accuracy: 0.6283091787439614

In [38]:
# extract above parameters for evaluation on test
# Domain 1
xgb_1_clf_param = {'model__colsample_bytree': 0.8, 'model__eval_metric': 'logloss', 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200, 'model__random_state': 2608, 'model__subsample': 0.9389123852661309, 'model__use_label_encoder': False}
cat_1_clf_param = {'model__border_count': 128, 'model__depth': 6, 'model__iterations': 200, 'model__l2_leaf_reg': 5, 'model__learning_rate': 0.1, 'model__random_seed': 2608, 'model__task_type': 'CPU', 'model__verbose': False}
lgb_1_clf_param = {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 0, 'model__n_estimators': 200, 'model__num_leaves': 96, 'model__random_state': 2608, 'model__subsample': 0.9614722243365308, 'model__verbose': -1}
# Adjusted parameters for CatBoostClassifier
cat_1_clf_param_adjusted = {
    'border_count': cat_1_clf_param['model__border_count'],
    'depth': cat_1_clf_param['model__depth'],
    'iterations': cat_1_clf_param['model__iterations'],
    'l2_leaf_reg': cat_1_clf_param['model__l2_leaf_reg'],
    'learning_rate': cat_1_clf_param['model__learning_rate'],
    'random_seed': cat_1_clf_param['model__random_seed'],
    'task_type': cat_1_clf_param['model__task_type'],
    'verbose': cat_1_clf_param['model__verbose']
}


# Domain 2
xgb_2_clf_param = {'model__colsample_bytree': 0.9509656934433964, 'model__eval_metric': 'logloss', 'model__learning_rate': 0.09977125026745362, 'model__max_depth': 7, 'model__n_estimators': 188, 'model__random_state': 2608, 'model__subsample': 0.8176104457553566, 'model__use_label_encoder': False}
cat_2_clf_param = {'model__border_count': 32, 'model__depth': 6, 'model__iterations': 200, 'model__l2_leaf_reg': 1, 'model__learning_rate': 0.1, 'model__random_seed': 2608, 'model__task_type': 'CPU', 'model__verbose': False}
cat_2_clf_param_adjusted = {
    'border_count': cat_2_clf_param['model__border_count'],
    'depth': cat_2_clf_param['model__depth'],
    'iterations': cat_2_clf_param['model__iterations'],
    'l2_leaf_reg': cat_2_clf_param['model__l2_leaf_reg'],
    'learning_rate': cat_2_clf_param['model__learning_rate'],
    'random_seed': cat_2_clf_param['model__random_seed'],
    'task_type': cat_2_clf_param['model__task_type'],
    'verbose': cat_2_clf_param['model__verbose']
}
lgb_2_clf_param = {'model__colsample_bytree': 0.8362026127124959, 'model__learning_rate': 0.061215231693238024, 'model__max_depth': 0, 'model__n_estimators': 200, 'model__num_leaves': 31, 'model__random_state': 2608, 'model__subsample': 1.0, 'model__verbose': -1} 

In [37]:
# prepare data for evaluation adding training and validation 

eval_1_df = pd.concat([train_df[train_df['domain'] == 1], val_df[val_df['domain'] == 1]])

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train = tfidf_vectorizer.fit_transform(eval_1_df['text'])

# Convert test_data_1 to a DataFrame if it's a list
test_data_1_df = pd.DataFrame(test_data_1)
# Convert the list of tokens back to strings for TF-IDF Vectorization
test_data_1_df['text_str'] = test_data_1_df['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Now use the 'text_str' column for transformation
X_test = tfidf_vectorizer.transform(test_data_1_df['text_str'])

In [41]:
# Initialize the models with the extracted best parameters
xgb_model1 = XGBClassifier(**xgb_1_clf_param)

# Train the model on the vectorized text
xgb_model1.fit(X_train, eval_1_df['label'])

# Predict on the test set
pred_xgb = xgb_model1.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_1_df['label'], pred_xgb)
f1 = f1_score(test_data_1_df['label'], pred_xgb, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")


Parameters: { "model__colsample_bytree", "model__eval_metric", "model__learning_rate", "model__max_depth", "model__n_estimators", "model__random_state", "model__subsample", "model__use_label_encoder" } are not used.



Domain 1

Custom Score (Avg of F1 and Balanced Acc): 0.763419689119171
F1 Score: 0.766839378238342
Balanced Accuracy: 0.76


In [39]:
catboost_model1 = CatBoostClassifier(**cat_1_clf_param_adjusted)
 
# Train the model on the vectorized text
catboost_model1.fit(X_train, eval_1_df['label'])

# Predict on the test set
pred_cat = catboost_model1.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_1_df['label'], pred_cat)
f1 = f1_score(test_data_1_df['label'], pred_cat, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

Custom Score (Avg of F1 and Balanced Acc): 0.7547687074829932
F1 Score: 0.7602040816326532
Balanced Accuracy: 0.7493333333333333


In [40]:
lgbm_model1 = LGBMClassifier(**lgb_1_clf_param)

# Train the model on the vectorized text
lgbm_model1.fit(X_train, eval_1_df['label'])

# Predict on the test set
pred_lgbm = lgbm_model1.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_1_df['label'], pred_lgbm)
f1 = f1_score(test_data_1_df['label'], pred_lgbm, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

[LightGBM] [Info] Number of positive: 2125, number of negative: 2125
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.169297 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 92819
[LightGBM] [Info] Number of data points in the train set: 4250, number of used features: 2159
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Custom Score (Avg of F1 and Balanced Acc): 0.7598677110530896
F1 Score: 0.7624020887728459
Balanced Accuracy: 0.7573333333333333


In [42]:
# prepare data for evaluation adding training and validation 

eval_2_df = pd.concat([train_df[train_df['domain'] == 2], val_df[val_df['domain'] == 2]])

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train = tfidf_vectorizer.fit_transform(eval_2_df['text'])

# Convert test_data_1 to a DataFrame if it's a list
test_data_2_df = pd.DataFrame(test_data_2)
# Convert the list of tokens back to strings for TF-IDF Vectorization
test_data_2_df['text_str'] = test_data_2_df['text'].apply(lambda tokens: ' '.join(map(str, tokens)))

# Now use the 'text_str' column for transformation
X_test = tfidf_vectorizer.transform(test_data_2_df['text_str'])

In [44]:
# Initialize the models with the extracted best parameters
xgb_model2 = XGBClassifier(**xgb_2_clf_param)

# Train the model on the vectorized text
xgb_model2.fit(X_train, eval_2_df['label'])

# Predict on the test set
pred_xgb = xgb_model2.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_2_df['label'], pred_xgb)
f1 = f1_score(test_data_2_df['label'], pred_xgb, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")


Parameters: { "model__colsample_bytree", "model__eval_metric", "model__learning_rate", "model__max_depth", "model__n_estimators", "model__random_state", "model__subsample", "model__use_label_encoder" } are not used.



Custom Score (Avg of F1 and Balanced Acc): 0.4862135875583803
F1 Score: 0.3588039867109634
Balanced Accuracy: 0.6136231884057971


In [45]:
catboost_model2 = CatBoostClassifier(**cat_2_clf_param_adjusted)
 
# Train the model on the vectorized text
catboost_model2.fit(X_train, eval_2_df['label'])

# Predict on the test set
pred_cat = catboost_model2.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_2_df['label'], pred_cat)
f1 = f1_score(test_data_2_df['label'], pred_cat, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

Custom Score (Avg of F1 and Balanced Acc): 0.3590775966183575
F1 Score: 0.17187500000000003
Balanced Accuracy: 0.546280193236715


In [46]:
lgbm_model2 = LGBMClassifier(**lgb_2_clf_param)

# Train the model on the vectorized text
lgbm_model2.fit(X_train, eval_2_df['label'])

# Predict on the test set
pred_lgbm = lgbm_model2.predict(X_test)

bal_acc = balanced_accuracy_score(test_data_2_df['label'], pred_lgbm)
f1 = f1_score(test_data_2_df['label'], pred_lgbm, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

[LightGBM] [Info] Number of positive: 1275, number of negative: 9775
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.396122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 345034
[LightGBM] [Info] Number of data points in the train set: 11050, number of used features: 6513
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.115385 -> initscore=-2.036882
[LightGBM] [Info] Start training from score -2.036882
Custom Score (Avg of F1 and Balanced Acc): 0.4991765044630003
F1 Score: 0.3774834437086093
Balanced Accuracy: 0.6208695652173913


# Domain classification

In [23]:
data1 = []
with open('../data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open('../data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2
    
# Train Val Test Split
data = data1 + data2

# Extract domain labels for stratification
domains = [item['domain'] for item in data]

# Split the combined data into training and validation+test sets
train_data, temp_data, train_labels, temp_labels = train_test_split(
    data, domains, test_size=0.3, random_state=SEED, stratify=domains)

# Split the temp_data further into validation and test sets
val_data, test_data, val_labels, test_labels = train_test_split(
    temp_data, temp_labels, test_size=0.5, random_state=SEED, stratify=temp_labels)


In [55]:
train_domain_clf_df = pd.DataFrame({'text': [instance['text'] for instance in train_data], 'domain': [instance['domain'] for instance in train_data]})
val_domain_clf_df = pd.DataFrame({'text': [instance['text'] for instance in val_data], 'domain': [instance['domain'] for instance in val_data]})
test_domain_clf_df = pd.DataFrame({'text': [instance['text'] for instance in test_data], 'domain': [instance['domain'] for instance in test_data]})

train_domain_clf_df['text'] = train_domain_clf_df['text'].apply(lambda x: ' '.join(map(str, x)))
val_domain_clf_df['text'] = val_domain_clf_df['text'].apply(lambda x: ' '.join(map(str, x)))
test_domain_clf_df['text'] = test_domain_clf_df['text'].apply(lambda x: ' '.join(map(str, x)))

# Adjust domain labels to start from 0
train_domain_clf_df['domain'] = train_domain_clf_df['domain'] - 1
val_domain_clf_df['domain'] = val_domain_clf_df['domain'] - 1
test_domain_clf_df['domain'] = test_domain_clf_df['domain'] - 1




In [56]:
for model_name, params in model_search_spaces.items():
    print(f"\nTraining {model_name} for Domain {domain}:")
    train_and_evaluate_bayesian(model_name, params, train_domain_clf_df['text'], train_domain_clf_df['domain'], val_domain_clf_df['text'], val_domain_clf_df['domain'])




Training XGBClassifier for Domain 2:
Iteration No: 1 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits


Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 7.7443
Function value obtained: -0.9775
Current minimum: -0.9775
Iteration No: 2 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 21.7845
Function value obtained: -0.9876
Current minimum: -0.9876
Iteration No: 3 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 12.3412
Function value obtained: -0.9621
Current minimum: -0.9876
Iteration No: 4 started. Searching for the next optimal point.
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 15.5705
Function value obtained: -0.9817
Current minimum: -0.9876
Iteration No: 5 started. Searching for the next optimal

In [None]:
# Best Model for XGBClassifier: OrderedDict([('model__colsample_bytree', 0.8145194965386008), ('model__eval_metric', 'logloss'), ('model__learning_rate', 0.1), ('model__max_depth', 7), ('model__n_estimators', 200), ('model__random_state', 2608), ('model__subsample', 0.830583668052698), ('model__use_label_encoder', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.9931305695670449
# F1 Score: 0.9953893442622951
# Balanced Accuracy: 0.9908717948717949

# Best Model for CatBoostClassifier: OrderedDict([('model__border_count', 32), ('model__depth', 6), ('model__iterations', 200), ('model__l2_leaf_reg', 1), ('model__learning_rate', 0.1), ('model__random_seed', 2608), ('model__task_type', 'CPU'), ('model__verbose', False)])
# Custom Score (Avg of F1 and Balanced Acc): 0.9874976719784904
# F1 Score: 0.9918158567774935
# Balanced Accuracy: 0.9831794871794872

# Best Model for LGBMClassifier: OrderedDict([('model__colsample_bytree', 0.9684578910057579), ('model__learning_rate', 0.1), ('model__max_depth', -1), ('model__n_estimators', 200), ('model__num_leaves', 50), ('model__random_state', 2608), ('model__subsample', 0.9431436290094395), ('model__verbose', -1)])
# Custom Score (Avg of F1 and Balanced Acc): 0.9935384615384615
# F1 Score: 0.9953846153846154
# Balanced Accuracy: 0.9916923076923077

In [57]:
xgb_domain_clf_param = {'model__colsample_bytree': 0.8145194965386008, 'model__eval_metric': 'logloss', 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 200, 'model__random_state': 2608, 'model__subsample': 0.830583668052698, 'model__use_label_encoder': False}
cat_domain_clf_param = {'model__border_count': 32, 'model__depth': 6, 'model__iterations': 200, 'model__l2_leaf_reg': 1, 'model__learning_rate': 0.1, 'model__random_seed': 2608, 'model__task_type': 'CPU', 'model__verbose': False}
lgb_domain_clf_param = {'model__colsample_bytree': 0.9684578910057579, 'model__learning_rate': 0.1, 'model__max_depth': -1, 'model__n_estimators': 200, 'model__num_leaves': 50, 'model__random_state': 2608, 'model__subsample': 0.9431436290094395, 'model__verbose': -1}

cat_clf_param_adjusted = {
    'border_count': cat_domain_clf_param['model__border_count'],
    'depth': cat_domain_clf_param['model__depth'],
    'iterations': cat_domain_clf_param['model__iterations'],
    'l2_leaf_reg': cat_domain_clf_param['model__l2_leaf_reg'],
    'learning_rate': cat_domain_clf_param['model__learning_rate'],
    'random_seed': cat_domain_clf_param['model__random_seed'],
    'task_type': cat_domain_clf_param['model__task_type'],
    'verbose': cat_domain_clf_param['model__verbose']
}




In [58]:

eval_df = pd.concat([train_domain_clf_df, val_domain_clf_df])

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train = tfidf_vectorizer.fit_transform(eval_df['text'])

# Now use the 'text_str' column for transformation
X_test = tfidf_vectorizer.transform(test_domain_clf_df['text'])

In [62]:
lgbm_domain_clf = LGBMClassifier(**lgb_domain_clf_param)

# Train the model on the vectorized text
lgbm_domain_clf.fit(X_train, eval_df['domain'])

# Predict on the test set
pred_lgbm = lgbm_domain_clf.predict(X_test)

bal_acc = balanced_accuracy_score(test_domain_clf_df['domain'], pred_lgbm)
f1 = f1_score(test_domain_clf_df['domain'], pred_lgbm, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

[LightGBM] [Info] Number of positive: 11050, number of negative: 4250
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.495891 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 408527
[LightGBM] [Info] Number of data points in the train set: 15300, number of used features: 7696
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.722222 -> initscore=0.955511
[LightGBM] [Info] Start training from score 0.955511
Custom Score (Avg of F1 and Balanced Acc): 0.9962057853303368
F1 Score: 0.9974372116863147
Balanced Accuracy: 0.9949743589743589


In [64]:
xgb_domain_clf = XGBClassifier(**xgb_domain_clf_param)

# Train the model on the vectorized text
xgb_domain_clf.fit(X_train, eval_df['domain'])

# Predict on the test set
pred_xgb = xgb_domain_clf.predict(X_test)

bal_acc = balanced_accuracy_score(test_domain_clf_df['domain'], pred_xgb)
f1 = f1_score(test_domain_clf_df['domain'], pred_xgb, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")


Parameters: { "model__colsample_bytree", "model__eval_metric", "model__learning_rate", "model__max_depth", "model__n_estimators", "model__random_state", "model__subsample", "model__use_label_encoder" } are not used.



Custom Score (Avg of F1 and Balanced Acc): 0.9939008813897475
F1 Score: 0.9961607371384694
Balanced Accuracy: 0.9916410256410256


In [65]:
cat_domain_clf = CatBoostClassifier(**cat_clf_param_adjusted)

# Train the model on the vectorized text
cat_domain_clf.fit(X_train, eval_df['domain'])

# Predict on the test set
pred_cat = cat_domain_clf.predict(X_test)

bal_acc = balanced_accuracy_score(test_domain_clf_df['domain'], pred_cat)
f1 = f1_score(test_domain_clf_df['domain'], pred_cat, average='binary')
custom_score_val = (f1 + bal_acc) / 2

print(f"Custom Score (Avg of F1 and Balanced Acc): {custom_score_val}")
print(f"F1 Score: {f1}")
print(f"Balanced Accuracy: {bal_acc}")

Custom Score (Avg of F1 and Balanced Acc): 0.98970295595877
F1 Score: 0.9936110401226681
Balanced Accuracy: 0.9857948717948718
