# Prepare data and select features

In [5]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from collections import defaultdict as dd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
import pandas as pd

import pickle
import scipy

from scipy.sparse import save_npz, load_npz

In [6]:
# Every raw file is read line by line, each line being parsed as one JSON document.
with open('../data/raw/comp90051-2024s1-project-1/domain1_train_data.json', 'r') as f:
    data1 = [json.loads(line) for line in f]

with open('../data/raw/comp90051-2024s1-project-1/domain2_train_data.json', 'r') as f:
    data2 = [json.loads(line) for line in f]

# Unlabelled instances that the final predictions are produced for.
with open('../data/raw/comp90051-2024s1-project-1/test_data.json', 'r') as f:
    data_test = [json.loads(line) for line in f]

In [7]:
# Random seed passed as `random_state` to the train/val/test splits and to the XGBoost models.
SEED = 2608

In [8]:
# Experiment tag embedded in the file names of the exported feature-importance pickles.
EXPERIMENT = 'Baseline'

---
# Train Val Test Split

In [9]:
# Tag every instance in place with the domain file it was loaded from.
for instance in data1:
    instance['domain'] = 1
for instance in data2:
    instance['domain'] = 2

In [10]:
# Train / Val / Test split
# Each domain is split on its own, stratified by label: 70% train, and the
# remaining 30% is halved into validation and test. Every call is seeded with SEED.

# --- domain 1 ---
label1 = [instance['label'] for instance in data1]
train_ix_1, val_test_ix_1 = train_test_split(
    range(len(data1)),
    test_size=0.3,
    random_state=SEED,
    stratify=label1,
)
val_ix_1, test_ix_1 = train_test_split(
    val_test_ix_1,
    test_size=0.5,
    random_state=SEED,
    stratify=[label1[ix] for ix in val_test_ix_1],
)

# --- domain 2 ---
label2 = [instance['label'] for instance in data2]
train_ix_2, val_test_ix_2 = train_test_split(
    range(len(data2)),
    test_size=0.3,
    random_state=SEED,
    stratify=label2,
)
val_ix_2, test_ix_2 = train_test_split(
    val_test_ix_2,
    test_size=0.5,
    random_state=SEED,
    stratify=[label2[ix] for ix in val_test_ix_2],
)

# materialise the instances selected by each index list
train_data_1 = [data1[ix] for ix in train_ix_1]
train_data_2 = [data2[ix] for ix in train_ix_2]
val_data_1 = [data1[ix] for ix in val_ix_1]
val_data_2 = [data2[ix] for ix in val_ix_2]
test_data_1 = [data1[ix] for ix in test_ix_1]
test_data_2 = [data2[ix] for ix in test_ix_2]

# combined splits: domain 1 instances first, domain 2 instances after them
train_data = [*train_data_1, *train_data_2]
val_data = [*val_data_1, *val_data_2]
test_data = [*test_data_1, *test_data_2]

In [11]:
# Text: every element of an instance's 'text' sequence is stringified and the
# pieces are joined with single spaces, giving one document string per instance.
train_text = [' '.join(map(str, instance['text'])) for instance in train_data]
val_text = [' '.join(map(str, instance['text'])) for instance in val_data]
test_text = [' '.join(map(str, instance['text'])) for instance in test_data]
future_text = [' '.join(map(str, instance['text'])) for instance in data_test]

# Labels, domains and positional ids, kept in the same order as the texts above.
train_label = [instance['label'] for instance in train_data]
train_domain = [instance['domain'] for instance in train_data]
train_id = [*range(len(train_data))]

val_label = [instance['label'] for instance in val_data]
val_domain = [instance['domain'] for instance in val_data]
val_id = [*range(len(val_data))]

test_label = [instance['label'] for instance in test_data]
test_domain = [instance['domain'] for instance in test_data]
test_id = [*range(len(test_data))]

In [12]:
# TF-IDF features: the vectorizer is fitted on the training texts only and then
# applied unchanged to every split.
# NOTE(review): with scikit-learn's default token_pattern only tokens of two or
# more characters are kept, so single-character tokens in the space-joined text
# are ignored - confirm this is intended before changing it (saved models depend
# on the current columns).
TFIDF = TfidfVectorizer(max_features=83581) # from EDA
TFIDF.fit(train_text)

train_tfidf, val_tfidf, test_tfidf, future_tfidf = (
    TFIDF.transform(texts)
    for texts in (train_text, val_text, test_text, future_text)
)

In [21]:
# Bag-of-words counts: the vocabulary is learnt from the training texts only and
# then applied unchanged to every split.
# NOTE(review): with scikit-learn's default token_pattern only tokens of two or
# more characters are kept, so single-character tokens in the space-joined text
# are ignored - confirm this is intended before changing it.
BoW = CountVectorizer(max_features=83581) # from EDA
BoW.fit(train_text)

train_bow, val_bow, test_bow, future_bow = (
    BoW.transform(texts)
    for texts in (train_text, val_text, test_text, future_text)
)

In [23]:
# Persist the sparse feature matrices (one .npz file per split and representation).
for npz_path, split_matrix in (
    ("../data/curated/baseline/train_tfidf_x.npz", train_tfidf),
    ("../data/curated/baseline/val_tfidf_x.npz", val_tfidf),
    ("../data/curated/baseline/test_tfidf_x.npz", test_tfidf),
    ("../data/curated/baseline/future_tfidf_x.npz", future_tfidf),
    ("../data/curated/baseline/train_bow_x.npz", train_bow),
    ("../data/curated/baseline/val_bow_x.npz", val_bow),
    ("../data/curated/baseline/test_bow_x.npz", test_bow),
    ("../data/curated/baseline/future_bow_x.npz", future_bow),
):
    save_npz(npz_path, split_matrix)

# Persist the label lists of the three labelled splits as pickles.
for pkl_path, split_labels in (
    ("../data/curated/baseline/train_y.pkl", train_label),
    ("../data/curated/baseline/val_y.pkl", val_label),
    ("../data/curated/baseline/test_y.pkl", test_label),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(split_labels, f)

---
# Feature Importance

In [24]:
def get_feature_importance(model, gap:int=0):

    """
        Builds cumulative, importance-ranked feature subsets from a fitted model.

        Features are ranked by descending importance and features whose importance is
        not strictly positive are dropped. The ranking is then walked from the top:
        every time the number of collected features is a multiple of `gap`, the
        features collected so far are recorded together with the sum of their
        importances. If the total number of ranked features is not a multiple of
        `gap`, one last entry holding all of them is appended.

        Input:
            - model: model which has been fitted - must have a feature_importances_ attribute
            - gap: int - how many features are added between two recorded subsets.
                   Must not be 0: the default is kept only for signature
                   compatibility and is rejected, so pass the value explicitly.

        Output:
            - dict mapping a tuple of feature indices (positions in
              model.feature_importances_, most important first) to the cumulative
              importance of that subset. Entries are inserted from the smallest
              subset to the largest. Empty when no feature has positive importance.

        Raises:
            - ValueError: if gap is 0
    """

    if gap == 0:
        # Previously this surfaced as a ZeroDivisionError from the modulo below.
        raise ValueError("gap must be a non-zero integer (number of features per step)")

    # (feature index, importance) pairs, most important first; the sort is stable,
    # so ties keep their original index order.
    feature_importance = list(model.feature_importances_)
    ranked = sorted(enumerate(feature_importance), key=lambda pair: pair[1], reverse=True)
    ranked = [pair for pair in ranked if pair[1] > 0]

    ordered_feature_importance = {}
    curr = []
    score = 0
    for position, (feature_index, importance) in enumerate(tqdm(ranked), start=1):

        curr.append(feature_index)
        score += importance

        if position % gap == 0:
            ordered_feature_importance[tuple(curr)] = score

    # account for the last combo (if it doesn't fit into the gap); the `curr` guard
    # keeps the result empty instead of failing when nothing was ranked
    if curr and len(curr) % gap != 0:
        ordered_feature_importance[tuple(curr)] = score

    return ordered_feature_importance

In [25]:
# Fit an XGBoost classifier on the TF-IDF training matrix; it is only used here as
# the source of feature importances.
tfidf_xgb = XGBClassifier(
    n_estimators=100,
    max_depth=12,
    subsample=0.75,
    colsample_bytree=0.75,
    random_state=SEED,
)
tfidf_xgb.fit(train_tfidf, train_label)

# Cumulative feature subsets, growing by 10 features per step.
xgb_tfidf_feature_importance_ordering = get_feature_importance(tfidf_xgb, gap=10)

# export
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
    pickle.dump(xgb_tfidf_feature_importance_ordering, f)

100%|██████████| 924/924 [00:00<00:00, 209194.48it/s]


In [26]:
# Fit an XGBoost classifier on the bag-of-words training matrix; it is only used
# here as the source of feature importances.
bow_xgb = XGBClassifier(
    n_estimators=100,
    max_depth=12,
    subsample=0.75,
    colsample_bytree=0.75,
    random_state=SEED,
)
bow_xgb.fit(train_bow, train_label)

# Cumulative feature subsets, growing by 10 features per step.
xgb_bow_feature_importance_ordering = get_feature_importance(bow_xgb, gap=10)

# export
with open(f'../models/xgb_bow_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
    pickle.dump(xgb_bow_feature_importance_ordering, f)

100%|██████████| 1126/1126 [00:00<00:00, 767433.59it/s]


---
## Inference 1

In [40]:
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# artefacts produced by this project.
with open('../models/tmp_models/jiaochengb_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

# Feature subset fed to the classifier: entry 86 of the importance ordering.
# Looked up once instead of rebuilding the key list for every call.
# NOTE(review): presumably the subset this model was tuned on - TODO confirm.
selected_features = list(feature_importance_ordering.keys())[86]

# accuracy, F1 and balanced accuracy for train / val / test; each split is
# predicted once and the predictions are shared by the three metrics
for split_x, split_y in ((train_tfidf, train_label), (val_tfidf, val_label), (test_tfidf, test_label)):
    split_pred = clf.predict(split_x[:, selected_features])
    print(accuracy_score(split_y, split_pred),
          f1_score(split_y, split_pred, average='binary'),
          balanced_accuracy_score(split_y, split_pred))

# predictions for the unlabelled data, ids being the row positions
future_predict = clf.predict(future_tfidf[:, selected_features])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb_lgbc_xgb_tfidf_Baseline.csv', index=False)

0.9996825396825397 0.9992852037169406 0.9992857142857143
0.8688888888888889 0.6685393258426966 0.7710714285714286
0.87 0.6666666666666666 0.7682142857142857


In [10]:
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# artefacts produced by this project.
with open('../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

# Feature subset fed to the classifier: entry 92 of the importance ordering.
# Looked up once instead of rebuilding the key list for every call.
# NOTE(review): presumably the subset this model was tuned on - TODO confirm.
selected_features = list(feature_importance_ordering.keys())[92]

# accuracy, F1 and balanced accuracy for train / val / test; each split is
# predicted once and the predictions are shared by the three metrics
for split_x, split_y in ((train_tfidf, train_label), (val_tfidf, val_label), (test_tfidf, test_label)):
    split_pred = clf.predict(split_x[:, selected_features])
    print(accuracy_score(split_y, split_pred),
          f1_score(split_y, split_pred, average='binary'),
          balanced_accuracy_score(split_y, split_pred))

# predictions for the unlabelled data, ids being the row positions
future_predict = clf.predict(future_tfidf[:, selected_features])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.csv', index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1.0 1.0 1.0
0.8618518518518519 0.6510757717492984 0.7611904761904762
0.8614814814814815 0.638996138996139 0.7508333333333332


In [11]:
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# artefacts produced by this project.
with open('../models/tmp_models/jiaochengb-f1_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

# Feature subset fed to the classifier: entry 83 of the importance ordering.
# Looked up once instead of rebuilding the key list for every call.
# NOTE(review): presumably the subset this model was tuned on - TODO confirm.
selected_features = list(feature_importance_ordering.keys())[83]

# accuracy, F1 and balanced accuracy for train / val / test; each split is
# predicted once and the predictions are shared by the three metrics
for split_x, split_y in ((train_tfidf, train_label), (val_tfidf, val_label), (test_tfidf, test_label)):
    split_pred = clf.predict(split_x[:, selected_features])
    print(accuracy_score(split_y, split_pred),
          f1_score(split_y, split_pred, average='binary'),
          balanced_accuracy_score(split_y, split_pred))

# predictions for the unlabelled data, ids being the row positions
future_predict = clf.predict(future_tfidf[:, selected_features])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb-f1_lgbc_xgb_tfidf_Baseline.csv', index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1.0 1.0 1.0
0.8629629629629629 0.6469465648854962 0.756547619047619
0.8622222222222222 0.6381322957198443 0.7495238095238095


In [14]:
# Per-domain balanced accuracy of the balanced-accuracy-tuned model on val and test.
# NOTE(review): this cell predicts on the full TF-IDF matrices, whereas the cell
# that loads the same pickle for inference slices the columns with entry 92 of the
# feature-importance ordering - confirm which input the stored model expects.
with open('../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
# with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
#     feature_importance_ordering = pickle.load(f)

val_pred = clf.predict(val_tfidf)
test_pred = clf.predict(test_tfidf)

# split predictions and true labels by the domain of each instance
val_dom1_pred = [pred for pred, dom in zip(val_pred, val_domain) if dom == 1]
val_dom2_pred = [pred for pred, dom in zip(val_pred, val_domain) if dom == 2]
val_dom1_true = [true for true, dom in zip(val_label, val_domain) if dom == 1]
val_dom2_true = [true for true, dom in zip(val_label, val_domain) if dom == 2]

test_dom1_pred = [pred for pred, dom in zip(test_pred, test_domain) if dom == 1]
test_dom2_pred = [pred for pred, dom in zip(test_pred, test_domain) if dom == 2]
test_dom1_true = [true for true, dom in zip(test_label, test_domain) if dom == 1]
test_dom2_true = [true for true, dom in zip(test_label, test_domain) if dom == 2]

# printed per split: domain 1, domain 2, unweighted mean of the two, all instances
val_dom1_score = balanced_accuracy_score(val_dom1_true, val_dom1_pred)
val_dom2_score = balanced_accuracy_score(val_dom2_true, val_dom2_pred)
print(val_dom1_score, val_dom2_score,
      (val_dom1_score + val_dom2_score) / 2,
      balanced_accuracy_score(val_label, val_pred))

test_dom1_score = balanced_accuracy_score(test_dom1_true, test_dom1_pred)
test_dom2_score = balanced_accuracy_score(test_dom2_true, test_dom2_pred)
print(test_dom1_score, test_dom2_score,
      (test_dom1_score + test_dom2_score) / 2,
      balanced_accuracy_score(test_label, test_pred))

# future_predict = clf.predict(future_tfidf)
# predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
# predictions.to_csv('../predictions/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.csv', index=False)

0.7746666666666666 0.6202898550724638 0.6974782608695652 0.7657142857142858
0.7786666666666666 0.6114975845410628 0.6950821256038647 0.7653571428571428
