# Prepare data (SMOTE global) and feature select

In [69]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from collections import defaultdict as dd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
import pandas as pd
from imblearn.over_sampling import SMOTE

import pickle
import scipy

from scipy.sparse import save_npz, load_npz

In [70]:
data1 = []
with open('../data/raw/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open('../data/raw/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

data_test = []
with open('../data/raw/test_data.json', 'r') as f:
    for line in f:
        data_test.append(json.loads(line))

In [71]:
SEED = 2608

In [72]:
EXPERIMENT = 'SMOTE'

---
# Train Val Test Split

In [73]:
# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2

In [74]:
# Train Val Test Split

# get labels for stratification
label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = label1)
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = label2)
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = [data1[i] for i in train_ix_1]
val_data_1 = [data1[i] for i in val_ix_1]
test_data_1 = [data1[i] for i in test_ix_1]
train_data_2 = [data2[i] for i in train_ix_2]
val_data_2 = [data2[i] for i in val_ix_2]
test_data_2 = [data2[i] for i in test_ix_2]

In [75]:
# merge data from two domains together
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

In [76]:
# get text, label, domain and id for each split
train_text = [' '.join([str(encode) for encode in instance['text']]) for instance in train_data]
val_text = [' '.join([str(encode) for encode in instance['text']]) for instance in val_data]
test_text = [' '.join([str(encode) for encode in instance['text']]) for instance in test_data]
future_text = [' '.join([str(encode) for encode in instance['text']]) for instance in data_test]

train_label = [instance['label'] for instance in train_data]
val_label = [instance['label'] for instance in val_data]
test_label = [instance['label'] for instance in test_data]

train_domain = [instance['domain'] for instance in train_data]
val_domain = [instance['domain'] for instance in val_data]
test_domain = [instance['domain'] for instance in test_data]

train_id = list(range(len(train_data)))
val_id = list(range(len(val_data)))
test_id = list(range(len(test_data)))

In [77]:
# TFIDF
TFIDF = TfidfVectorizer(max_features=83581) # from EDA

TFIDF.fit(train_text)
train_tfidf = TFIDF.transform(train_text)
val_tfidf = TFIDF.transform(val_text)
test_tfidf = TFIDF.transform(test_text)
future_tfidf = TFIDF.transform(future_text)

In [78]:
# Bag of Words
# BoW = CountVectorizer(max_features=83581) # from EDA

# BoW.fit(train_text)
# train_bow = BoW.transform(train_text)
# val_bow = BoW.transform(val_text)
# test_bow = BoW.transform(test_text)
# future_bow = BoW.transform(future_text)

In [79]:
# upsample the minority class (label 1) for domain 2 train data

# Use SMOTE to upsample data
sm = SMOTE(random_state=SEED)
train_tfidf, train_label = sm.fit_resample(train_tfidf, train_label)
# train_bow, train_label = sm.fit_resample(train_bow, train_label)

train_tfidf

<19600x68213 sparse matrix of type '<class 'numpy.float64'>'
	with 2916243 stored elements in Compressed Sparse Row format>

In [85]:
# save output
save_npz("../data/curated/baseline/train_tfidf_smote_x.npz", train_tfidf)
save_npz("../data/curated/baseline/val_tfidf_x.npz", val_tfidf)
save_npz("../data/curated/baseline/test_tfidf_x.npz", test_tfidf)
save_npz('../data/curated/baseline/future_tfidf_x.npz', future_tfidf)

# save_npz("../data/curated/baseline/train_bow_smote_x.npz", train_bow)
# save_npz("../data/curated/baseline/val_bow_x.npz", val_bow)
# save_npz("../data/curated/baseline/test_bow_x.npz", test_bow)
# save_npz('../data/curated/baseline/future_bow_x.npz', future_bow)

with open("../data/curated/baseline/train_smote_y.pkl", "wb") as f:
    pickle.dump(train_label, f)
with open("../data/curated/baseline/val_y.pkl", "wb") as f:
    pickle.dump(val_label, f)
with open("../data/curated/baseline/test_y.pkl", "wb") as f:
    pickle.dump(test_label, f)

---
Feature Importance

In [81]:
def get_feature_importance(model, gap:int=0):

    """
        Gets sequential feature importance of model and returns it in a list format

        Input:
            - model: model which has been fitted - must have feature_importances_ and feature_names_in_ attributes
            - gap: int - how many features to include in each iteration
    """

    ordered_feature_importance = {}

    feature_importance = list(model.feature_importances_)
    feature_importance_list = [(i, feature_importance[i]) for i in range(len(feature_importance))]
    feature_importance_list.sort(key=lambda x: x[1], reverse=True)
    feature_importance_list = [x for x in feature_importance_list if x[1] > 0]

    curr = []
    score = 0
    for i in tqdm(range(len(feature_importance_list))):

        curr.append(feature_importance_list[i][0])
        score += feature_importance_list[i][1]


        if (i+1) % gap == 0:
            ordered_feature_importance[tuple(curr)] = score
    
    if (i+1) % gap != 0: # account for last combo (if it doesn't fit into the gap)
        ordered_feature_importance[tuple(curr)] = score

    return ordered_feature_importance

In [82]:
tfidf_xgb = XGBClassifier(max_depth = 12, 
                            random_state = SEED, 
                            n_estimators = 100,
                            colsample_bytree = 0.75,
                            subsample = 0.75
                            )

tfidf_xgb.fit(train_tfidf, train_label)

xgb_tfidf_feature_importance_ordering = get_feature_importance(tfidf_xgb, 10) 

# export
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
    pickle.dump(xgb_tfidf_feature_importance_ordering, f)

100%|██████████| 1201/1201 [00:00<00:00, 935269.05it/s]


In [83]:
bow_xgb = XGBClassifier(max_depth = 12, 
                            random_state = SEED, 
                            n_estimators = 100,
                            colsample_bytree = 0.75,
                            subsample = 0.75
                            )

bow_xgb.fit(train_bow, train_label)

xgb_bow_feature_importance_ordering = get_feature_importance(bow_xgb, 10) 

# export
with open(f'../models/xgb_bow_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
    pickle.dump(xgb_bow_feature_importance_ordering, f)

XGBoostError: [18:59:15] /Users/runner/work/xgboost/xgboost/src/data/data.cc:501: Check failed: this->labels.Size() % this->num_row_ == 0 (7000 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x000000017a8f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x000000017a9ad1fc xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 732
  [bt] (2) 3   libxgboost.dylib                    0x000000017a9acdc4 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 164
  [bt] (3) 4   libxgboost.dylib                    0x000000017a907688 XGDMatrixSetInfoFromInterface + 224
  [bt] (4) 5   libffi.8.dylib                      0x000000010530004c ffi_call_SYSV + 76
  [bt] (5) 6   libffi.8.dylib                      0x00000001052fd74c ffi_call_int + 1208
  [bt] (6) 7   _ctypes.cpython-311-darwin.so       0x00000001053b8bb4 _ctypes_callproc + 1208
  [bt] (7) 8   _ctypes.cpython-311-darwin.so       0x00000001053b2e34 PyCFuncPtr_call + 1188
  [bt] (8) 9   python3.11                          0x000000010498c1fc _PyObject_MakeTpCall + 332



---
## Inference 1

In [None]:
with open('../models/tmp_models/jiaochengb_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

print(accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[86]])), 
      f1_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[86]]), average='binary'), 
    balanced_accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[86]])))
print(accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[86]])), 
      f1_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[86]]), average='binary'), 
    balanced_accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[86]])))
print(accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[86]])), 
      f1_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[86]]), average='binary'),
    balanced_accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[86]])))

future_predict = clf.predict(future_tfidf[:, list(feature_importance_ordering.keys())[86]])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb_lgbc_xgb_tfidf_Baseline.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../models/tmp_models/jiaochengb_lgbc_xgb_tfidf_Baseline.pickle'

In [None]:
with open('../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

print(accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[92]])), 
      f1_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[92]]), average='binary'), 
    balanced_accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[92]])))
print(accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[92]])), 
      f1_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[92]]), average='binary'), 
    balanced_accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[92]])))
print(accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[92]])), 
      f1_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[92]]), average='binary'),
    balanced_accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[92]])))

future_predict = clf.predict(future_tfidf[:, list(feature_importance_ordering.keys())[92]])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.csv', index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1.0 1.0 1.0
0.8618518518518519 0.6510757717492984 0.7611904761904762
0.8614814814814815 0.638996138996139 0.7508333333333332


In [None]:
with open('../models/tmp_models/jiaochengb-f1_lgbc_xgb_tfidf_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open('../models/xgb_tfidf_feature_importance_ordering_Baseline.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

print(accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[83]])), 
      f1_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[83]]), average='binary'), 
    balanced_accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[83]])))
print(accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[83]])), 
      f1_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[83]]), average='binary'), 
    balanced_accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[83]])))
print(accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[83]])), 
      f1_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[83]]), average='binary'),
    balanced_accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[83]])))

future_predict = clf.predict(future_tfidf[:, list(feature_importance_ordering.keys())[83]])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb-f1_lgbc_xgb_tfidf_Baseline.csv', index=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


1.0 1.0 1.0
0.8629629629629629 0.6469465648854962 0.756547619047619
0.8622222222222222 0.6381322957198443 0.7495238095238095
