# Prepare data Basic

In [64]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from collections import defaultdict as dd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from sklearn.utils import resample, shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
from scipy.sparse import csr_matrix, hstack


import pickle
import scipy

from scipy.sparse import save_npz, load_npz

In [65]:
data1 = []
with open('../data/raw/domain1_train_data.json', 'r') as f:
    for line in f:
        data1.append(json.loads(line))

data2 = []
with open('../data/raw/domain2_train_data.json', 'r') as f:
    for line in f:
        data2.append(json.loads(line))

data_test = []
with open('../data/raw/test_data.json', 'r') as f:
    for line in f:
        data_test.append(json.loads(line))

In [66]:
SEED = 2608

In [67]:
EXPERIMENT = 'global_with_domain'

---
# Train Val Test Split

In [68]:
# create domain labels for data
for i in range(len(data1)):
    data1[i]['domain'] = 1
for i in range(len(data2)):
    data2[i]['domain'] = 2

In [69]:
# Train Val Test Split

# get labels for stratification
label1 = [instance['label'] for instance in data1]
label2 = [instance['label'] for instance in data2]

train_ix_1, val_test_ix_1 = train_test_split(range(len(data1)), test_size=0.3, random_state=SEED, stratify = label1)
train_ix_2, val_test_ix_2 = train_test_split(range(len(data2)), test_size=0.3, random_state=SEED, stratify = label2)
val_ix_1, test_ix_1 = train_test_split(val_test_ix_1, test_size=0.5, random_state=SEED, stratify = [data1[i]['label'] for i in val_test_ix_1])
val_ix_2, test_ix_2 = train_test_split(val_test_ix_2, test_size=0.5, random_state=SEED, stratify = [data2[i]['label'] for i in val_test_ix_2])

# split data according to the index from train_test_split
train_data_1 = [data1[i] for i in train_ix_1]
val_data_1 = [data1[i] for i in val_ix_1]
test_data_1 = [data1[i] for i in test_ix_1]
train_data_2 = [data2[i] for i in train_ix_2]
val_data_2 = [data2[i] for i in val_ix_2]
test_data_2 = [data2[i] for i in test_ix_2]

In [70]:
# # upsample the minority class (label 1) for domain 2 train data
# train_data2_label0 = [d for d in train_data_2 if d['label'] == 0]
# train_data2_label1 = [d for d in train_data_2 if d['label'] == 1]
# train_data2_label1_upsampled = resample(train_data2_label1, replace=True, n_samples=len(train_data2_label0), random_state=SEED)

# # Merge and shuffle the data back together
# train_data2 = train_data2_label0 + train_data2_label1_upsampled
# train_data2 = shuffle(train_data2, random_state=SEED)

# len(train_data2)

In [71]:
# merge data from two domains together
train_data = train_data_1 + train_data_2
val_data = val_data_1 + val_data_2
test_data = test_data_1 + test_data_2

In [72]:
# get text, label, domain and id for each split
train_text = [' '.join([str(encode) for encode in instance['text']]) for instance in train_data]
val_text = [' '.join([str(encode) for encode in instance['text']]) for instance in val_data]
test_text = [' '.join([str(encode) for encode in instance['text']]) for instance in test_data]
future_text = [' '.join([str(encode) for encode in instance['text']]) for instance in data_test]

train_label = [instance['label'] for instance in train_data]
val_label = [instance['label'] for instance in val_data]
test_label = [instance['label'] for instance in test_data]

train_domain = [instance['domain'] for instance in train_data]
val_domain = [instance['domain'] for instance in val_data]
test_domain = [instance['domain'] for instance in test_data]

train_id = list(range(len(train_data)))
val_id = list(range(len(val_data)))
test_id = list(range(len(test_data)))


In [73]:
# TFIDF
TFIDF = TfidfVectorizer(max_features=83581) # from EDA

TFIDF.fit(train_text)
train_tfidf = TFIDF.transform(train_text)
val_tfidf = TFIDF.transform(val_text)
test_tfidf = TFIDF.transform(test_text)
future_tfidf = TFIDF.transform(future_text)

In [74]:
train_tfidf

<12600x68213 sparse matrix of type '<class 'numpy.float64'>'
	with 1590593 stored elements in Compressed Sparse Row format>

In [75]:
csr_matrix(train_domain).transpose()


<12600x1 sparse matrix of type '<class 'numpy.int64'>'
	with 12600 stored elements in Compressed Sparse Column format>

In [76]:
train_tfidf = hstack([train_tfidf, csr_matrix(train_domain).transpose()])
val_tfidf = hstack([val_tfidf, csr_matrix(val_domain).transpose()])
test_tfidf = hstack([test_tfidf, csr_matrix(test_domain).transpose()])

In [77]:
# Bag of Words
# BoW = CountVectorizer(max_features=83581) # from EDA

# BoW.fit(train_text)
# train_bow = BoW.transform(train_text)
# val_bow = BoW.transform(val_text)
# test_bow = BoW.transform(test_text)
# future_bow = BoW.transform(future_text)

In [78]:
# save output
save_npz(f"../data/curated/baseline/train_tfidf_{EXPERIMENT}_x.npz", train_tfidf)
save_npz(f"../data/curated/baseline/val_tfidf_{EXPERIMENT}_x.npz", val_tfidf)
save_npz(f"../data/curated/baseline/test_tfidf_{EXPERIMENT}_x.npz", test_tfidf)
save_npz(f'../data/curated/baseline/future_tfidf_{EXPERIMENT}_x.npz', future_tfidf)

# save_npz("../data/curated/baseline/train_bow_oversample_x.npz", train_bow)
# save_npz("../data/curated/baseline/val_bow_x.npz", val_bow)
# save_npz("../data/curated/baseline/test_bow_x.npz", test_bow)
# save_npz('../data/curated/baseline/future_bow_x.npz', future_bow)

with open(f"../data/curated/baseline/train_{EXPERIMENT}_y.pkl", "wb") as f:
    pickle.dump(train_label, f)
with open(f"../data/curated/baseline/val_{EXPERIMENT}_y.pkl", "wb") as f:
    pickle.dump(val_label, f)
with open(f"../data/curated/baseline/test_{EXPERIMENT}_y.pkl", "wb") as f:
    pickle.dump(test_label, f)

---
Feature Importance

In [79]:
def get_feature_importance(model, gap:int=0):

    """
        Gets sequential feature importance of model and returns it in a list format

        Input:
            - model: model which has been fitted - must have feature_importances_ and feature_names_in_ attributes
            - gap: int - how many features to include in each iteration
    """

    ordered_feature_importance = {}

    feature_importance = list(model.feature_importances_)
    feature_importance_list = [(i, feature_importance[i]) for i in range(len(feature_importance))]
    feature_importance_list.sort(key=lambda x: x[1], reverse=True)
    feature_importance_list = [x for x in feature_importance_list if x[1] > 0]

    curr = []
    score = 0
    for i in tqdm(range(len(feature_importance_list))):

        curr.append(feature_importance_list[i][0])
        score += feature_importance_list[i][1]


        if (i+1) % gap == 0:
            ordered_feature_importance[tuple(curr)] = score
    
    if (i+1) % gap != 0: # account for last combo (if it doesn't fit into the gap)
        ordered_feature_importance[tuple(curr)] = score

    return ordered_feature_importance

In [80]:
tfidf_xgb = XGBClassifier(max_depth = 12, 
                            random_state = SEED, 
                            n_estimators = 100,
                            colsample_bytree = 0.75,
                            subsample = 0.75
                            )

tfidf_xgb.fit(train_tfidf, train_label)

xgb_tfidf_feature_importance_ordering = get_feature_importance(tfidf_xgb, 10) 

# export
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
    pickle.dump(xgb_tfidf_feature_importance_ordering, f)

100%|██████████| 955/955 [00:00<00:00, 321861.01it/s]


In [81]:
# bow_xgb = XGBClassifier(max_depth = 12, 
#                             random_state = SEED, 
#                             n_estimators = 100,
#                             colsample_bytree = 0.75,
#                             subsample = 0.75
#                             )

# bow_xgb.fit(train_bow, train_label)

# xgb_bow_feature_importance_ordering = get_feature_importance(bow_xgb, 10) 

# # export
# with open(f'../models/xgb_bow_feature_importance_ordering_{EXPERIMENT}.pickle', 'wb') as f:
#     pickle.dump(xgb_bow_feature_importance_ordering, f)

---
## Inference 1

In [82]:
len(train_tfidf[:, list(feature_importance_ordering.keys())])

NameError: name 'feature_importance_ordering' is not defined

In [None]:
with open(f'../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

NUM_FEATURE = 92

print(accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])), 
      f1_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]]), average='binary'), 
    balanced_accuracy_score(train_label, clf.predict(train_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])))
print(accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])), 
      f1_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]]), average='binary'), 
    balanced_accuracy_score(val_label, clf.predict(val_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])))
print(accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])), 
      f1_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]]), average='binary'),
    balanced_accuracy_score(test_label, clf.predict(test_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])))

future_predict = clf.predict(future_tfidf[:, list(feature_importance_ordering.keys())[NUM_FEATURE]])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv(f'../predictions/jiaochengb_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.csv', index=False)

ValueError: Number of features of the model must match the input. Model n_features_ is 68213 and input n_features is 924

In [None]:
with open(f'../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

NUM_FEATURE = 92

print(accuracy_score(train_label, clf.predict(train_tfidf[:])), 
      f1_score(train_label, clf.predict(train_tfidf[:]), average='binary'), 
    balanced_accuracy_score(train_label, clf.predict(train_tfidf[:])))
print(accuracy_score(val_label, clf.predict(val_tfidf[:])), 
      f1_score(val_label, clf.predict(val_tfidf[:]), average='binary'), 
    balanced_accuracy_score(val_label, clf.predict(val_tfidf[:])))
print(accuracy_score(test_label, clf.predict(test_tfidf[:])), 
      f1_score(test_label, clf.predict(test_tfidf[:]), average='binary'),
    balanced_accuracy_score(test_label, clf.predict(test_tfidf[:])))

future_predict = clf.predict(future_tfidf[:])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv(f'../predictions/jiaochengb_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.csv', index=False)

1.0 1.0 1.0
0.8681481481481481 0.6641509433962265 0.7676190476190476
0.8688888888888889 0.6685393258426966 0.7710714285714286


In [None]:
with open(f'../models/tmp_models/jiaochengb-balaccu_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.pickle', 'rb') as f:
    clf = pickle.load(f)
with open(f'../models/xgb_tfidf_feature_importance_ordering_{EXPERIMENT}.pickle', 'rb') as f:
    feature_importance_ordering = pickle.load(f)

val_pred = clf.predict(val_tfidf[:])
test_pred = clf.predict(test_tfidf[:])

val_dom1_pred = [val_pred[i] for i in range(len(val_pred)) if val_domain[i] == 1]
val_dom2_pred = [val_pred[i] for i in range(len(val_pred)) if val_domain[i] == 2]
val_dom1_true = [val_label[i] for i in range(len(val_label)) if val_domain[i] == 1]
val_dom2_true = [val_label[i] for i in range(len(val_label)) if val_domain[i] == 2]

test_dom1_pred = [test_pred[i] for i in range(len(test_pred)) if test_domain[i] == 1]
test_dom2_pred = [test_pred[i] for i in range(len(test_pred)) if test_domain[i] == 2]
test_dom1_true = [test_label[i] for i in range(len(test_label)) if test_domain[i] == 1]
test_dom2_true = [test_label[i] for i in range(len(test_label)) if test_domain[i] == 2]


print(balanced_accuracy_score(val_dom1_true, val_dom1_pred), balanced_accuracy_score(val_dom2_true, val_dom2_pred), \
      (balanced_accuracy_score(val_dom1_true, val_dom1_pred) + balanced_accuracy_score(val_dom2_true, val_dom2_pred)) / 2, balanced_accuracy_score(val_label, val_pred))
print(balanced_accuracy_score(test_dom1_true, test_dom1_pred), balanced_accuracy_score(test_dom2_true, test_dom2_pred), \
        (balanced_accuracy_score(test_dom1_true, test_dom1_pred) + balanced_accuracy_score(test_dom2_true, test_dom2_pred)) / 2, balanced_accuracy_score(test_label, test_pred))

future_predict = clf.predict(future_tfidf[:])
predictions = pd.DataFrame({'id': range(len(future_predict)), 'class': future_predict})
predictions.to_csv('../predictions/jiaochengb-balaccu_lgbc_xgb_tfidf_Baseline.csv', index=False)

0.7706666666666666 0.6413526570048309 0.7060096618357488 0.7676190476190476
0.7706666666666666 0.6322705314009662 0.7014685990338164 0.7710714285714286


In [None]:
df = pd.read_csv(f"../models/tuning_results/jiaochengb-balaccu_lgbc_xgb_tfidf_{EXPERIMENT}_Baseline.csv")
df['Val accu'].max()

0.864444