In [1]:
# Utility 
import os
import pickle
import toml

# Data manipulation
import pandas as pd
import numpy as np

# Machine learning
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

#Additional
from features.preprocessing import dataset_preprocessor, df_preprocessor
from features.fake_news_classifier import TFIDFTransform

In [2]:
CONFIG_PATH = "../config.toml"
config = toml.load(CONFIG_PATH)

global_cfg = config["global"]
headvec_cfg = config["headline_vectorizer"]
bodyvec_cfg = config["body_vectorizer"]
rel_params = config["rel_params"]
cls_params = config["cls_params"]

In [3]:
bodies = pd.read_csv("../data/bodies.csv", )
stances = pd.read_csv("../data/stances.csv")

In [4]:
df = df_preprocessor(bodies, stances)
X, y = df[["article_body", "headline"]], df[["stance", "relation"]]

In [5]:
label_encoder = LabelEncoder()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = global_cfg["test_size"], random_state=global_cfg["random_state"])
X_train_rel, X_train_cls, y_train_rel, y_train_cls, train_cls_indices = dataset_preprocessor(X_train, y_train, label_encoder)
X_test_rel, X_test_cls, y_test_rel, y_test_cls, test_cls_indices = dataset_preprocessor(X_test, y_test, label_encoder)

In [6]:
headline_vectorizer = TfidfVectorizer(analyzer=headvec_cfg["analyzer"], stop_words=headvec_cfg["stop_words"], 
                                      lowercase=headvec_cfg["lowercase"], token_pattern=r'\w+')
body_vectorizer = TfidfVectorizer(analyzer=bodyvec_cfg["analyzer"], stop_words=bodyvec_cfg["stop_words"], 
                                  lowercase=bodyvec_cfg["lowercase"], token_pattern=r'\w+')
tfidf_transform = TFIDFTransform(headline_vectorizer=headline_vectorizer, body_vectorizer=body_vectorizer)

In [7]:
X_train_rel_vec = tfidf_transform.fit_transform(X_train_rel)
X_train_cls_vec = tfidf_transform.transform(X_train_cls)

D_train_rel = xgb.DMatrix(X_train_rel_vec, label=y_train_rel)
D_train_cls = xgb.DMatrix(X_train_cls_vec, label=y_train_cls)

In [8]:
RANDOM_PARAM_MULTIPLIER = 1 # Multiply base number of result configs

# All param lists should be equal length
# Base number of param configs is equal to the length of param lists
depth = [6, 12, 20, 24]
l_rate = [0.05, 0.1, 0.2, 0.3]
colsample = [0.7, 0.8, 0.9, 1.0]
subsample = [0.7, 0.8, 0.9, 1.0]

array_rel_params = []
array_cls_params = []
for i in range(len(depth) * RANDOM_PARAM_MULTIPLIER):
    np.random.seed(i)
    array_rel_params.append(
        {
            "eta": np.random.choice(l_rate),
            "max_depth": np.random.choice(depth),
            "colsample_bytree": np.random.choice(colsample),
            "subsample": np.random.choice(subsample)
        }
    )

for i in range(len(depth) * RANDOM_PARAM_MULTIPLIER):
    np.random.seed(i+1)
    array_cls_params.append(
        {
            "eta": np.random.choice(l_rate),
            "max_depth": np.random.choice(depth),
            "colsample_bytree": np.random.choice(colsample),
            "subsample": np.random.choice(subsample),
            "num_class": 3
        }
    )

In [9]:
array_rel_params

[{'eta': 0.05, 'max_depth': 24, 'colsample_bytree': 0.8, 'subsample': 0.7},
 {'eta': 0.1, 'max_depth': 24, 'colsample_bytree': 0.7, 'subsample': 0.7},
 {'eta': 0.05, 'max_depth': 24, 'colsample_bytree': 0.8, 'subsample': 0.7},
 {'eta': 0.2, 'max_depth': 6, 'colsample_bytree': 0.8, 'subsample': 1.0}]

In [11]:
array_cls_params

[{'eta': 0.1,
  'max_depth': 24,
  'colsample_bytree': 0.7,
  'subsample': 0.7,
  'num_class': 3},
 {'eta': 0.05,
  'max_depth': 24,
  'colsample_bytree': 0.8,
  'subsample': 0.7,
  'num_class': 3},
 {'eta': 0.2,
  'max_depth': 6,
  'colsample_bytree': 0.8,
  'subsample': 1.0,
  'num_class': 3},
 {'eta': 0.2,
  'max_depth': 20,
  'colsample_bytree': 1.0,
  'subsample': 0.8,
  'num_class': 3}]

In [10]:
# Simple RandomSearchCV
if global_cfg["train_multiple_models"]:
    best_rel_test_mean_rmse = 9999.0
    best_cls_test_mean_mlogloss = 9999.0

    best_rel_params = None
    best_cls_params = None

    for rel_params in array_rel_params:
        rel_classifier = xgb.cv(rel_params, D_train_rel, num_boost_round=global_cfg["steps"],
                                nfold=5, metrics="rmse", seed=1, early_stopping_rounds=5)

        min_rel_rmse = rel_classifier["test-rmse-mean"].min()
        
        if min_rel_rmse < best_rel_test_mean_rmse:
            best_rel_test_mean_rmse = min_rel_rmse
            best_rel_params = rel_params

        del rel_classifier

    for cls_params in array_cls_params:
        cls_classifier = xgb.cv(cls_params, D_train_cls, num_boost_round=global_cfg["steps"],
                                nfold=5, metrics="mlogloss", seed=1, early_stopping_rounds=5)

        min_cls_mlogloss = cls_classifier["test-mlogloss-mean"].min()
        if min_cls_mlogloss < best_cls_test_mean_mlogloss:
            best_cls_test_mean_mlogloss = min_cls_mlogloss
            best_cls_params = cls_params

        del cls_classifier


    rel_classifier = xgb.train(best_rel_params, D_train_rel, global_cfg["steps"])
    cls_classifier = xgb.train(best_cls_params, D_train_cls, global_cfg["steps"])

    rel_classifier.save_model("../models/best_rel_classifier.json")
    cls_classifier.save_model("../models/best_cls_classifier.json")
        
    os.makedirs("../models/", exist_ok=True)

    with open("../models/transform.pkl", "wb") as f:
        pickle.dump(tfidf_transform, f)

else:
    rel_classifier = xgb.train(rel_params, D_train_rel, global_cfg["steps"])
    cls_classifier = xgb.train(cls_params, D_train_cls, global_cfg["steps"])

    rel_classifier.save_model("../models/rel_classifier.json")
    cls_classifier.save_model("../models/cls_classifier.json")
        
    os.makedirs("../models/", exist_ok=True)

    with open("../models/transform.pkl", "wb") as f:
        pickle.dump(tfidf_transform, f)

del rel_classifier
del cls_classifier