In [1]:
# Utility 
import os
import pickle
import toml

# Data manipulation
import pandas as pd
import numpy as np

# Machine learning
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

#Additional
from features.preprocessing import dataset_preprocessor, df_preprocessor
from features.fake_news_classifier import TFIDFTransform

In [2]:
# All tunable settings live in a single TOML file one level above the notebook.
CONFIG_PATH = "../config.toml"
config = toml.load(CONFIG_PATH)

# Pull out the sections used below; a missing section raises KeyError here,
# before any data is loaded.
global_cfg, headvec_cfg, bodyvec_cfg, rel_params, cls_params = (
    config[section]
    for section in (
        "global",
        "headline_vectorizer",
        "body_vectorizer",
        "rel_params",
        "cls_params",
    )
)

In [3]:
# Load the two raw CSV inputs: article bodies and headline/stance records.
bodies, stances = map(
    pd.read_csv,
    ("../data/bodies.csv", "../data/stances.csv"),
)

In [4]:
# Combine bodies and stances into one frame using the project preprocessor.
df = df_preprocessor(bodies, stances)

# Inputs are the two text columns; targets are the two label columns.
feature_columns = ["article_body", "headline"]
target_columns = ["stance", "relation"]

X = df[feature_columns]
y = df[target_columns]

In [5]:
# A single encoder instance is handed to both preprocessing calls below.
# NOTE(review): if dataset_preprocessor re-fits the encoder on every call, the
# train and test label mappings could diverge — confirm it is fitted only once.
label_encoder = LabelEncoder()

# Hold out a test split; size and seed come from the [global] config section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=global_cfg["test_size"],
    random_state=global_cfg["random_state"],
)

# Each split is turned into two datasets (suffixes `_rel` and `_cls`) plus the
# indices associated with the `_cls` subset.
(
    X_train_rel,
    X_train_cls,
    y_train_rel,
    y_train_cls,
    train_cls_indices,
) = dataset_preprocessor(X_train, y_train, label_encoder)

(
    X_test_rel,
    X_test_cls,
    y_test_rel,
    y_test_cls,
    test_cls_indices,
) = dataset_preprocessor(X_test, y_test, label_encoder)

In [6]:
def _make_vectorizer(cfg):
    """Build a TfidfVectorizer from one vectorizer section of the config.

    `cfg` must provide the keys "analyzer", "stop_words" and "lowercase".
    The token pattern is fixed to one-or-more word characters for both
    vectorizers.
    """
    return TfidfVectorizer(
        analyzer=cfg["analyzer"],
        stop_words=cfg["stop_words"],
        lowercase=cfg["lowercase"],
        token_pattern=r'\w+',
    )


# Headlines and bodies get separate vectorizers, each with its own settings.
headline_vectorizer = _make_vectorizer(headvec_cfg)
body_vectorizer = _make_vectorizer(bodyvec_cfg)

# Project transform that wraps both vectorizers behind one fit/transform API.
tfidf_transform = TFIDFTransform(
    headline_vectorizer=headline_vectorizer,
    body_vectorizer=body_vectorizer,
)

In [7]:
# `_rel` training data: the transform is fitted here, so this must run first.
X_train_rel_vec = tfidf_transform.fit_transform(X_train_rel)
D_train_rel = xgb.DMatrix(X_train_rel_vec, label=y_train_rel)

# `_cls` training data: reuse the already-fitted transform (no re-fit).
X_train_cls_vec = tfidf_transform.transform(X_train_cls)
D_train_cls = xgb.DMatrix(X_train_cls_vec, label=y_train_cls)

In [8]:
# Make sure the output directory exists BEFORE anything is written into it.
# Previously the directory was created only after save_model() had been called,
# so on a fresh checkout the cell failed after training had already finished.
MODELS_DIR = "../models/"
os.makedirs(MODELS_DIR, exist_ok=True)

# Train one booster per task; the number of boosting rounds is shared and
# comes from the [global] config section.
rel_classifier = xgb.train(rel_params, D_train_rel, num_boost_round=global_cfg["steps"])
cls_classifier = xgb.train(cls_params, D_train_cls, num_boost_round=global_cfg["steps"])

# Persist both boosters as JSON.
rel_classifier.save_model(os.path.join(MODELS_DIR, "rel_classifier.json"))
cls_classifier.save_model(os.path.join(MODELS_DIR, "cls_classifier.json"))

# Persist the fitted TF-IDF transform alongside the models so the same
# vectorization can be reapplied later.
# NOTE: pickle files execute arbitrary code on load — only unpickle this
# artifact from a trusted location.
with open(os.path.join(MODELS_DIR, "transform.pkl"), "wb") as f:
    pickle.dump(tfidf_transform, f)