# 1. Modeling

Bron: https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/

In [1]:
from datetime import datetime
from functools import reduce
from pprint import pprint
import json
import os

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn import metrics

from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# --- Notebook configuration ---------------------------------------------------

# Seed shared by NumPy's global RNG, the train/test split and every estimator
# in this notebook that is constructed with a random_state.
RANDOM_SEED = 42

# Toggle for writing JSON result snapshots while the grid search runs.
SAVE_MODEL_CHECKPOINTS = True

# Folder (relative to the notebook) that receives those snapshots.
MODEL_CHECKPOINTS_FOLDER = "../checkpoints/"

In [3]:
# Seed NumPy's legacy global random generator once, before any other cell runs.
np.random.seed(seed=RANDOM_SEED)

## 1.1 Load data

In [4]:
# Read the pre-processed news texts; later cells use its "text" and "target" columns.
df_text = pd.read_csv(filepath_or_buffer="../data/aapl_us_equities_news_proc_text.csv")

## 1.2 Split data

In [5]:
# Hold-out split of the documents and their labels. Stratifying on the label
# keeps the class proportions the same in both partitions; no test_size is
# passed, so scikit-learn's default split ratio applies.
texts, labels = df_text["text"], df_text["target"]

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, stratify=labels, random_state=RANDOM_SEED
)

## 1.3 Build pipeline

Bron: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

Bron: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

Bron: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

Bron: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

Bron: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [6]:
# Candidate text vectorizers as (pipeline step name, estimator) pairs.
# The step name is also the prefix of the matching keys in PARAMETERS
# (e.g. "count_vec__min_df"), so renaming a step requires renaming its keys.
VECTORIZERS = [
    ("count_vec", CountVectorizer()),
    ("tfidf_vec", TfidfVectorizer()),
]

# Candidate classifiers as (pipeline step name, estimator) pairs. The training
# loop combines every classifier with every vectorizer above. Estimators that
# are given a random_state here all share RANDOM_SEED.
CLASSIFIERS = [
    ("ada_clf", AdaBoostClassifier(random_state=RANDOM_SEED)),
    ("knn_clf", KNeighborsClassifier()),
    ("lr_clf", LogisticRegression(random_state=RANDOM_SEED)),
    ("nb_clf", MultinomialNB()),
    ("rf_clf", RandomForestClassifier(random_state=RANDOM_SEED)),
    ("sgd_clf", SGDClassifier(random_state=RANDOM_SEED)),
    ("svc_clf", SVC(random_state=RANDOM_SEED)),
]

# Hyper-parameter grid, keyed as "<pipeline step name>__<parameter name>".
#
# min_df / max_df follow scikit-learn's convention: an int is an absolute
# document count, a float is a proportion of the documents. The library default
# for min_df is the *int* 1 ("term occurs in at least one document"). The float
# 1.0 means "term occurs in 100% of the documents", which prunes almost the
# whole vocabulary and makes every combination with max_df < 1.0 fail with
# "max_df corresponds to < documents than min_df". The baseline entry therefore
# has to be the int 1.
PARAMETERS = {
    # Count
    "count_vec__min_df": [1, 0.05, 0.10, 0.15, 0.20],   # Default: 1 (int, absolute count)
    "count_vec__max_df": [1.0, 0.85, 0.65, 0.50],       # Default: 1.0 (float, proportion)

    # TFIDF
    "tfidf_vec__min_df": [1, 0.05, 0.10, 0.15, 0.20],   # Default: 1 (int, absolute count)
    "tfidf_vec__max_df": [1.0, 0.85, 0.65, 0.50],       # Default: 1.0 (float, proportion)
    "tfidf_vec__norm": ["l2", "l1", None],              # Default: l2
    "tfidf_vec__use_idf": [True, False],                # Default: True
    "tfidf_vec__smooth_idf": [True, False],             # Default: True
    "tfidf_vec__sublinear_tf": [False, True],           # Default: False
}

In [None]:
%%time
# Grid-search every (vectorizer, classifier) combination.
#
# For each pair this cell:
#   1. builds a two-step Pipeline,
#   2. selects the PARAMETERS entries that belong to those two steps,
#   3. runs a cross-validated grid search scored on F1 using X_train / y_train,
#   4. scores the refitted best pipeline once on X_test / y_test,
#   5. stores the scores in `results` and the fitted pipeline in `models`,
#      both keyed by (vec_id, clf_id), and optionally writes a JSON snapshot.

# "%Y" is the portable four-digit year; "%-Y" is a platform-specific extension
# that is rejected on some platforms (e.g. Windows).
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
n_splits = 5  # cross-validation folds, identical for every search
results = {}
models = {}

if SAVE_MODEL_CHECKPOINTS:
    # Create the folder up front so a long search cannot die on its first write.
    os.makedirs(MODEL_CHECKPOINTS_FOLDER, exist_ok=True)

for vec_id, vec in VECTORIZERS:
    clf_pb = tqdm(CLASSIFIERS)

    for clf_id, clf in clf_pb:
        # Create pipeline
        pipeline = Pipeline([(vec_id, vec), (clf_id, clf)])

        # Select parameters: match on the "<step>__" prefix rather than on a
        # substring, so one step name can never pick up another step's keys.
        step_prefixes = (f"{vec_id}__", f"{clf_id}__")
        parameters = {k: v for k, v in PARAMETERS.items() if k.startswith(step_prefixes)}
        n_params = reduce(lambda x, y: x * len(y), parameters.values(), 1)

        # Setup search
        grid_search = GridSearchCV(
            pipeline,
            parameters,
            scoring="f1",
            return_train_score=True,
            cv=n_splits,
            n_jobs=-1,
            verbose=0,
        )

        # Log info: np = parameter combinations, ns = folds, tf = total fits
        clf_pb.set_description(
            f"({vec_id}, {clf_id}): np={n_params}, ns={n_splits}, tf={n_params * n_splits}",
        )
        clf_pb.refresh()

        # Train model
        grid_search.fit(X_train, y_train)

        # Evaluate model. With the default refit=True, best_estimator_ has
        # already been refitted on the full training set, so no second fit is
        # needed. A separate name keeps the loop variable `clf` intact.
        best_pipeline = grid_search.best_estimator_

        y_test_pred = best_pipeline.predict(X_test)

        test_score = f1_score(y_test, y_test_pred)

        # Create results. best_index_ is the row that best_params_ refers to;
        # np.argmax over mean_test_score would select a NaN row whenever a
        # parameter combination failed to fit.
        idx = grid_search.best_index_

        result = {
            "mean_train_score": grid_search.cv_results_["mean_train_score"][idx],
            "std_train_score": grid_search.cv_results_["std_train_score"][idx],
            "mean_val_score": grid_search.cv_results_["mean_test_score"][idx],
            "std_val_score": grid_search.cv_results_["std_test_score"][idx],
            "test_score": test_score,
            "params": grid_search.best_params_,
        }

        # Store result
        results[vec_id, clf_id] = result
        models[vec_id, clf_id] = best_pipeline

        if SAVE_MODEL_CHECKPOINTS:
            # Each snapshot holds all results gathered so far; tuple keys are
            # flattened to "<vec_id>_<clf_id>" because JSON keys must be strings.
            checkpoint_path = os.path.join(
                MODEL_CHECKPOINTS_FOLDER, f"{timestamp}_{vec_id}_{clf_id}.json"
            )
            with open(checkpoint_path, "w") as handle:
                json.dump({f"{k1}_{k2}": v for (k1, k2), v in results.items()}, handle)

(count_vec, ada_clf): np=24, ns=5, tf=120:   0%|                                                                                                      | 0/7 [00:00<?, ?it/s]

In [None]:
# Pick the winning pipeline and print its scores.
#
# The winner is chosen on the cross-validated validation F1, not on the test
# F1. Choosing on the test score would turn the test set into a selection set
# and make the reported "Test Score" optimistically biased. The test score of
# the selected pipeline is still reported as the final, untouched estimate.
best_score = -1.00
best_vec_id = None
best_clf_id = None
best_values = None
best_model = None

for (vec_id, clf_id), values in results.items():
    score = values["mean_val_score"]
    # A NaN score (all fits failed) never compares greater, so it is skipped.
    if score > best_score:
        best_score = score
        best_vec_id = vec_id
        best_clf_id = clf_id
        best_values = values
        best_model = models[vec_id, clf_id]

if best_values is None:
    # Fail with a clear message instead of a TypeError on None below.
    raise RuntimeError(
        "No usable results found: run the grid-search cell first and check for failed fits."
    )

print("-" * 100 + "\n")
print(f"Best Vectorizer: {best_vec_id}")
print(f"Best Classifier: {best_clf_id}" + "\n")
print("Best Params:")
pprint(best_values["params"])
print("")
print(f"Mean Train Score: {round(best_values['mean_train_score'], 4)}")
print(f"Std Train Score: {round(best_values['std_train_score'], 4)}")
print(f"Mean Validation Score: {round(best_values['mean_val_score'], 4)}")
print(f"Std Validation Score: {round(best_values['std_val_score'], 4)}" + "\n")
print(f"Test Score: {round(best_values['test_score'], 4)}")
print("\n" + "-" * 100)