# 1. Modeling

Source: https://iq.opengenus.org/naive-bayes-on-tf-idf-vectorized-matrix/

In [11]:
from pprint import pprint

# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

# Classifiers
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn import metrics

from tqdm import tqdm

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Single seed shared by NumPy's global RNG, the train/test split, and every
# estimator in this notebook that accepts `random_state`.
RANDOM_SEED = 42

In [3]:
# Seed NumPy's legacy global random number generator so repeated runs start
# from the same state.
np.random.seed(RANDOM_SEED)

## 1.1 Load data

In [4]:
# Load the dataset; the "text" and "target" columns are the ones consumed by
# the split below.
# NOTE(review): the path is relative to the notebook's working directory —
# confirm it resolves when the notebook is launched from elsewhere.
df_text = pd.read_csv("../data/aapl_us_equities_news_proc_text.csv")

## 1.2 Split data

In [5]:
# Hold out a test partition, stratifying on the label column so both
# partitions keep the same class proportions. The split is seeded for
# repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_text["text"],
    df_text["target"],
    stratify=df_text["target"],
    random_state=RANDOM_SEED,
)

## 1.3 Build pipeline

Source: https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

Source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

Source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

Source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

Source: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [6]:
# Candidate text vectorizers as (id, estimator) pairs. The id is combined with
# the "_vec" suffix to form the pipeline step name, e.g. "count_vec".
VECTORIZERS = [
    ("count", CountVectorizer()),
    ("tfidf", TfidfVectorizer()),
]

# Candidate classifiers as (id, estimator) pairs. The id is combined with the
# "_clf" suffix to form the pipeline step name, e.g. "nb_clf". Every estimator
# that is given a `random_state` shares RANDOM_SEED.
# NOTE(review): the recorded run of the search emitted an "iterations reached
# limit" warning that points at the logistic-regression docs; consider raising
# `max_iter` for "lr" if that warning matters for the comparison.
CLASSIFIERS = [
    ("ada", AdaBoostClassifier(random_state=RANDOM_SEED)),
    ("knn", KNeighborsClassifier()),
    ("lr", LogisticRegression(random_state=RANDOM_SEED)),
    ("mlp", MLPClassifier(random_state=RANDOM_SEED)),
    ("nb", MultinomialNB()),
    ("rf", RandomForestClassifier(random_state=RANDOM_SEED)),
    ("sgd", SGDClassifier(random_state=RANDOM_SEED)),
    ("svc", SVC(random_state=RANDOM_SEED)),
]

# Grid-search space, keyed as "<step name>__<parameter>". Each list currently
# holds a single value (the one marked as the default), so the search evaluates
# exactly one candidate per pipeline; add values to a list to widen the search.
PARAMETERS = {
    # Count
    "count_vec__ngram_range": [(1, 1)], # Default: (1, 1)
    "count_vec__min_df": [1],           # Default: 1
    "count_vec__max_df": [1.0],         # Default: 1.0
    "count_vec__max_features": [None],  # Default: None

    # TFIDF
    "tfidf_vec__ngram_range": [(1, 1)], # Default: (1, 1)
    "tfidf_vec__min_df": [1],           # Default: 1
    "tfidf_vec__max_df": [1.0],         # Default: 1.0
    "tfidf_vec__max_features": [None],  # Default: None
    "tfidf_vec__norm": ["l2"],          # Default: l2
    "tfidf_vec__use_idf": [True],       # Default: True
    "tfidf_vec__smooth_idf": [True],    # Default: True
    "tfidf_vec__sublinear_tf": [False], # Default: False
}

In [7]:
%%time
results = {}

for vec_id, vec in VECTORIZERS:
    for clf_id, clf in tqdm(CLASSIFIERS):
        # Create pipeline
        pipeline = Pipeline([
            (f"{vec_id}_vec", vec),
            (f"{clf_id}_clf", clf),
        ])

        # Setup search
        grid_parameters = {key: value for key, value in PARAMETERS.items() if vec_id in key or clf_id in key}
        grid_search = GridSearchCV(
            pipeline,
            grid_parameters,
            scoring="f1",
            return_train_score=True,
            cv=5,
            n_jobs=-1,
        )

        # Train model
        grid_search.fit(X_train, y_train)

        # Evaluate model
        test_score = grid_search.score(X_test, y_test)
        
        # Store results
        idx = np.argmax(grid_search.cv_results_["mean_test_score"])
        
        mean_train_score = grid_search.cv_results_["mean_train_score"][idx]
        std_train_score = grid_search.cv_results_["std_train_score"][idx]
        mean_val_score = grid_search.cv_results_["mean_test_score"][idx]
        std_val_score = grid_search.cv_results_["std_test_score"][idx]
        
        results[f"{vec_id}_vec", f"{clf_id}_clf"] = {
            "mean_train_score": mean_train_score,
            "std_train_score": std_train_score,
            "mean_val_score": mean_val_score,
            "std_val_score": std_val_score,
            "test_score": test_score,
            "params": grid_search.best_params_,
        }

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [02:19<00:00, 17.44s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [03:20<00:00, 25.06s/it]

CPU times: user 5min, sys: 14min 21s, total: 19min 22s
Wall time: 5min 39s





In [16]:
# Pick the winning (vectorizer, classifier) pair and print its scores.
#
# The winner is chosen by mean cross-validated (validation) score, not by the
# test score: selecting on the held-out split would turn it into a second
# validation set and make the reported test score optimistically biased. The
# test score is only reported for the model chosen this way.
#
# `best_score` holds the selection score (mean validation F1) of the winner.
best_score = float("-inf")
best_vec_id = None
best_clf_id = None
best_values = None

for (vec_id, clf_id), values in results.items():
    score = values["mean_val_score"]
    # Strict ">" keeps the first entry on ties and skips NaN scores.
    if score > best_score:
        best_score = score
        best_vec_id = vec_id
        best_clf_id = clf_id
        best_values = values

if best_values is None:
    # Fail with a clear message instead of a TypeError on `None[...]` below.
    raise ValueError(
        "No model could be selected: `results` is empty or every validation score is NaN."
    )

print("-" * 110 + "\n")
print(f"Best Vectorizer: {best_vec_id}")
print(f"Best Classifier: {best_clf_id}" + "\n")
print("Best Params:")
pprint(best_values["params"])
print("")
print(f"Mean Train Score: {best_values['mean_train_score']}")
print(f"Std Train Score: {best_values['std_train_score']}")
print(f"Mean Validation Score: {best_values['mean_val_score']}")
print(f"Std Validation Score: {best_values['std_val_score']}" + "\n")
print(f"Test Score: {best_values['test_score']}")
print("\n" + "-" * 110)

--------------------------------------------------------------------------------------------------------------

Best Vectorizer: tfidf_vec
Best Classifier: nb_clf

Best Params:
{'tfidf_vec__max_df': 1.0,
 'tfidf_vec__max_features': None,
 'tfidf_vec__min_df': 1,
 'tfidf_vec__ngram_range': (1, 1),
 'tfidf_vec__norm': 'l2',
 'tfidf_vec__smooth_idf': True,
 'tfidf_vec__sublinear_tf': False,
 'tfidf_vec__use_idf': True}

Mean Train Score: 0.7159525841102482
Std Train Score: 0.0036810131457144166
Mean Validation Score: 0.6861708884303349
Std Validation Score: 0.002898597522594159

Test Score: 0.6869300911854103

--------------------------------------------------------------------------------------------------------------
