In [1]:
import sys

# Extend the import search path with the relative "../Handlers" directory so the
# project-local import in the next cell (traintest) can be resolved.
# The path is relative, so it depends on the kernel's working directory.
sys.path.append("../Handlers")

In [2]:
from traintest import ClassificationModel, models, add_to_json_array
from functools import partial
from concurrent.futures import ThreadPoolExecutor

# Notebook-wide accumulator: t_and_e appends one evaluation result per trained
# model, and the final cell passes the collected list to add_to_json_array.
metric_results = []

def t_and_e(model, X, y, dataset_name, mode, n_jobs, cv):
    """Tune, evaluate and record a single estimator on one dataset.

    Wraps ``model`` in a ``ClassificationModel``, runs its parameter search,
    asks it to install the best estimator, evaluates it and appends the
    evaluation result to the module-level ``metric_results`` list.

    Args:
        model: Estimator instance handed to ``ClassificationModel``.
        X: Features forwarded to the parameter search.
        y: Labels forwarded to the parameter search.
        dataset_name: Dataset label handed to ``ClassificationModel``.
        mode: Search mode forwarded as ``mode=``.
        n_jobs: Forwarded as ``n_jobs=`` to the parameter search.
        cv: Forwarded as ``cv=`` to the parameter search.

    Returns:
        None. Progress and the evaluation result are printed.
    """
    model_name = model.__class__.__name__
    print(f"Begin {model_name}")

    wrapper = ClassificationModel(model, dataset_name)
    wrapper.train_with_finding_best_parameters(
        X,
        y,
        save_model=True,
        mode=mode,
        n_jobs=n_jobs,
        cv=cv,
    )
    wrapper.get_best_estimator(put_right_in_the_model=True)

    print(f"{model_name} classification report")
    report = wrapper.evaluate(detailed=True)
    # Record the result for the export step at the end of the notebook.
    metric_results.append(report)
    print(report)
    print("\n")

def train_and_evaluate_model(X, y, dataset_name, m=None, mode="grid"):
    """Run ``t_and_e`` for one estimator, or for every entry of ``models``.

    Args:
        X: Features forwarded unchanged to ``t_and_e``.
        y: Labels forwarded unchanged to ``t_and_e``.
        dataset_name: Label printed in the header and forwarded to ``t_and_e``.
        m: A single estimator to run. When ``None`` (the default) every entry
            of ``models`` is run through a pool of 3 worker threads.
        mode: Search mode forwarded to ``t_and_e``; defaults to ``"grid"``.

    Returns:
        None. Results are recorded by ``t_and_e``. In the multi-model path a
        model whose run raised is reported with a printed message instead of
        being dropped silently; the remaining models still run.
    """
    print(f"{dataset_name} classification report")
    print("=========================================")

    train_eval = partial(
        t_and_e,
        X=X,
        y=y,
        dataset_name=dataset_name,
        mode=mode,
        n_jobs=6,
        cv=4,
    )

    if m is not None:
        # t_and_e takes the model first; reuse the partial so the single-model
        # path gets exactly the same keyword arguments as the pooled path.
        train_eval(m)
        return

    with ThreadPoolExecutor(max_workers=3) as executor:
        submitted = [(model, executor.submit(train_eval, model)) for model in models]

    # Leaving the ``with`` block waits for every task, so each future is done
    # here. Inspect them explicitly: an exception raised in a worker thread is
    # otherwise stored on the future and never shown.
    for model, future in submitted:
        error = future.exception()
        if error is not None:
            print(f"{model.__class__.__name__} failed: {error!r}")

In [3]:
# NOTE(review): this name is not referenced by any other cell visible in the
# notebook — confirm whether it is still needed.
chosen_enron = "enron1"

## Stemming + CountVectorizer

In [4]:
import joblib

# Load the six preprocessed bundles for the "stemmed_countvec" variant, one per
# Enron subset. Each bundle is indexed with "features" and "labels" in the next cell.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted preprocessing run.
enron1_stemmed_countvec = joblib.load("./preprocess/enron1_stemmed_countvec.pkl")
enron2_stemmed_countvec = joblib.load("./preprocess/enron2_stemmed_countvec.pkl")
enron3_stemmed_countvec = joblib.load("./preprocess/enron3_stemmed_countvec.pkl")
enron4_stemmed_countvec = joblib.load("./preprocess/enron4_stemmed_countvec.pkl")
enron5_stemmed_countvec = joblib.load("./preprocess/enron5_stemmed_countvec.pkl")
enron6_stemmed_countvec = joblib.load("./preprocess/enron6_stemmed_countvec.pkl")

In [5]:
# Pull the "features" and "labels" entries out of each loaded bundle.
enron1_stemmed_countvec_X = enron1_stemmed_countvec["features"]
enron1_y = enron1_stemmed_countvec["labels"]

enron2_stemmed_countvec_X = enron2_stemmed_countvec["features"]
enron2_y = enron2_stemmed_countvec["labels"]

enron3_stemmed_countvec_X = enron3_stemmed_countvec["features"]
enron3_y = enron3_stemmed_countvec["labels"]

enron4_stemmed_countvec_X = enron4_stemmed_countvec["features"]
enron4_y = enron4_stemmed_countvec["labels"]

enron5_stemmed_countvec_X = enron5_stemmed_countvec["features"]
enron5_y = enron5_stemmed_countvec["labels"]

enron6_stemmed_countvec_X = enron6_stemmed_countvec["features"]
enron6_y = enron6_stemmed_countvec["labels"]

In [6]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron1 stemmed_countvec.
train_and_evaluate_model(enron1_stemmed_countvec_X, enron1_y, "enron1_stemmed_countvec")

# Clear the notebook-level names once the run is done.
# NOTE(review): the enron1_stemmed_countvec dict loaded earlier still references
# the same objects, so clearing these names alone does not release them.
enron1_stemmed_countvec_X = None
enron1_y = None

enron1_stemmed_countvec classification report
Begin SVC
Begin MultinomialNB
Begin BernoulliNB
The best estimator for Bernoulli Naive Bayes of dataset enron1_stemmed_countvec is: 
The best estimator for Multinomial Naive Bayes of dataset enron1_stemmed_countvec is: 
BernoulliNB(alpha=0.1, fit_prior=False)
BernoulliNB classification report
MultinomialNB(alpha=0.1)
MultinomialNB classification report
{'dataset': 'enron1_stemmed_countvec', 'model': 'Multinomial Naive Bayes', 'type': 'grid_search', 'metrics': {'accuracy': 0.9729468599033816, 'weighted_precision': 0.9748277259219852, 'wighted_recall': 0.9729468599033816, 'weighted_f1': 0.9732713113390702, 'macro_precision': 0.9566852057842047, 'macro_recall': 0.9791470212030959, 'macro_f1': 0.9670121023922487, 'roc_auc': 0.979147021203096}, 'confusion_matrix': array([[723,  26],
       [  2, 284]], dtype=int64), 'best_parameters': {'alpha': 0.1, 'fit_prior': True}, 'best_score': 0.9649503826423345}


Begin RandomForestClassifier
{'dataset': 

60 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------------

The best estimator for Logistic Regression of dataset enron1_stemmed_countvec is: 
LogisticRegression(C=1, solver='liblinear')
LogisticRegression classification report
{'dataset': 'enron1_stemmed_countvec', 'model': 'Logistic Regression', 'type': 'grid_search', 'metrics': {'accuracy': 0.970048309178744, 'weighted_precision': 0.9709977697232596, 'wighted_recall': 0.970048309178744, 'weighted_f1': 0.9702795217081589, 'macro_precision': 0.9563444286529008, 'macro_recall': 0.9706601809405547, 'macro_f1': 0.9631427616335504, 'roc_auc': 0.9706601809405547}, 'confusion_matrix': array([[726,  23],
       [  8, 278]], dtype=int64), 'best_parameters': {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}, 'best_score': 0.9746175912688401}


Begin SGDClassifier
The best estimator for K-nearest Neighbors of dataset enron1_stemmed_countvec is: 
KNeighborsClassifier(weights='distance')
KNeighborsClassifier classification report
{'dataset': 'enron1_stemmed_countvec', 'model': 'K-nearest Neighbors', 'type'

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron2 stemmed_countvec.
train_and_evaluate_model(enron2_stemmed_countvec_X, enron2_y, "enron2_stemmed_countvec")
# Clear the notebook-level names; the enron2_stemmed_countvec dict still references the data.
enron2_stemmed_countvec_X = None
enron2_y = None

enron2_stemmed_countvec classification report
Begin SVC
Begin MultinomialNB
Begin BernoulliNB
The best estimator for Multinomial Naive Bayes of dataset enron2_stemmed_countvec is: 
MultinomialNB(alpha=0.1)
MultinomialNB classification report
{'dataset': 'enron2_stemmed_countvec', 'model': 'Multinomial Naive Bayes', 'type': 'grid_search', 'metrics': {'accuracy': 0.9906143344709898, 'weighted_precision': 0.9907073881518728, 'wighted_recall': 0.9906143344709898, 'weighted_f1': 0.9906380915884042, 'macro_precision': 0.9855894357033006, 'macro_recall': 0.9905310881312763, 'macro_f1': 0.9880248086728178, 'roc_auc': 0.9905310881312762}, 'confusion_matrix': array([[853,   8],
       [  3, 308]], dtype=int64), 'best_parameters': {'alpha': 0.1, 'fit_prior': True}, 'best_score': 0.9878329175203947}


Begin RandomForestClassifier
The best estimator for Bernoulli Naive Bayes of dataset enron2_stemmed_countvec is: 
BernoulliNB()
BernoulliNB classification report
{'dataset': 'enron2_stemmed_countvec'

60 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------------

The best estimator for Logistic Regression of dataset enron2_stemmed_countvec is: 
LogisticRegression(C=0.1, solver='liblinear')
LogisticRegression classification report
{'dataset': 'enron2_stemmed_countvec', 'model': 'Logistic Regression', 'type': 'grid_search', 'metrics': {'accuracy': 0.9880546075085325, 'weighted_precision': 0.9880863041626414, 'wighted_recall': 0.9880546075085325, 'weighted_f1': 0.9880668057700956, 'macro_precision': 0.9837280142226454, 'macro_recall': 0.9857079370058743, 'macro_f1': 0.9847122838401908, 'roc_auc': 0.9857079370058744}, 'confusion_matrix': array([[853,   8],
       [  6, 305]], dtype=int64), 'best_parameters': {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, 'best_score': 0.9891144204510015}


Begin SGDClassifier
The best estimator for Random Forest of dataset enron2_stemmed_countvec is: 
RandomForestClassifier(min_samples_split=10)
RandomForestClassifier classification report
The best estimator for K-nearest Neighbors of dataset enron2_stemmed_co

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron3 stemmed_countvec.
train_and_evaluate_model(enron3_stemmed_countvec_X, enron3_y, "enron3_stemmed_countvec")
# Clear the notebook-level names; the enron3_stemmed_countvec dict still references the data.
enron3_stemmed_countvec_X = None
enron3_y = None

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron4 stemmed_countvec.
train_and_evaluate_model(enron4_stemmed_countvec_X, enron4_y, "enron4_stemmed_countvec")
# Clear the notebook-level names; the enron4_stemmed_countvec dict still references the data.
enron4_stemmed_countvec_X = None
enron4_y = None

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron5 stemmed_countvec.
train_and_evaluate_model(enron5_stemmed_countvec_X, enron5_y, "enron5_stemmed_countvec")
# Clear the notebook-level names; the enron5_stemmed_countvec dict still references the data.
enron5_stemmed_countvec_X = None
enron5_y = None

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron6 stemmed_countvec.
train_and_evaluate_model(enron6_stemmed_countvec_X, enron6_y, "enron6_stemmed_countvec")
# Clear the notebook-level names; the enron6_stemmed_countvec dict still references the data.
enron6_stemmed_countvec_X = None
enron6_y = None

## Stemming + TF-IDF Vectorizer

In [None]:
# Load the six preprocessed bundles for the "stemmed_tfidf" variant, one per Enron subset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted preprocessing run.
enron1_stemmed_tfidf = joblib.load("./preprocess/enron1_stemmed_tfidf.pkl")
enron2_stemmed_tfidf = joblib.load("./preprocess/enron2_stemmed_tfidf.pkl")
enron3_stemmed_tfidf = joblib.load("./preprocess/enron3_stemmed_tfidf.pkl")
enron4_stemmed_tfidf = joblib.load("./preprocess/enron4_stemmed_tfidf.pkl")
enron5_stemmed_tfidf = joblib.load("./preprocess/enron5_stemmed_tfidf.pkl")
enron6_stemmed_tfidf = joblib.load("./preprocess/enron6_stemmed_tfidf.pkl")

In [None]:
# Pull the "features" and "labels" entries out of each loaded bundle.
# The enron*_y names are rebound here to this variant's labels.
enron1_stemmed_tfidf_X = enron1_stemmed_tfidf["features"]
enron1_y = enron1_stemmed_tfidf["labels"]

enron2_stemmed_tfidf_X = enron2_stemmed_tfidf["features"]
enron2_y = enron2_stemmed_tfidf["labels"]

enron3_stemmed_tfidf_X = enron3_stemmed_tfidf["features"]
enron3_y = enron3_stemmed_tfidf["labels"]

enron4_stemmed_tfidf_X = enron4_stemmed_tfidf["features"]
enron4_y = enron4_stemmed_tfidf["labels"]

enron5_stemmed_tfidf_X = enron5_stemmed_tfidf["features"]
enron5_y = enron5_stemmed_tfidf["labels"]

enron6_stemmed_tfidf_X = enron6_stemmed_tfidf["features"]
enron6_y = enron6_stemmed_tfidf["labels"]

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron1 stemmed_tfidf.
train_and_evaluate_model(enron1_stemmed_tfidf_X, enron1_y, "enron1_stemmed_tfidf")
# Clear the notebook-level names; the enron1_stemmed_tfidf dict still references the data.
enron1_stemmed_tfidf_X = None
enron1_y = None

enron1_stemmed_tfidf classification report
Executing
Executing
The best estimator for Multinomial Naive Bayes of dataset enron1_stemmed_tfidf is: 
MultinomialNB(alpha=0.5)
MultinomialNB classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'Multinomial Naive Bayes', 'type': 'grid_search', 'metrics': {'accuracy': 0.9729468599033816, 'weighted_precision': 0.9737648953301128, 'wighted_recall': 0.9729468599033816, 'weighted_f1': 0.9731424111285633, 'macro_precision': 0.9602380952380953, 'macro_recall': 0.9737435461734527, 'macro_f1': 0.9666752527529139, 'roc_auc': 0.9737435461734526}, 'confusion_matrix': array([[728,  21],
       [  7, 279]], dtype=int64), 'best_parameters': {'alpha': 0.5, 'fit_prior': True}, 'best_score': 0.9714767938403461}


Executing
The best estimator for Bernoulli Naive Bayes of dataset enron1_stemmed_tfidf is: 
BernoulliNB(alpha=0.1, fit_prior=False)
BernoulliNB classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'Bernoulli Naive Bayes', 

60 fits failed out of a total of 144.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

--------------------

The best estimator for Logistic Regression of dataset enron1_stemmed_tfidf is: 
LogisticRegression(C=0.1, penalty=None, solver='sag')
LogisticRegression classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'Logistic Regression', 'type': 'grid_search', 'metrics': {'accuracy': 0.9806763285024155, 'weighted_precision': 0.9811005775440675, 'wighted_recall': 0.9806763285024155, 'weighted_f1': 0.9807774077340742, 'macro_precision': 0.9712792122298212, 'macro_recall': 0.9812453901238948, 'macro_f1': 0.9760974393082806, 'roc_auc': 0.9812453901238948}, 'confusion_matrix': array([[734,  15],
       [  5, 281]], dtype=int64), 'best_parameters': {'C': 0.1, 'penalty': None, 'solver': 'sag'}, 'best_score': 0.9847706949233315}


Executing
The best estimator for AdaBoost of dataset enron1_stemmed_tfidf is: 
AdaBoostClassifier(learning_rate=0.1, n_estimators=200)
AdaBoostClassifier classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'AdaBoost', 'type': 'grid_search', 'metri

208 fits failed out of a total of 280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
208 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 892, in fit
    self._more_validate_params()
  File "e:\Python Tests\AI\.venv\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py", line 149, in _more_validate_params
    raise ValueError("eta0 must be > 0")
ValueError: eta0 must be > 0

 0.92288752 0.98428737        nan 0.79187621        nan 0.971

The best estimator for Stochastic Gradient Descent of dataset enron1_stemmed_tfidf is: 
SGDClassifier(alpha=0.001, loss='squared_hinge')
SGDClassifier classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'Stochastic Gradient Descent', 'type': 'grid_search', 'metrics': {'accuracy': 0.9797101449275363, 'weighted_precision': 0.9806952256166457, 'wighted_recall': 0.9797101449275363, 'weighted_f1': 0.9798865017932334, 'macro_precision': 0.9672807444678895, 'macro_recall': 0.9838199183993577, 'macro_f1': 0.9750833706863049, 'roc_auc': 0.9838199183993576}, 'confusion_matrix': array([[730,  19],
       [  2, 284]], dtype=int64), 'best_parameters': {'penalty': 'l2', 'loss': 'squared_hinge', 'learning_rate': 'optimal', 'alpha': 0.001}, 'best_score': 0.9842873695325128}


Executing
The best estimator for Perceptron of dataset enron1_stemmed_tfidf is: 
Perceptron(penalty='elasticnet', tol=1e-05)
Perceptron classification report
{'dataset': 'enron1_stemmed_tfidf', 'model': 'Perceptron

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron2 stemmed_tfidf.
train_and_evaluate_model(enron2_stemmed_tfidf_X, enron2_y, "enron2_stemmed_tfidf")
# Clear both notebook-level names, as the other release cells do (the label
# name was previously left bound here). enron2_y is rebound by the next
# unpacking cell before it is used again.
enron2_stemmed_tfidf_X = None
enron2_y = None

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron3 stemmed_tfidf.
train_and_evaluate_model(enron3_stemmed_tfidf_X, enron3_y, "enron3_stemmed_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron4 stemmed_tfidf.
train_and_evaluate_model(enron4_stemmed_tfidf_X, enron4_y, "enron4_stemmed_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron5 stemmed_tfidf.
train_and_evaluate_model(enron5_stemmed_tfidf_X, enron5_y, "enron5_stemmed_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron6 stemmed_tfidf.
train_and_evaluate_model(enron6_stemmed_tfidf_X, enron6_y, "enron6_stemmed_tfidf")

## Lemmatizing + CountVectorizer

In [None]:
# Load the six preprocessed bundles for the "lemmatized_countvec" variant, one per Enron subset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted preprocessing run.
enron1_lemmatized_countvec = joblib.load("./preprocess/enron1_lemmatized_countvec.pkl")
enron2_lemmatized_countvec = joblib.load("./preprocess/enron2_lemmatized_countvec.pkl")
enron3_lemmatized_countvec = joblib.load("./preprocess/enron3_lemmatized_countvec.pkl")
enron4_lemmatized_countvec = joblib.load("./preprocess/enron4_lemmatized_countvec.pkl")
enron5_lemmatized_countvec = joblib.load("./preprocess/enron5_lemmatized_countvec.pkl")
enron6_lemmatized_countvec = joblib.load("./preprocess/enron6_lemmatized_countvec.pkl")

In [None]:
# Pull the "features" and "labels" entries out of each loaded bundle.
# The enron*_y names are rebound here to this variant's labels.
enron1_lemmatized_countvec_X = enron1_lemmatized_countvec["features"]
enron1_y = enron1_lemmatized_countvec["labels"]

enron2_lemmatized_countvec_X = enron2_lemmatized_countvec["features"]
enron2_y = enron2_lemmatized_countvec["labels"]

enron3_lemmatized_countvec_X = enron3_lemmatized_countvec["features"]
enron3_y = enron3_lemmatized_countvec["labels"]

enron4_lemmatized_countvec_X = enron4_lemmatized_countvec["features"]
enron4_y = enron4_lemmatized_countvec["labels"]

enron5_lemmatized_countvec_X = enron5_lemmatized_countvec["features"]
enron5_y = enron5_lemmatized_countvec["labels"]

enron6_lemmatized_countvec_X = enron6_lemmatized_countvec["features"]
enron6_y = enron6_lemmatized_countvec["labels"]

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron1 lemmatized_countvec.
train_and_evaluate_model(enron1_lemmatized_countvec_X, enron1_y, "enron1_lemmatized_countvec")

enron1_lemmatized_countvec classification report


In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron2 lemmatized_countvec.
train_and_evaluate_model(enron2_lemmatized_countvec_X, enron2_y, "enron2_lemmatized_countvec")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron3 lemmatized_countvec.
train_and_evaluate_model(enron3_lemmatized_countvec_X, enron3_y, "enron3_lemmatized_countvec")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron4 lemmatized_countvec.
train_and_evaluate_model(enron4_lemmatized_countvec_X, enron4_y, "enron4_lemmatized_countvec")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron5 lemmatized_countvec.
train_and_evaluate_model(enron5_lemmatized_countvec_X, enron5_y, "enron5_lemmatized_countvec")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron6 lemmatized_countvec.
train_and_evaluate_model(enron6_lemmatized_countvec_X, enron6_y, "enron6_lemmatized_countvec")

## Lemmatizing + TF-IDF Vectorizer

In [None]:
# Load the six preprocessed bundles for the "lemmatized_tfidf" variant, one per Enron subset.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts produced by a trusted preprocessing run.
enron1_lemmatized_tfidf = joblib.load("./preprocess/enron1_lemmatized_tfidf.pkl")
enron2_lemmatized_tfidf = joblib.load("./preprocess/enron2_lemmatized_tfidf.pkl")
enron3_lemmatized_tfidf = joblib.load("./preprocess/enron3_lemmatized_tfidf.pkl")
enron4_lemmatized_tfidf = joblib.load("./preprocess/enron4_lemmatized_tfidf.pkl")
enron5_lemmatized_tfidf = joblib.load("./preprocess/enron5_lemmatized_tfidf.pkl")
enron6_lemmatized_tfidf = joblib.load("./preprocess/enron6_lemmatized_tfidf.pkl")

In [None]:
# Pull the "features" and "labels" entries out of each loaded bundle.
# The enron*_y names are rebound here to this variant's labels.
enron1_lemmatized_tfidf_X = enron1_lemmatized_tfidf["features"]
enron1_y = enron1_lemmatized_tfidf["labels"]

enron2_lemmatized_tfidf_X = enron2_lemmatized_tfidf["features"]
enron2_y = enron2_lemmatized_tfidf["labels"]

enron3_lemmatized_tfidf_X = enron3_lemmatized_tfidf["features"]
enron3_y = enron3_lemmatized_tfidf["labels"]

enron4_lemmatized_tfidf_X = enron4_lemmatized_tfidf["features"]
enron4_y = enron4_lemmatized_tfidf["labels"]

enron5_lemmatized_tfidf_X = enron5_lemmatized_tfidf["features"]
enron5_y = enron5_lemmatized_tfidf["labels"]

enron6_lemmatized_tfidf_X = enron6_lemmatized_tfidf["features"]
enron6_y = enron6_lemmatized_tfidf["labels"]

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron1 lemmatized_tfidf.
train_and_evaluate_model(enron1_lemmatized_tfidf_X, enron1_y, "enron1_lemmatized_tfidf")

enron1_lemmatized_tfidf classification report


In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron2 lemmatized_tfidf.
train_and_evaluate_model(enron2_lemmatized_tfidf_X, enron2_y, "enron2_lemmatized_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron3 lemmatized_tfidf.
train_and_evaluate_model(enron3_lemmatized_tfidf_X, enron3_y, "enron3_lemmatized_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron4 lemmatized_tfidf.
train_and_evaluate_model(enron4_lemmatized_tfidf_X, enron4_y, "enron4_lemmatized_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron5 lemmatized_tfidf.
train_and_evaluate_model(enron5_lemmatized_tfidf_X, enron5_y, "enron5_lemmatized_tfidf")

In [None]:
# Run every entry of `models` (m defaults to None, mode to "grid") on enron6 lemmatized_tfidf.
train_and_evaluate_model(enron6_lemmatized_tfidf_X, enron6_y, "enron6_lemmatized_tfidf")

In [None]:
# Show everything collected by t_and_e before it is converted and written out.
print(metric_results)

def convert(dic):
    """Replace the ``"confusion_matrix"`` entry of ``dic`` with a plain list.

    The entry is converted through its ``tolist()`` method when it has one.
    A value without that method (for example a list produced by an earlier
    call) is left as it is, so running the conversion twice is harmless.

    Args:
        dic: Mapping with a ``"confusion_matrix"`` key; it is updated in place.

    Returns:
        The same ``dic`` object.

    Raises:
        KeyError: If ``dic`` has no ``"confusion_matrix"`` key.
    """
    matrix = dic["confusion_matrix"]
    to_list = getattr(matrix, "tolist", None)
    # Only array-like values expose tolist(); an already-converted list must
    # not be converted again, otherwise re-running the cell would raise.
    if callable(to_list):
        dic["confusion_matrix"] = to_list()
    return dic

# Run convert over every collected entry so each confusion matrix becomes a
# plain list before the results are handed to the JSON writer.
metric_results = [convert(entry) for entry in metric_results]

# mode="overwrite" is passed through to add_to_json_array; presumably it replaces
# any existing report file — confirm against the helper.
add_to_json_array(
    "./enron_classification_grid_report.json",
    metric_results,
    mode="overwrite",
)