In [1]:
import sys

# Put the sibling "Handlers" directory on the import path (presumably where
# the local `traintest` module lives -- confirm). The membership check keeps
# the cell idempotent: re-running it no longer stacks duplicate entries.
if "../Handlers" not in sys.path:
    sys.path.append("../Handlers")

# Evaluate different models

In [2]:
from traintest import ClassificationModel, models

def train_and_evaluate_model(X, y, dataset_name, estimators=None):
    """Train and report every candidate classifier on one dataset.

    Each estimator is wrapped in a ``ClassificationModel``, trained on
    ``(X, y)``, and then reported: the value returned by ``evaluate()`` is
    printed, followed by whatever ``print_simple_confusion_matrix()`` emits.

    Args:
        X: Features, passed unchanged to ``ClassificationModel.train``.
        y: Labels, passed unchanged to ``ClassificationModel.train``.
        dataset_name: Label used in the printed header and handed to
            ``ClassificationModel``.
        estimators: Optional iterable of estimators to evaluate. When omitted
            (or ``None``), the ``models`` collection imported from
            ``traintest`` is used, which is the original behaviour.

    Returns:
        None. All results are written to stdout.
    """
    # Fall back to the shared module-level list so existing calls are unchanged.
    if estimators is None:
        estimators = models

    print(f"{dataset_name} classification report")
    print("=========================================")
    for model in estimators:
        classification_model = ClassificationModel(model, dataset_name)
        classification_model.train(X, y)
        print(f"{model.__class__.__name__} classification report")
        print(classification_model.evaluate())
        classification_model.print_simple_confusion_matrix()
        # print("\n") emits two line breaks, separating consecutive reports.
        print("\n")

## Stemming + CountVectorizer

In [3]:
import joblib

# Load the preprocessed "stemming + CountVectorizer" bundles, in the order
# listed: one file per Enron subset (enron1..enron6) and one for the combined
# set. NOTE: joblib.load unpickles its input, so it must only be pointed at
# trusted, locally generated files.
(
    enron1_stemmed_countvec,
    enron2_stemmed_countvec,
    enron3_stemmed_countvec,
    enron4_stemmed_countvec,
    enron5_stemmed_countvec,
    enron6_stemmed_countvec,
    enron_stemmed_countvec,
) = map(
    joblib.load,
    (
        "./preprocess/enron1_stemmed_countvec.pkl",
        "./preprocess/enron2_stemmed_countvec.pkl",
        "./preprocess/enron3_stemmed_countvec.pkl",
        "./preprocess/enron4_stemmed_countvec.pkl",
        "./preprocess/enron5_stemmed_countvec.pkl",
        "./preprocess/enron6_stemmed_countvec.pkl",
        "./preprocess/enron_stemmed_countvec.pkl",
    ),
)

In [4]:
# Pull the "features" and "labels" entries out of every CountVectorizer
# bundle, pairing each feature set with the label set from the same bundle.
(
    (enron1_stemmed_countvec_X, enron1_y),
    (enron2_stemmed_countvec_X, enron2_y),
    (enron3_stemmed_countvec_X, enron3_y),
    (enron4_stemmed_countvec_X, enron4_y),
    (enron5_stemmed_countvec_X, enron5_y),
    (enron6_stemmed_countvec_X, enron6_y),
    (enron_stemmed_countvec_X, enron_y),
) = (
    (bundle["features"], bundle["labels"])
    for bundle in (
        enron1_stemmed_countvec,
        enron2_stemmed_countvec,
        enron3_stemmed_countvec,
        enron4_stemmed_countvec,
        enron5_stemmed_countvec,
        enron6_stemmed_countvec,
        enron_stemmed_countvec,
    )
)

In [5]:
# Benchmark all classifiers on Enron1 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron1_stemmed_countvec_X,
    y=enron1_y,
    dataset_name="Enron1",
)

Enron1 classification report
SVC classification report
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       749
           1       0.86      0.97      0.91       286

    accuracy                           0.95      1035
   macro avg       0.92      0.95      0.94      1035
weighted avg       0.95      0.95      0.95      1035

Confusion Matrix for SVM on dataset Enron1:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[703  46]
 [  9 277]]




MultinomialNB classification report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98       749
           1       0.90      0.99      0.94       286

    accuracy                           0.97      1035
   macro avg       0.95      0.97      0.96      1035
weighted avg       0.97      0.97      0.97      1035

Confusion Matrix for Multinomial Naive Bayes on dataset Enron1:
Pattern:
True Negative (TN) |

In [6]:
# Benchmark all classifiers on Enron2 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron2_stemmed_countvec_X,
    y=enron2_y,
    dataset_name="Enron2",
)

Enron2 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       861
           1       0.97      0.94      0.95       311

    accuracy                           0.98      1172
   macro avg       0.97      0.97      0.97      1172
weighted avg       0.98      0.98      0.98      1172

Confusion Matrix for SVM on dataset Enron2:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[851  10]
 [ 18 293]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       861
           1       0.97      0.99      0.98       311

    accuracy                           0.99      1172
   macro avg       0.98     

In [7]:
# Benchmark all classifiers on Enron3 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron3_stemmed_countvec_X,
    y=enron3_y,
    dataset_name="Enron3",
)

Enron3 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.80      1.00      0.89       796
           1       0.99      0.37      0.54       307

    accuracy                           0.82      1103
   macro avg       0.90      0.68      0.71      1103
weighted avg       0.86      0.82      0.79      1103

Confusion Matrix for SVM on dataset Enron3:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[795   1]
 [194 113]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       796
           1       0.99      0.97      0.98       307

    accuracy                           0.99      1103
   macro avg       0.99     

In [8]:
# Benchmark all classifiers on Enron4 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron4_stemmed_countvec_X,
    y=enron4_y,
    dataset_name="Enron4",
)

Enron4 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.87      0.93       322
           1       0.96      1.00      0.98       878

    accuracy                           0.97      1200
   macro avg       0.98      0.94      0.95      1200
weighted avg       0.97      0.97      0.97      1200

Confusion Matrix for SVM on dataset Enron4:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[281  41]
 [  0 878]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       322
           1       0.99      0.99      0.99       878

    accuracy                           0.98      1200
   macro avg       0.98     

In [9]:
# Benchmark all classifiers on Enron5 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron5_stemmed_countvec_X,
    y=enron5_y,
    dataset_name="Enron5",
)

Enron5 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.87      0.93       313
           1       0.95      1.00      0.97       722

    accuracy                           0.96      1035
   macro avg       0.97      0.93      0.95      1035
weighted avg       0.96      0.96      0.96      1035

Confusion Matrix for SVM on dataset Enron5:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[272  41]
 [  1 721]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       313
           1       1.00      1.00      1.00       722

    accuracy                           1.00      1035
   macro avg       1.00     

In [10]:
# Benchmark all classifiers on Enron6 using the stemmed CountVectorizer features.
train_and_evaluate_model(
    X=enron6_stemmed_countvec_X,
    y=enron6_y,
    dataset_name="Enron6",
)

Enron6 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.82      0.90       330
           1       0.94      1.00      0.97       870

    accuracy                           0.95      1200
   macro avg       0.97      0.91      0.93      1200
weighted avg       0.95      0.95      0.95      1200

Confusion Matrix for SVM on dataset Enron6:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[271  59]
 [  1 869]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       330
           1       0.99      0.98      0.99       870

    accuracy                           0.98      1200
   macro avg       0.98     

In [11]:
# Benchmark all classifiers on the merged Enron set using the stemmed
# CountVectorizer features.
train_and_evaluate_model(
    X=enron_stemmed_countvec_X,
    y=enron_y,
    dataset_name="Merged Enron",
)

Merged Enron classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.99      0.93      0.96      3282
           1       0.94      0.99      0.96      3461

    accuracy                           0.96      6743
   macro avg       0.96      0.96      0.96      6743
weighted avg       0.96      0.96      0.96      6743

Confusion Matrix for SVM on dataset Merged Enron:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[3061  221]
 [  45 3416]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3282
           1       0.98      0.99      0.98      3461

    accuracy                           0.98      6743
   macro avg

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression classification report
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3282
           1       0.98      0.99      0.99      3461

    accuracy                           0.99      6743
   macro avg       0.99      0.99      0.99      6743
weighted avg       0.99      0.99      0.99      6743

Confusion Matrix for Logistic Regression on dataset Merged Enron:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[3215   67]
 [  23 3438]]




The model has already been trained. This process will overwrite the previous training.
KNeighborsClassifier classification report
              precision    recall  f1-score   support

           0       0.94      0.82      0.88      3282
           1       0.85      0.95      0.90      3461

    accuracy                           0.89      6743
   macro avg       0.90      0.89      0.89      6743
weighted avg       0.89      0.89      0.8

## Stemming + TF-IDF Vectorizer

In [12]:
# Load the preprocessed "stemming + TF-IDF" bundles, in the order listed: one
# file per Enron subset (enron1..enron6) and one for the combined set.
# NOTE: joblib.load unpickles its input, so it must only be pointed at
# trusted, locally generated files.
(
    enron1_stemmed_tfidf,
    enron2_stemmed_tfidf,
    enron3_stemmed_tfidf,
    enron4_stemmed_tfidf,
    enron5_stemmed_tfidf,
    enron6_stemmed_tfidf,
    enron_stemmed_tfidf,
) = map(
    joblib.load,
    (
        "./preprocess/enron1_stemmed_tfidf.pkl",
        "./preprocess/enron2_stemmed_tfidf.pkl",
        "./preprocess/enron3_stemmed_tfidf.pkl",
        "./preprocess/enron4_stemmed_tfidf.pkl",
        "./preprocess/enron5_stemmed_tfidf.pkl",
        "./preprocess/enron6_stemmed_tfidf.pkl",
        "./preprocess/enron_stemmed_tfidf.pkl",
    ),
)

In [13]:
# Pull the "features" and "labels" entries out of every TF-IDF bundle.
# Note: this rebinds enron1_y .. enron6_y and enron_y, which the
# CountVectorizer section assigned earlier; from here on they hold the labels
# stored in the TF-IDF bundles.
(
    (enron1_stemmed_tfidf_X, enron1_y),
    (enron2_stemmed_tfidf_X, enron2_y),
    (enron3_stemmed_tfidf_X, enron3_y),
    (enron4_stemmed_tfidf_X, enron4_y),
    (enron5_stemmed_tfidf_X, enron5_y),
    (enron6_stemmed_tfidf_X, enron6_y),
    (enron_stemmed_tfidf_X, enron_y),
) = (
    (bundle["features"], bundle["labels"])
    for bundle in (
        enron1_stemmed_tfidf,
        enron2_stemmed_tfidf,
        enron3_stemmed_tfidf,
        enron4_stemmed_tfidf,
        enron5_stemmed_tfidf,
        enron6_stemmed_tfidf,
        enron_stemmed_tfidf,
    )
)

In [14]:
# Benchmark all classifiers on Enron1 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron1_stemmed_tfidf_X,
    y=enron1_y,
    dataset_name="Enron1",
)

Enron1 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       749
           1       0.93      0.99      0.96       286

    accuracy                           0.98      1035
   macro avg       0.96      0.98      0.97      1035
weighted avg       0.98      0.98      0.98      1035

Confusion Matrix for SVM on dataset Enron1:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[729  20]
 [  3 283]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       749
           1       0.93      0.94      0.93       286

    accuracy                           0.96      1035
   macro avg       0.95     

In [15]:
# Benchmark all classifiers on Enron2 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron2_stemmed_tfidf_X,
    y=enron2_y,
    dataset_name="Enron2",
)

Enron2 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       861
           1       0.99      0.98      0.99       311

    accuracy                           0.99      1172
   macro avg       0.99      0.99      0.99      1172
weighted avg       0.99      0.99      0.99      1172

Confusion Matrix for SVM on dataset Enron2:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[859   2]
 [  6 305]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       861
           1       1.00      0.93      0.96       311

    accuracy                           0.98      1172
   macro avg       0.99     

In [16]:
# Benchmark all classifiers on Enron3 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron3_stemmed_tfidf_X,
    y=enron3_y,
    dataset_name="Enron3",
)

Enron3 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       796
           1       1.00      0.97      0.99       307

    accuracy                           0.99      1103
   macro avg       0.99      0.99      0.99      1103
weighted avg       0.99      0.99      0.99      1103

Confusion Matrix for SVM on dataset Enron3:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[795   1]
 [  8 299]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       796
           1       1.00      0.86      0.92       307

    accuracy                           0.96      1103
   macro avg       0.97     

In [17]:
# Benchmark all classifiers on Enron4 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron4_stemmed_tfidf_X,
    y=enron4_y,
    dataset_name="Enron4",
)

Enron4 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       322
           1       0.98      1.00      0.99       878

    accuracy                           0.98      1200
   macro avg       0.99      0.97      0.98      1200
weighted avg       0.98      0.98      0.98      1200

Confusion Matrix for SVM on dataset Enron4:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[302  20]
 [  0 878]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       322
           1       0.94      0.96      0.95       878

    accuracy                           0.92      1200
   macro avg       0.91     

In [18]:
# Benchmark all classifiers on Enron5 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron5_stemmed_tfidf_X,
    y=enron5_y,
    dataset_name="Enron5",
)

Enron5 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       0.99      0.96      0.98       313
           1       0.98      1.00      0.99       722

    accuracy                           0.99      1035
   macro avg       0.99      0.98      0.98      1035
weighted avg       0.99      0.99      0.99      1035

Confusion Matrix for SVM on dataset Enron5:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[301  12]
 [  2 720]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       313
           1       0.99      1.00      0.99       722

    accuracy                           0.99      1035
   macro avg       0.99     

In [19]:
# Benchmark all classifiers on Enron6 using the stemmed TF-IDF features.
train_and_evaluate_model(
    X=enron6_stemmed_tfidf_X,
    y=enron6_y,
    dataset_name="Enron6",
)

Enron6 classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.93      0.96       330
           1       0.97      1.00      0.99       870

    accuracy                           0.98      1200
   macro avg       0.98      0.96      0.97      1200
weighted avg       0.98      0.98      0.98      1200

Confusion Matrix for SVM on dataset Enron6:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[306  24]
 [  1 869]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.96      0.88      0.92       330
           1       0.96      0.99      0.97       870

    accuracy                           0.96      1200
   macro avg       0.96     

In [20]:
# Benchmark all classifiers on the merged Enron set using the stemmed TF-IDF
# features.
# NOTE(review): the label here is "Merged_enron", while the CountVectorizer
# run labels the same dataset "Merged Enron" -- confirm whether the two should
# match before changing it, since the name is also passed to ClassificationModel.
train_and_evaluate_model(
    X=enron_stemmed_tfidf_X,
    y=enron_y,
    dataset_name="Merged_enron",
)

Merged_enron classification report
The model has already been trained. This process will overwrite the previous training.
SVC classification report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3282
           1       0.98      1.00      0.99      3461

    accuracy                           0.99      6743
   macro avg       0.99      0.99      0.99      6743
weighted avg       0.99      0.99      0.99      6743

Confusion Matrix for SVM on dataset Merged_enron:
Pattern:
True Negative (TN) | False Positive (FP)
False Negative (FN) | True Positive (TP)
[[3218   64]
 [  15 3446]]




The model has already been trained. This process will overwrite the previous training.
MultinomialNB classification report
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3282
           1       0.98      0.99      0.98      3461

    accuracy                           0.98      6743
   macro avg