In [1]:
# load packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
# systematically compute word counts using CountVectorizer and them compute the Inverse Document Frequency (IDF) values and only then compute the Tf-idf scores.
from sklearn.feature_extraction.text import TfidfTransformer

# Bernoulli Naive Bayes (Similar as  MultinomialNB), this classifier is suitable for discrete data. The difference between MultinomialNB and BernoulliNB is that while  MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolen features, which means in the case of text classification, word occurrence vectores (rather than word count vectors) may be more suitable to be used to train and use this classifier.
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Evaluation metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib


In [2]:

# --------------------------      Dataset Import      --------------------------------------

presence = pd.read_csv('final_presence.csv')

print(presence['classification'].value_counts())

0    1603
1    1202
Name: classification, dtype: int64


In [3]:
# --------------------- Training Preparation ------------------

# split the dataset into train and test dataset as a ratio of 60%/40% (train/test).

string_train, string_test, dark_train, dark_test = train_test_split(
    presence['Pattern String'], presence["classification"], train_size = .6)

encoder = LabelEncoder()
encoder.fit(dark_train)
y_train = encoder.transform(dark_train)
y_test = encoder.transform(dark_test)

# check the mapping of encoding results (from 0 to 1 representing 'Dark', 'Not Dark')

integer_mapping = {label: encoding for encoding, label in enumerate(encoder.classes_)}
print(integer_mapping)

# Check the frequency distribution of the training pattern classification with pattern classification names.

(unique, counts) = np.unique(dark_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)


# Check the frequency distribution of the encoded training pattern classification with encoded integers.

(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)


# Check the frequency distribution of the encoded testing pattern classification with encoded integers.

(unique, counts) = np.unique(y_test, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)


{0: 0, 1: 1}
[[  0 949]
 [  1 734]]
[[  0 949]
 [  1 734]]
[[  0 654]
 [  1 468]]


In [4]:
 # First get the word count vector of the pattern string to encode the pattern string.

cv = CountVectorizer()
string_train_counts = cv.fit_transform(string_train)

# Then use the tf-idf score to transform the encoded word count pattern string vectors.

tfidf_tf = TfidfTransformer()
X_train = tfidf_tf.fit_transform(string_train_counts)

# save the CountVectorizer to disk

joblib.dump(cv, 'presence_CountVectorizer.joblib')


['presence_CountVectorizer.joblib']

In [5]:

classifiers = [LogisticRegression(),LinearSVC(), RandomForestClassifier(), MultinomialNB(), BernoulliNB()]


# Calculate the accuracies of different classifiers using default settings.

acc = []
cm = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(cv.transform(string_test))
    acc.append(metrics.accuracy_score(y_test, y_pred))
    cm.append(metrics.confusion_matrix(y_test, y_pred))

 # List the accuracies of different classifiers.

for i in range(len(classifiers)):
    print(f"{classifiers[i]} accuracy: {acc[i]}")
    # print(f"Confusion Matris: {cm[i]}")

LogisticRegression() accuracy: 0.8556149732620321
LinearSVC() accuracy: 0.857397504456328
RandomForestClassifier() accuracy: 0.8618538324420677
MultinomialNB() accuracy: 0.8342245989304813
BernoulliNB() accuracy: 0.8458110516934046


In [6]:
# ---------------- Bernoulli Naive Bayes Classifier ------------------

clf_bnb = BernoulliNB().fit(X_train, y_train)

y_pred = clf_bnb.predict(cv.transform(string_test))

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)


# Parameter tunning

param_grid = {'alpha': [0, 1],
              'fit_prior': [True, False]}

gs = GridSearchCV(clf_bnb,param_grid,cv=5,
                      verbose = 1, n_jobs = -1)

best_bnb = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_bnb.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
print(scores_df[['rank_test_score', 'mean_test_score', 'param_alpha', 'param_fit_prior']])

print(best_bnb.best_params_)

y_pred_best = best_bnb.predict(cv.transform(string_test))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

 # save the model to local disk

joblib.dump(best_bnb, 'bnb_presence_classifier.joblib')

Accuracy: 0.8458110516934046
Confusion Matrix:
 [[613  41]
 [132 336]]
[[  0 745]
 [  1 377]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


   rank_test_score  mean_test_score param_alpha param_fit_prior
0                1         0.858577           0           False
1                2         0.857385           0            True
2                3         0.850869           1           False
3                4         0.847289           1            True
{'alpha': 0, 'fit_prior': False}
[[  0 658]
 [  1 464]]


[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    1.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.8s finished


['bnb_presence_classifier.joblib']

In [7]:
# ------------------- Random Forest Classifier -------------
clf_rf = RandomForestClassifier().fit(X_train, y_train)

y_pred = clf_rf.predict(cv.transform(string_test))

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# Parameter tunning
param_grid = {'bootstrap':[True,False],
              'criterion':['gini','entropy'],
              'max_depth':[10,20,30,40,None],
              'min_samples_leaf':[1,2,4],
              'min_samples_split':[2,5,10],
              'n_estimators':[100,200,300]}

gs = GridSearchCV(clf_rf, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_rf = gs.fit(X_train,y_train)

scores_df = pd.DataFrame(best_rf.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
print(scores_df [['rank_test_score', 'mean_test_score', 'param_bootstrap', 'param_criterion','param_max_depth','param_min_samples_leaf','param_min_samples_split','param_n_estimators']])

print(best_rf.best_params_)

y_pred_best = best_rf.predict(cv.transform(string_test))

print("Accuracy:", metrics.accuracy_score(y_test, y_pred_best))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred_best))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

# save the model to local disk

joblib.dump(best_rf, 'rf_presence_classifier.joblib')

Accuracy: 0.8672014260249554
Confusion Matrix:
 [[586  68]
 [ 81 387]]
[[  0 667]
 [  1 455]]
Fitting 5 folds for each of 540 candidates, totalling 2700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 2700 out of 2700 | elapsed:  6.8min finished


     rank_test_score  mean_test_score param_bootstrap param_criterion  \
0                  1         0.879972           False            gini   
1                  2         0.879379           False            gini   
2                  3         0.876999            True         entropy   
3                  4         0.875814            True         entropy   
4                  5         0.875217           False         entropy   
..               ...              ...             ...             ...   
535              536         0.835407            True         entropy   
536              537         0.834225            True            gini   
537              538         0.833030           False            gini   
538              539         0.831850            True            gini   
539              540         0.830645            True         entropy   

    param_max_depth param_min_samples_leaf param_min_samples_split  \
0              None                      1           

['rf_presence_classifier.joblib']