In [1]:
import pandas as pd
import numpy as np

# to encode text, aka tokenize documents, to learn the vocabulary and inverse document frequency weightings.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

# systematically compute word counts using CountVectorizer and them compute the Inverse Document Frequency (IDF) values and only then compute the Tf-idf scores.
from sklearn.feature_extraction.text import TfidfTransformer

# MultinomialNB (multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts, however, in practice, fractional counts such as tf-idf may also work.
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

# Evaluation metrics
from sklearn import metrics

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

# Data Preparation
----------

In [2]:
# ---- import dataset of the enriched pattern type dataset
df = pd.read_csv('enriched_type_df.csv')
df.head(5)

Unnamed: 0,Pattern String,Pattern Type
0,Only 2 left,Low-stock Message
1,Only 3 left,Low-stock Message
2,9 people are viewing this.,Activity Notification
3,5338 people viewed this in the last hour,Activity Notification
4,"Crystal Li in Flushing, United States purchased a",Activity Notification


In [3]:
# ---- information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057 entries, 0 to 1056
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Pattern String  1057 non-null   object
 1   Pattern Type    1057 non-null   object
dtypes: object(2)
memory usage: 16.6+ KB


In [4]:
# ---- select from the dataset when 'Pattern String' is not NaN values.
df = df[pd.notnull(df["Pattern String"])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1057 entries, 0 to 1056
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Pattern String  1057 non-null   object
 1   Pattern Type    1057 non-null   object
dtypes: object(2)
memory usage: 24.8+ KB


In [5]:
# Check the distribution of Pattern Type

print(df['Pattern Type'].value_counts())

Activity Notification    412
Low-stock Message        398
Countdown Timer          140
Limited-time Message      83
High-demand Message       24
Name: Pattern Type, dtype: int64


In [6]:
# ---- encode the pattern category type into integers (7 types in total, encoded into integers from 0-6).

df["type_id"] = df['Pattern Type'].factorize()[0]

# ---- Get the mapping of the encoding integers and the pattern categories.
# ---- {'Social Proof': 0, 'Misdirection': 1, 'Urgency': 2, 'Forced Action': 3, 'Obstruction': 4, 'Sneaking': 5, 'Scarcity': 6}

type_id_df = df[['Pattern Type', 'type_id']
                    ].drop_duplicates().sort_values('type_id')
type_to_id = dict(type_id_df.values)
id_to_type = dict(
    type_id_df[['type_id', 'Pattern Type']].values)


# ---- result of the mapping

print(type_to_id)

{'Low-stock Message': 0, 'Activity Notification': 1, 'Countdown Timer': 2, 'High-demand Message': 3, 'Limited-time Message': 4}


In [7]:
# ---- convert a collection of raw documents to a matrix of TF-IDF features; Equivalent to CountVectorizer followed by TfidfTransformer.
# 'sublinear_tf=True' is used to normalise bias of term frequency ("where a term that is more frequent shouldn't be X times as important"). It is set to True to use a logarithmic form for frequency.
# 'norm='l2'' is the default setting of 'norm', used to reduce document length bias, to ensure all our feature vectors have a enclidian norm of 1.
# 'min_df=5', means when building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold (which is 5 here), which is the minimum numbers of documents a word must be present in to be kept.
# 'ngram_range=(1,2)' means unigrams and bigrams will be extracted, means we want to consider both unigrams and bigrams.
# 'stop_words='english'', if a string, it is passed to _check_stop_list and the appropriate stop list is returned. To remove all common pronouns ("a", "the" ...), reducing the number of noisy features.

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))

features = tfidf.fit_transform(df['Pattern String']).toarray()
labels = df.type_id

print(features.shape)

(1057, 297)


In [8]:
# The result means each of the 1512 pattern strings is represented by 303 features, representing the tf-idf score for different unigrams and bigrams.

N = 3   # every n-gram will give 3 examples

for Type, type_id in sorted(type_to_id.items()):
  features_chi2 = chi2(features, labels == type_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("Pattern Type: '{}':".format(Type))
  print("  . Most correlated unigrams:\n     . {}".format('\n     . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n     . {}".format('\n     . '.join(bigrams[-N:])))

Pattern Type: 'Activity Notification':
  . Most correlated unigrams:
     . only
     . left
     . sold
  . Most correlated bigrams:
     . left in
     . in stock
     . only left
Pattern Type: 'Countdown Timer':
  . Most correlated unigrams:
     . 09
     . 00
     . ends
  . Most correlated bigrams:
     . offer ends
     . reserved for
     . ends in
Pattern Type: 'High-demand Message':
  . Most correlated unigrams:
     . fast
     . high
     . demand
  . Most correlated bigrams:
     . selling fast
     . in high
     . high demand
Pattern Type: 'Limited-time Message':
  . Most correlated unigrams:
     . free
     . limited
     . time
  . Most correlated bigrams:
     . time offer
     . time only
     . limited time
Pattern Type: 'Low-stock Message':
  . Most correlated unigrams:
     . stock
     . only
     . left
  . Most correlated bigrams:
     . left in
     . in stock
     . only left


# Training Preparation
----------

In [9]:
# ----------- Split the dataset into training and testing ------
String_train, String_test, Type_train, Type_test = train_test_split(
    df['Pattern String'], df['Pattern Type'], train_size=.6)

In [10]:
# Distribution of training data

print(Type_train.value_counts())

# Distribution of testing data

print(Type_test.value_counts())

Low-stock Message        242
Activity Notification    237
Countdown Timer           86
Limited-time Message      51
High-demand Message       18
Name: Pattern Type, dtype: int64
Activity Notification    175
Low-stock Message        156
Countdown Timer           54
Limited-time Message      32
High-demand Message        6
Name: Pattern Type, dtype: int64


In [11]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(Type_train)
y_train = encoder.transform(Type_train)
y_test = encoder.transform(Type_test)

# check the mapping of encoding results (from 0 to 6 representing 'Forced Action', 'Misdirection'......)

print(list(encoder.classes_))

['Activity Notification', 'Countdown Timer', 'High-demand Message', 'Limited-time Message', 'Low-stock Message']


In [12]:
# Check the frequency distribution of the training pattern category with pattern category names.

(unique, counts) = np.unique(Type_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[['Activity Notification' 237]
 ['Countdown Timer' 86]
 ['High-demand Message' 18]
 ['Limited-time Message' 51]
 ['Low-stock Message' 242]]


In [13]:
# Check the frequency distribution of the encoded training pattern category with encoded integers.

(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[[  0 237]
 [  1  86]
 [  2  18]
 [  3  51]
 [  4 242]]


In [14]:
# Check the frequency distribution of the encoded testing pattern category with encoded integers.

(unique, counts) = np.unique(y_test, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[[  0 175]
 [  1  54]
 [  2   6]
 [  3  32]
 [  4 156]]


### Text Encoding
--------

In [15]:
# get the word count vector of the pattern string to encode the pattern string.

cv = TfidfVectorizer()
X_train = cv.fit_transform(String_train)

In [16]:

# save the CountVectorizer to disk
joblib.dump(cv, 'type_TfidfVectorizer.joblib')

['type_TfidfVectorizer.joblib']

# Model Training
------

In [17]:
# Five models are tested:
# -- Logistic Regression
# -- Linear Support Vector Machine
# -- Random Forest
# -- Decision Tree
# -- Multinomial Naive Bayes

classifiers = [LogisticRegression(),LinearSVC(), RandomForestClassifier(), DecisionTreeClassifier(), MultinomialNB()]

# Calculate the accuracies of different classifiers using default settings.

acc = []
cm = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(cv.transform(String_test))
    acc.append(metrics.accuracy_score(y_test, y_pred))
    cm.append(metrics.confusion_matrix(y_test, y_pred))

# List the accuracies of different classifiers.

for i in range(len(classifiers)):
    print(f"{classifiers[i]} accuracy: {acc[i]}")
    print(f"Confusion Matris: {cm[i]}")

LogisticRegression() accuracy: 0.9598108747044918
Confusion Matris: [[175   0   0   0   0]
 [  6  44   0   2   2]
 [  3   0   3   0   0]
 [  0   3   0  29   0]
 [  0   0   0   1 155]]
LinearSVC() accuracy: 0.966903073286052
Confusion Matris: [[175   0   0   0   0]
 [  4  45   0   3   2]
 [  0   0   6   0   0]
 [  0   3   0  29   0]
 [  0   0   1   1 154]]
RandomForestClassifier() accuracy: 0.966903073286052
Confusion Matris: [[174   0   1   0   0]
 [  3  48   0   1   2]
 [  2   0   4   0   0]
 [  0   3   0  29   0]
 [  1   0   0   1 154]]
DecisionTreeClassifier() accuracy: 0.91725768321513
Confusion Matris: [[160  12   2   0   1]
 [  7  43   0   0   4]
 [  0   3   3   0   0]
 [  0   4   0  28   0]
 [  1   0   0   1 154]]
MultinomialNB() accuracy: 0.9290780141843972
Confusion Matris: [[174   0   0   0   1]
 [  3  43   0   1   7]
 [  1   0   0   0   5]
 [  0   5   0  21   6]
 [  0   0   0   1 155]]


# Logistic Regression
------

In [18]:
clf_lr = LogisticRegression().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_lr.get_params()))

y_pred = clf_lr.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'penalty':['l1','l2'], 
              'solver':['lbfgs','newton-cg','sag']}

gs = GridSearchCV(clf_lr, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_lr = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_lr.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_penalty', 'param_solver']]

Parameters of the classifier:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Accuracy:0.9598108747044918

The distribution of predicted result of default model:[[  0 184]
 [  1  47]
 [  2   3]
 [  3  32]
 [  4 157]]

Confusion Matrix of the result:[[175   0   0   0   0]
 [  6  44   0   2   2]
 [  3   0   3   0   0]
 [  0   3   0  29   0]
 [  0   0   0   1 155]]

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.7s finished


Unnamed: 0,rank_test_score,mean_test_score,param_penalty,param_solver
0,1,0.922685,l2,lbfgs
1,1,0.922685,l2,newton-cg
2,1,0.922685,l2,sag
3,4,,l1,lbfgs
4,5,,l1,newton-cg
5,6,,l1,sag


In [19]:
# Parameters of the best model

print('Parameters of the classifier:\n{}\n'.format(best_lr.best_params_))

y_pred_best = best_lr.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))

# save the model to local disk

joblib.dump(best_lr, 'lr_type_classifier.joblib')

Parameters of the classifier:
{'penalty': 'l2', 'solver': 'lbfgs'}

Accuracy:0.9598108747044918

The distribution of predicted result of best model:[[  0 184]
 [  1  47]
 [  2   3]
 [  3  32]
 [  4 157]]


['lr_type_classifier.joblib']

# Random Forest
------

In [20]:
clf_rf = RandomForestClassifier().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_rf.get_params()))

y_pred = clf_rf.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'bootstrap':[True,False], 
              'criterion':['gini','entropy'],
              'max_depth':[10,20,30,40,50, None],
              'min_samples_leaf':[1,2,4],
              'min_samples_split':[2,5,10],
              'n_estimators':[100,200,300]}

gs = GridSearchCV(clf_rf, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_rf = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_rf.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_bootstrap', 'param_criterion',
            'param_max_depth','param_min_samples_leaf','param_min_samples_split','param_n_estimators']]

Parameters of the classifier:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Accuracy:0.966903073286052

The distribution of predicted result of default model:[[  0 179]
 [  1  52]
 [  2   4]
 [  3  31]
 [  4 157]]

Confusion Matrix of the result:[[174   1   0   0   0]
 [  3  48   0   1   2]
 [  1   0   4   0   1]
 [  0   3   0  29   0]
 [  1   0   0   1 154]]

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed:  3.6min finished


Unnamed: 0,rank_test_score,mean_test_score,param_bootstrap,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators
0,1,0.963692,False,gini,,1,5,200
1,1,0.963692,False,gini,50,1,5,200
2,3,0.963667,False,gini,50,1,5,100
3,4,0.962117,False,gini,,1,2,200
4,4,0.962117,False,gini,50,1,2,200
...,...,...,...,...,...,...,...,...
643,644,0.858005,True,gini,10,4,2,200
644,645,0.856443,True,entropy,10,4,2,200
645,646,0.854868,True,gini,10,4,10,300
646,647,0.850156,True,gini,10,4,2,100


In [21]:
# Parameters of the best model

print('Parameters of the classifier:\n{}\n'.format(best_rf.best_params_))

y_pred_best = best_rf.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))

# save the model to local disk

joblib.dump(best_rf, 'rf_type_classifier.joblib')

Parameters of the classifier:
{'bootstrap': False, 'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

Accuracy:0.9692671394799054

The distribution of predicted result of best model:[[  0 180]
 [  1  51]
 [  2   4]
 [  3  30]
 [  4 158]]


['rf_type_classifier.joblib']

# SVM
----

In [22]:
clf_svm = LinearSVC().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_svm.get_params()))

y_pred = clf_svm.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 5, 10]}

gs = GridSearchCV(clf_svm, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_svm = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_svm.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
print(scores_df [['rank_test_score', 'mean_test_score', 'param_penalty', 'param_C']])

Parameters of the classifier:
{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}

Accuracy:0.966903073286052

The distribution of predicted result of default model:[[  0 179]
 [  1  48]
 [  2   7]
 [  3  33]
 [  4 156]]

Confusion Matrix of the result:[[175   0   0   0   0]
 [  4  45   0   3   2]
 [  0   0   6   0   0]
 [  0   3   0  29   0]
 [  0   0   1   1 154]]

Fitting 5 folds for each of 8 candidates, totalling 40 fits
   rank_test_score  mean_test_score param_penalty param_C
0                1         0.965267            l2       1
1                2         0.957380            l2       5
2                3         0.955806            l2      10
3                4         0.930571            l2     0.1
4                5              NaN            l1     0.1
5                6              NaN            

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.1s finished


In [23]:
print('Parameters of the classifier:\n{}\n'.format(best_svm.best_params_))

y_pred_best = best_svm.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))


Parameters of the classifier:
{'C': 1, 'penalty': 'l2'}

Accuracy:0.966903073286052

The distribution of predicted result of best model:[[  0 179]
 [  1  48]
 [  2   7]
 [  3  33]
 [  4 156]]


In [24]:
# save the model to local disk

joblib.dump(best_svm, 'svm_type_classifier.joblib')

['svm_type_classifier.joblib']

# Multinomial Naive Bayes
-----

In [25]:
clf_mnb = MultinomialNB().fit(X_train, y_train)
print('Parameters of the classifier:\n{}\n'.format(clf_mnb.get_params()))

y_pred = clf_mnb.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}'.format(frequencies))

# Parameter tunning
param_grid = {'alpha':[0,1],
              'fit_prior':[True, False]}

from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(clf_mnb,param_grid,cv=5,
                      verbose = 1, n_jobs = -1)

best_mnb = gs.fit(X_train,y_train)

scores_df = pd.DataFrame(best_mnb.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_alpha', 'param_fit_prior']]

Parameters of the classifier:
{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

Accuracy:0.9290780141843972

The distribution of predicted result of default model:[[  0 178]
 [  1  48]
 [  3  23]
 [  4 174]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


Unnamed: 0,rank_test_score,mean_test_score,param_alpha,param_fit_prior
0,1,0.927447,1,False
1,2,0.914836,0,True
2,3,0.90065,0,False
3,4,0.900637,1,True


In [26]:
y_pred_best = best_mnb.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of the best model:{}'.format(frequencies))

 # save the model to local disk

joblib.dump(best_mnb, 'mnb_type_classifier.joblib')

Accuracy:0.9574468085106383

The distribution of predicted result of the best model:[[  0 176]
 [  1  49]
 [  2   5]
 [  3  37]
 [  4 156]]


['mnb_type_classifier.joblib']