In [1]:
import pandas as pd
import numpy as np

# to encode text, aka tokenize documents, to learn the vocabulary and inverse document frequency weightings.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split

# provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

# systematically compute word counts using CountVectorizer and them compute the Inverse Document Frequency (IDF) values and only then compute the Tf-idf scores.
from sklearn.feature_extraction.text import TfidfTransformer

# MultinomialNB (multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts, however, in practice, fractional counts such as tf-idf may also work.
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn import metrics

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

# Data Preparation
----------

In [6]:
# ---- import dataset from the Princeton Article
df = pd.read_csv('dark_patterns.csv')
df.head(5)

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website?,Deceptive?,Website Page
0,Collin P. from Grandview Missouri just bought ...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://alaindupetit.com/collections/all-suits...
1,"Faith in Glendale, United States purchased a C...",Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bonescoffee.com/products/strawberry-ch...
2,Sharmeen Atif From Karachi just bought Stylish...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://brandsego.com/collections/under-rs-99/...
3,9 people are viewing this.,Product detail,Social Proof,Activity Notification,Product Page,No,https://brightechshop.com/products/ambience-so...
4,5338 people viewed this in the last hour,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bumpboxes.com/


In [7]:
# ---- information of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1818 entries, 0 to 1817
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Pattern String     1512 non-null   object
 1   Comment            1798 non-null   object
 2   Pattern Category   1818 non-null   object
 3   Pattern Type       1818 non-null   object
 4   Where in website?  1818 non-null   object
 5   Deceptive?         1818 non-null   object
 6   Website Page       1818 non-null   object
dtypes: object(7)
memory usage: 99.5+ KB


In [8]:
# ---- select from the dataset when 'Pattern String' is not NaN values.
df = df[pd.notnull(df["Pattern String"])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 0 to 1817
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Pattern String     1512 non-null   object
 1   Comment            1494 non-null   object
 2   Pattern Category   1512 non-null   object
 3   Pattern Type       1512 non-null   object
 4   Where in website?  1512 non-null   object
 5   Deceptive?         1512 non-null   object
 6   Website Page       1512 non-null   object
dtypes: object(7)
memory usage: 94.5+ KB


In [9]:
# ---- select only "Pattern String" and "Pattern Category" 2 columns to be the sub-dataset.
col = ["Pattern String", "Pattern Type"]
df = df[col]
df.head(5)

Unnamed: 0,Pattern String,Pattern Type
0,Collin P. from Grandview Missouri just bought ...,Activity Notification
1,"Faith in Glendale, United States purchased a C...",Activity Notification
2,Sharmeen Atif From Karachi just bought Stylish...,Activity Notification
3,9 people are viewing this.,Activity Notification
4,5338 people viewed this in the last hour,Activity Notification


In [10]:
# Check the distribution of Pattern Type

print(df['Pattern Type'].value_counts())

Low-stock Message                   631
Activity Notification               313
Confirmshaming                      169
Countdown Timer                     149
Limited-time Message                 88
High-demand Message                  47
Pressured Selling                    45
Hard to Cancel                       30
Visual Interference                  14
Trick Questions                       9
Hidden Subscription                   6
Forced Enrollment                     4
Sneak into Basket                     3
Hidden Costs                          3
Testimonials of Uncertain Origin      1
Name: Pattern Type, dtype: int64


In [11]:
# Only keep the Pattern Type we need to train the model

types = ['Low-stock Message','Activity Notification',
         'Countdown Timer','Limited-time Message','High-demand Message']

data = df[df['Pattern Type'].isin(types)]

data.head(5)

Unnamed: 0,Pattern String,Pattern Type
0,Collin P. from Grandview Missouri just bought ...,Activity Notification
1,"Faith in Glendale, United States purchased a C...",Activity Notification
2,Sharmeen Atif From Karachi just bought Stylish...,Activity Notification
3,9 people are viewing this.,Activity Notification
4,5338 people viewed this in the last hour,Activity Notification


In [16]:
# check the information of the data

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1228 entries, 0 to 1697
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Pattern String  1228 non-null   object
 1   Pattern Type    1228 non-null   object
dtypes: object(2)
memory usage: 28.8+ KB


In [17]:
# Check the distribution of Pattern Type

print(data['Pattern Type'].value_counts())

Low-stock Message        631
Activity Notification    313
Countdown Timer          149
Limited-time Message      88
High-demand Message       47
Name: Pattern Type, dtype: int64


In [19]:
# ---- encode the pattern category type into integers (7 types in total, encoded into integers from 0-6).

data["type_id"] = data['Pattern Type'].factorize()[0]

# ---- Get the mapping of the encoding integers and the pattern categories.
# ---- {'Social Proof': 0, 'Misdirection': 1, 'Urgency': 2, 'Forced Action': 3, 'Obstruction': 4, 'Sneaking': 5, 'Scarcity': 6}

type_id_df = data[['Pattern Type', 'type_id']
                    ].drop_duplicates().sort_values('type_id')
type_to_id = dict(type_id_df.values)
id_to_type = dict(
    type_id_df[['type_id', 'Pattern Type']].values)


# ---- result of the mapping

print(type_to_id)

{'Activity Notification': 0, 'Countdown Timer': 1, 'High-demand Message': 2, 'Limited-time Message': 3, 'Low-stock Message': 4}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["type_id"] = data['Pattern Type'].factorize()[0]


In [20]:
# ---- convert a collection of raw documents to a matrix of TF-IDF features; Equivalent to CountVectorizer followed by TfidfTransformer.
# 'sublinear_tf=True' is used to normalise bias of term frequency ("where a term that is more frequent shouldn't be X times as important"). It is set to True to use a logarithmic form for frequency.
# 'norm='l2'' is the default setting of 'norm', used to reduce document length bias, to ensure all our feature vectors have a enclidian norm of 1.
# 'min_df=5', means when building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold (which is 5 here), which is the minimum numbers of documents a word must be present in to be kept.
# 'ngram_range=(1,2)' means unigrams and bigrams will be extracted, means we want to consider both unigrams and bigrams.
# 'stop_words='english'', if a string, it is passed to _check_stop_list and the appropriate stop list is returned. To remove all common pronouns ("a", "the" ...), reducing the number of noisy features.

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(data['Pattern String']).toarray()
labels = data.type_id

print(features.shape)

(1228, 195)


In [21]:
# The result means each of the 1512 pattern strings is represented by 303 features, representing the tf-idf score for different unigrams and bigrams.

N = 3   # every n-gram will give 3 examples

for Type, type_id in sorted(type_to_id.items()):
  features_chi2 = chi2(features, labels == type_id)
  indices = np.argsort(features_chi2[0])
  feature_names = np.array(tfidf.get_feature_names())[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("Pattern Type: '{}':".format(Type))
  print("  . Most correlated unigrams:\n     . {}".format('\n     . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n     . {}".format('\n     . '.join(bigrams[-N:])))

Pattern Type: 'Activity Notification':
  . Most correlated unigrams:
     . bought
     . left
     . purchased
  . Most correlated bigrams:
     . just bought
     . states purchased
     . united states
Pattern Type: 'Countdown Timer':
  . Most correlated unigrams:
     . minutes
     . 09
     . ends
  . Most correlated bigrams:
     . sale ends
     . reserved 09
     . order reserved
Pattern Type: 'High-demand Message':
  . Most correlated unigrams:
     . worries
     . high
     . demand
  . Most correlated bigrams:
     . ordered high
     . reserved order
     . high demand
Pattern Type: 'Limited-time Message':
  . Most correlated unigrams:
     . offer
     . limited
     . time
  . Most correlated bigrams:
     . free shipping
     . time offer
     . limited time
Pattern Type: 'Low-stock Message':
  . Most correlated unigrams:
     . purchased
     . stock
     . left
  . Most correlated bigrams:
     . hurry left
     . limited time
     . left stock


# Training Preparation
----------

In [23]:
# ----------- Split the dataset into training and testing ------
String_train, String_test, Type_train, Type_test = train_test_split(
    data['Pattern String'], data['Pattern Type'], train_size=.6)

In [27]:
# Distribution of training data

print(Type_train.value_counts())

# Distribution of testing data

print(Type_test.value_counts())

Low-stock Message        378
Activity Notification    183
Countdown Timer           96
Limited-time Message      50
High-demand Message       29
Name: Pattern Type, dtype: int64
Low-stock Message        253
Activity Notification    130
Countdown Timer           53
Limited-time Message      38
High-demand Message       18
Name: Pattern Type, dtype: int64


In [29]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(Type_train)
y_train = encoder.transform(Type_train)
y_test = encoder.transform(Type_test)

# check the mapping of encoding results (from 0 to 6 representing 'Forced Action', 'Misdirection'......)

print(list(encoder.classes_))

['Activity Notification', 'Countdown Timer', 'High-demand Message', 'Limited-time Message', 'Low-stock Message']


In [30]:
# Check the frequency distribution of the training pattern category with pattern category names.

(unique, counts) = np.unique(Type_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[['Activity Notification' 183]
 ['Countdown Timer' 96]
 ['High-demand Message' 29]
 ['Limited-time Message' 50]
 ['Low-stock Message' 378]]


In [31]:
# Check the frequency distribution of the encoded training pattern category with encoded integers.

(unique, counts) = np.unique(y_train, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[[  0 183]
 [  1  96]
 [  2  29]
 [  3  50]
 [  4 378]]


In [32]:
# Check the frequency distribution of the encoded testing pattern category with encoded integers.

(unique, counts) = np.unique(y_test, return_counts=True)
frequencies = np.asarray((unique, counts)).T

print(frequencies)

[[  0 130]
 [  1  53]
 [  2  18]
 [  3  38]
 [  4 253]]


### Text Encoding
--------

In [33]:
# get the word count vector of the pattern string to encode the pattern string.

cv = CountVectorizer()
X_train = cv.fit_transform(String_train)

In [34]:

# save the CountVectorizer to disk
joblib.dump(cv, 'category_CountVectorizer.joblib')

['category_CountVectorizer.joblib']

# Model Training
------

In [36]:
# Five models are tested:
# -- Logistic Regression
# -- Linear Support Vector Machine
# -- Random Forest
# -- Decision Tree
# -- Multinomial Naive Bayes

classifiers = [LogisticRegression(),LinearSVC(), RandomForestClassifier(), DecisionTreeClassifier(), MultinomialNB()]

# Calculate the accuracies of different classifiers using default settings.

acc = []
cm = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(cv.transform(String_test))
    acc.append(metrics.accuracy_score(y_test, y_pred))
    cm.append(metrics.confusion_matrix(y_test, y_pred))

# List the accuracies of different classifiers.

for i in range(len(classifiers)):
    print(f"{classifiers[i]} accuracy: {acc[i]}")
    print(f"Confusion Matris: {cm[i]}")

LogisticRegression() accuracy: 0.975609756097561
Confusion Matris: [[127   1   0   2   0]
 [  0  51   0   0   2]
 [  1   1  16   0   0]
 [  0   2   0  36   0]
 [  1   2   0   0 250]]
LinearSVC() accuracy: 0.967479674796748
Confusion Matris: [[127   1   0   1   1]
 [  0  50   0   1   2]
 [  0   1  16   0   1]
 [  0   1   0  35   2]
 [  1   3   0   1 248]]
RandomForestClassifier() accuracy: 0.9735772357723578
Confusion Matris: [[128   1   0   1   0]
 [  0  50   0   0   3]
 [  1   1  16   0   0]
 [  0   2   0  35   1]
 [  1   2   0   0 250]]
DecisionTreeClassifier() accuracy: 0.9573170731707317
Confusion Matris: [[124   4   1   0   1]
 [  3  47   1   0   2]
 [  0   1  16   0   1]
 [  0   1   0  35   2]
 [  3   1   0   0 249]]
MultinomialNB() accuracy: 0.9654471544715447
Confusion Matris: [[126   0   0   2   2]
 [  0  47   0   1   5]
 [  1   1  16   0   0]
 [  0   1   0  35   2]
 [  0   1   1   0 251]]


# Logistic Regression
------

In [44]:
clf_lr = LogisticRegression().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_lr.get_params()))

y_pred = clf_lr.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'penalty':['l1','l2'], 
              'solver':['lbfgs','newton-cg','sag']}

gs = GridSearchCV(clf_lr, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_lr = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_lr.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_penalty', 'param_solver']]

Parameters of the classifier:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Accuracy:0.975609756097561

The distribution of predicted result of default model:[[  0 129]
 [  1  57]
 [  2  16]
 [  3  38]
 [  4 252]]

Confusion Matrix of the result:[[127   1   0   2   0]
 [  0  51   0   0   2]
 [  1   1  16   0   0]
 [  0   2   0  36   0]
 [  1   2   0   0 250]]

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.2s finished


Unnamed: 0,rank_test_score,mean_test_score,param_penalty,param_solver
0,1,0.974177,l2,sag
1,2,0.970105,l2,lbfgs
2,2,0.970105,l2,newton-cg
3,4,,l1,lbfgs
4,5,,l1,newton-cg
5,6,,l1,sag


In [45]:
# Parameters of the best model

print('Parameters of the classifier:\n{}\n'.format(best_lr.best_params_))

y_pred_best = best_lr.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))

# save the model to local disk

joblib.dump(best_lr, 'lr_category_classifier.joblib')

Parameters of the classifier:
{'penalty': 'l2', 'solver': 'sag'}

Accuracy:0.9735772357723578

The distribution of predicted result of best model:[[  0 129]
 [  1  56]
 [  2  16]
 [  3  39]
 [  4 252]]


['lr_category_classifier.joblib']

# Random Forest
------

In [42]:
clf_rf = RandomForestClassifier().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_rf.get_params()))

y_pred = clf_rf.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'bootstrap':[True,False], 
              'criterion':['gini','entropy'],
              'max_depth':[10,20,30,40,50,60,70,80,90,100, None],
              'min_samples_leaf':[1,2,4],
              'min_samples_split':[2,5,10],
              'n_estimators':[100,200,300,400,500,600]}

gs = GridSearchCV(clf_rf, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_rf = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_rf.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_bootstrap', 'param_criterion',
            'param_max_depth','param_min_samples_leaf','param_min_samples_split','param_n_estimators']]

Parameters of the classifier:
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Accuracy:0.9715447154471545

The distribution of predicted result of default model:[[  0 130]
 [  1  58]
 [  2  15]
 [  3  36]
 [  4 253]]

Confusion Matrix of the result:[[127   2   0   1   0]
 [  0  51   0   0   2]
 [  2   1  15   0   0]
 [  0   2   0  35   1]
 [  1   2   0   0 250]]

Fitting 5 folds for each of 2376 candidates, totalling 11880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   18.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 14.9min
[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 11226 tasks      

Unnamed: 0,rank_test_score,mean_test_score,param_bootstrap,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators
0,1,0.978250,False,gini,50,1,2,300
1,1,0.978250,True,gini,100,1,2,100
2,3,0.976898,True,gini,60,1,2,200
3,4,0.976898,False,entropy,60,1,2,500
4,4,0.976898,False,gini,60,1,2,500
...,...,...,...,...,...,...,...,...
2371,2372,0.846442,True,entropy,10,4,2,300
2372,2373,0.846415,True,gini,10,4,2,500
2373,2374,0.845155,True,entropy,60,4,2,100
2374,2375,0.845109,True,gini,10,4,2,200


In [43]:
# Parameters of the best model

print('Parameters of the classifier:\n{}\n'.format(best_rf.best_params_))

y_pred_best = best_rf.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))

# save the model to local disk

joblib.dump(best_rf, 'rf_category_classifier.joblib')

Parameters of the classifier:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Accuracy:0.967479674796748

The distribution of predicted result of best model:[[  0 127]
 [  1  57]
 [  2  16]
 [  3  39]
 [  4 253]]


['rf_category_classifier.joblib']

# SVM
----

In [39]:
clf_svm = LinearSVC().fit(X_train, y_train)

print('Parameters of the classifier:\n{}\n'.format(clf_svm.get_params()))

y_pred = clf_svm.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}\n'.format(frequencies))

cm = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix of the result:{}\n'.format(cm))

# Parameter tunning
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 5, 10]}

gs = GridSearchCV(clf_svm, param_grid, cv=5,
                  verbose=1, n_jobs=-1)

best_svm = gs.fit(X_train, y_train)

scores_df = pd.DataFrame(best_svm.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
print(scores_df [['rank_test_score', 'mean_test_score', 'param_penalty', 'param_C']])

Parameters of the classifier:
{'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}

Accuracy:0.967479674796748

The distribution of predicted result of default model:[[  0 128]
 [  1  56]
 [  2  16]
 [  3  38]
 [  4 254]]

Confusion Matrix of the result:[[127   1   0   1   1]
 [  0  50   0   1   2]
 [  0   1  16   0   1]
 [  0   1   0  35   2]
 [  1   3   0   1 248]]

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


   rank_test_score  mean_test_score param_penalty param_C
0                1         0.978259            l2       5
1                1         0.978259            l2      10
2                3         0.976898            l2       1
3                4         0.971465            l2     0.1
4                5              NaN            l1     0.1
5                6              NaN            l1       1
6                7              NaN            l1       5
7                8              NaN            l1      10


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.9s finished


In [40]:
print('Parameters of the classifier:\n{}\n'.format(best_svm.best_params_))

y_pred_best = best_svm.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of best model:{}'.format(frequencies))


Parameters of the classifier:
{'C': 5, 'penalty': 'l2'}

Accuracy:0.9715447154471545

The distribution of predicted result of best model:[[  0 128]
 [  1  56]
 [  2  16]
 [  3  39]
 [  4 253]]


In [41]:
# save the model to local disk

joblib.dump(best_svm, 'svm_category_classifier.joblib')

['svm_category_classifier.joblib']

# Multinomial Naive Bayes
-----

In [37]:
clf_mnb = MultinomialNB().fit(X_train, y_train)
print('Parameters of the classifier:\n{}\n'.format(clf_mnb.get_params()))

y_pred = clf_mnb.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred, y_test)))

(unique, counts) = np.unique(y_pred, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of default model:{}'.format(frequencies))

# Parameter tunning
param_grid = {'alpha':[0,1],
              'fit_prior':[True, False]}

from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(clf_mnb,param_grid,cv=5,
                      verbose = 1, n_jobs = -1)

best_mnb = gs.fit(X_train,y_train)

scores_df = pd.DataFrame(best_mnb.cv_results_)
scores_df = scores_df.sort_values(by=['rank_test_score']).reset_index(drop='index')
scores_df [['rank_test_score', 'mean_test_score', 'param_alpha', 'param_fit_prior']]

Parameters of the classifier:
{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

Accuracy:0.9654471544715447

The distribution of predicted result of default model:[[  0 127]
 [  1  50]
 [  2  17]
 [  3  38]
 [  4 260]]
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    2.2s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    2.2s finished


Unnamed: 0,rank_test_score,mean_test_score,param_alpha,param_fit_prior
0,1,0.966051,1,True
1,2,0.957933,1,False
2,3,0.955157,0,True
3,4,0.953806,0,False


In [38]:
y_pred_best = best_mnb.predict(cv.transform(String_test))

print("Accuracy:{}\n".format(metrics.accuracy_score(y_pred_best, y_test)))

(unique, counts) = np.unique(y_pred_best, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print('The distribution of predicted result of the best model:{}'.format(frequencies))

 # save the model to local disk

joblib.dump(best_mnb, 'mnb_category_classifier.joblib')

Accuracy:0.9654471544715447

The distribution of predicted result of the best model:[[  0 127]
 [  1  50]
 [  2  17]
 [  3  38]
 [  4 260]]


['mnb_category_classifier.joblib']