In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, confusion_matrix as cm, log_loss as l_l, precision_score, roc_auc_score

In [2]:
pas = pd.read_csv('new_passwords.csv')

In [3]:
pas.drop('password', axis = 1, inplace = True)

In [4]:
X = pas.values[:, 1:]
y = pas.values[:, 0]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [6]:
GNB = GaussianNB()
MNB = MultinomialNB()
LR = LogisticRegression(multi_class = 'multinomial')
DTC = DecisionTreeClassifier(max_depth = 3)
L_SVC = LinearSVC()
SGD = SGDClassifier()

In [7]:
algos = [GNB,
         MNB,
         LR, 
         DTC,
         L_SVC,
         SGD]

In [10]:
def preds(alg, x_train=X_train, 
          y_train=y_train, 
          test_x=X_test,
          test_y=y_test):
    
    alg.fit(x_train, y_train)
    preds = alg.predict(test_x)
    
    results = {'Algorithm': alg, 
               'F1_score(macro)': f1_score(test_y, preds, average='macro'),
               'Precision_score(macro)': precision_score(test_y, preds, average='macro'),
               'F1_score(micro)': f1_score(test_y, preds, average='micro'),
               'Precision_score(micro)': precision_score(test_y, preds, average='micro'),
               'Confusion_matrix': cm(test_y, preds)}
    
    #confusion_matrix = cm(test_y, preds)
    
    return results #confusion_matrix
    

In [11]:
for alg in algos:
    print(preds(alg))

{'Algorithm': GaussianNB(), 'F1_score(macro)': 0.5438825920775927, 'Precision_score(macro)': 0.6045269128927915, 'F1_score(micro)': 0.6474528532941465, 'Precision_score(micro)': 0.6474528532941465, 'Confusion_matrix': array([[  167, 25891,   699],
       [   53, 29056,   742],
       [  253,  1151, 23648]], dtype=int64)}
{'Algorithm': MultinomialNB(), 'F1_score(macro)': 0.6657824176625943, 'Precision_score(macro)': 0.6783886767582671, 'F1_score(micro)': 0.6519593436198874, 'Precision_score(micro)': 0.6519593436198874, 'Confusion_matrix': array([[14471, 11699,   587],
       [11047, 18272,   532],
       [ 2680,  1876, 20496]], dtype=int64)}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Algorithm': LogisticRegression(multi_class='multinomial'), 'F1_score(macro)': 0.9990581736399219, 'Precision_score(macro)': 0.9990243276369055, 'F1_score(micro)': 0.9990448199853049, 'Precision_score(micro)': 0.9990448199853049, 'Confusion_matrix': array([[26757,     0,     0],
       [   30, 29790,    31],
       [    0,    17, 25035]], dtype=int64)}


  _warn_prf(average, modifier, msg_start, len(result))


{'Algorithm': DecisionTreeClassifier(max_depth=3), 'F1_score(macro)': 0.49181671423986834, 'Precision_score(macro)': 0.4811587099231088, 'F1_score(micro)': 0.587374479549351, 'Precision_score(micro)': 0.587374479549351, 'Confusion_matrix': array([[    0, 26518,   239],
       [    0, 29564,   287],
       [    0,  6651, 18401]], dtype=int64)}




{'Algorithm': LinearSVC(), 'F1_score(macro)': 0.9943418428120542, 'Precision_score(macro)': 0.9942202342661918, 'F1_score(micro)': 0.9942566740142053, 'Precision_score(micro)': 0.9942566740142053, 'Confusion_matrix': array([[26757,     0,     0],
       [  200, 29514,   137],
       [    0,   132, 24920]], dtype=int64)}
{'Algorithm': SGDClassifier(), 'F1_score(macro)': 0.9874537570275637, 'Precision_score(macro)': 0.9873676200276796, 'F1_score(micro)': 0.9872275287778595, 'Precision_score(micro)': 0.9872275287778595, 'Confusion_matrix': array([[26757,     0,     0],
       [  544, 29141,   166],
       [    1,   332, 24719]], dtype=int64)}


In [64]:
import joblib
joblib.dump(LR, 'LogisticRegression_model.jl')

['LogisticRegression_model.jl']