<a href="https://colab.research.google.com/github/RayThibodeaux/password-strength/blob/main/Password_Project_Count_Vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
#Dataframes, manipulation, and widgets
import pandas as pd
import numpy as np
import plotly.express as px
import ipywidgets as widgets
import time
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import IntProgress

#Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Train test split
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Evaluate
from sklearn.metrics import confusion_matrix,accuracy_score

In [27]:
#Creating a dataframe with the password data
passwords = pd.read_csv('passwords.csv',on_bad_lines='skip')
passwords

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


In [28]:
#View na values in data
passwords.isna().sum()

password    1
strength    0
dtype: int64

In [29]:
#View na values in data
passwords[passwords['password'].isna()]

Unnamed: 0,password,strength
367579,,0


In [30]:
#Remove na value since it's only one value out of 669k
passwords = passwords.dropna()

In [31]:
#Passsword strength distribution - Normal and Discrete
fig = px.histogram(passwords, x="strength",nbins=5,text_auto=True)
fig.update_layout(bargap=0.5)
fig.show()

In [32]:
#Creating an array of data - we need an array because it is more efficient
password_a = np.array(passwords)

#Password array
indep_x = [i[0] for i in password_a]
indep_x = np.array(indep_x)

#Strength array
depend_y = [i[1] for i in password_a]
depend_y = np.array(depend_y)

In [33]:
#Function to split each word to character
def split_text(dataset):
    character=[]
    for i in dataset:
        character.append(i)
    return character

In [34]:
#Tokenize each character and transform to vector - count
count_vector = CountVectorizer(tokenizer=split_text)
count_word_vector = count_vector.fit_transform(indep_x)

In [35]:
#View count vector
count_view = count_word_vector[0]
count_df = pd.DataFrame(count_view.T.todense(), index=count_vector.get_feature_names_out(), columns=['Count'])
count_df.sort_values(by=['Count'], ascending=False)

Unnamed: 0,Count
5,2
7,2
z,1
k,1
d,1
...,...
\,0
],0
^,0
_,0


In [36]:
#Splitting dataset into train test split for count - %80 train, %20 test
X_train, X_test, y_train, y_test = train_test_split(count_word_vector, depend_y, test_size=0.2)

In [37]:
#Creating model parameters
log_reg_params = [{"max_iter": 1000}]
bn_naive_bayes_params = [{}]
dec_tree_params = [{}]
rand_for_params = [{}]
kneighbors_params = [{}]

In [38]:
#Creating list of models
model_list = [
    ["Logistic Regression", LogisticRegression, log_reg_params],
    ["Bernouli Naive Bayes", BernoulliNB, bn_naive_bayes_params],
    ["Decision Tree", DecisionTreeClassifier, dec_tree_params],
    ["Random Forest", RandomForestClassifier, rand_for_params],
    ["K-NN", KNeighborsClassifier, kneighbors_params],
]

In [39]:
#Iterate through each model
overview = []
for mn, m, p_list in model_list:
    for p in p_list:
        model = m(**p)
        model.fit(X_train,y_train)
        score = model.score(X_test,y_test)
        overview.append((mn,m,p,score))

In [40]:
#View scores of each model
overview.sort(key=lambda x:x[-1], reverse=True)
for mn, m, p, score in overview:
    print(mn, p, score)

Logistic Regression {'max_iter': 1000} 0.9997909324413118
Random Forest {} 0.8916731378053879
Bernouli Naive Bayes {} 0.8131981363120483
Decision Tree {} 0.8058957051550086
K-NN {} 0.7648288632698166


In [41]:
#Logistic Regression Model - Count Vector
log_model = LogisticRegression(max_iter=10000)
log_model.fit(X_train,y_train)

LogisticRegression(max_iter=10000)

In [48]:
#Interactive password input that predicts password strength
@interact
def interact_pass(Password=''):
    X_manual = count_vector.transform([Password])
    pr = log_model.predict(X_manual)
    prog = widgets.IntProgress(min=0,max=2,style={'bar_color': 'lightblue'})
    prog.value = pr[0]
    display(prog)

interactive(children=(Text(value='', description='Password'), Output()), _dom_classes=('widget-interact',))