<a href="https://colab.research.google.com/github/RayThibodeaux/password-strength/blob/main/Password_Project_TF_IDF_Vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
#Dataframes, manipulation, and widgets
import pandas as pd
import numpy as np
import plotly.express as px
import ipywidgets as widgets
from ipywidgets import IntProgress
from ipywidgets import interact, interactive, fixed, interact_manual

#Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#Train test split
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

#Evaluate
from sklearn.metrics import confusion_matrix,accuracy_score

In [38]:
#Creating a dataframe with the password data
passwords = pd.read_csv('passwords.csv',on_bad_lines='skip')
passwords

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


In [39]:
#View na values in data
passwords.isna().sum()

password    1
strength    0
dtype: int64

In [40]:
#View na values in data
passwords[passwords['password'].isna()]

Unnamed: 0,password,strength
367579,,0


In [41]:
#Remove na value since it's only one value out of 669k
passwords = passwords.dropna()

In [42]:
#Passsword strength distribution - Normal and Discrete
fig = px.histogram(passwords, x="strength",nbins=5,text_auto=True)
fig.update_layout(bargap=0.5)
fig.show()

In [43]:
#Creating an array of data - we need an array because it is more efficient
password_a = np.array(passwords)

#Password array
indep_x = [i[0] for i in password_a]
indep_x = np.array(indep_x)

#Strength array
depend_y = [i[1] for i in password_a]
depend_y = np.array(depend_y)

In [44]:
#Function to split each word to character
def split_text(dataset):
    character=[]
    for i in dataset:
        character.append(i)
    return character

In [45]:
#Tokenize each character and transform to vector - tfidf
tfidf_vector = TfidfVectorizer(tokenizer=split_text)
tfidf_word_vector = tfidf_vector.fit_transform(indep_x)

In [46]:
#View tfidf vector
tfidf_view = tfidf_word_vector[0]
tfidf_df = pd.DataFrame(tfidf_view.T.todense(), index=tfidf_vector.get_feature_names_out(), columns=['TF-IDF'])
tfidf_df.sort_values(by=['TF-IDF'], ascending=False)

Unnamed: 0,TF-IDF
7,0.591303
5,0.566899
z,0.335926
k,0.292247
d,0.285631
...,...
\,0.000000
],0.000000
^,0.000000
_,0.000000


In [47]:
#Splitting dataset into train test split for tfidf - %80 train, %20 test
X_train, X_test, y_train, y_test = train_test_split(tfidf_word_vector, depend_y, test_size=0.2)

In [48]:
#Creating model parameters
log_reg_params = [{"max_iter": 1000}]
bn_naive_bayes_params = [{}]
dec_tree_params = [{}]
rand_for_params = [{}]
kneighbors_params = [{}]

In [49]:
#Creating list of models
model_list = [
    ["Logistic Regression", LogisticRegression, log_reg_params],
    ["Bernouli Naive Bayes", BernoulliNB, bn_naive_bayes_params],
    ["Decision Tree", DecisionTreeClassifier, dec_tree_params],
    ["Random Forest", RandomForestClassifier, rand_for_params],
    ["K-NN", KNeighborsClassifier, kneighbors_params],
]

In [50]:
#Iterate through each model
overview = []
for mn, m, p_list in model_list:
    for p in p_list:
        model = m(**p)
        model.fit(X_train,y_train)
        score = model.score(X_test,y_test)
        overview.append((mn,m,p,score))

In [51]:
#View scores of each model
overview.sort(key=lambda x:x[-1], reverse=True)
for mn, m, p, score in overview:
    print(mn, p, score)

Random Forest {} 0.9566782151603846
Decision Tree {} 0.9285362284212413
Logistic Regression {'max_iter': 1000} 0.8184920255659758
Bernouli Naive Bayes {} 0.810532524938773
K-NN {} 0.7775371841586524


In [52]:
#Logistic Regression Model - Count Vector
ran_model = RandomForestClassifier()
ran_model.fit(X_train,y_train)

RandomForestClassifier()

In [55]:
#Interactive password input that predicts password strength
@interact
def interact_pass(Password=''):
    X_manual = tfidf_vector.transform([Password])
    pr = ran_model.predict(X_manual)
    prog = widgets.IntProgress(min=0,max=2,style={'bar_color': 'lightblue'})
    prog.value = pr[0]
    display(prog)

interactive(children=(Text(value='', description='Password'), Output()), _dom_classes=('widget-interact',))