<a href="https://colab.research.google.com/github/RayThibodeaux/password-strength/blob/main/Password_Project_Count_Vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Dataframes, manipulation, and widgets
import time
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import IntProgress
from ipywidgets import interact, interactive, fixed, interact_manual

#Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Balance data
from sklearn.utils import class_weight

#Train test split
from sklearn.model_selection import train_test_split

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

#Cross Validation
from sklearn.model_selection import cross_val_score

#Evaluate
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
#Load the password/strength table from CSV; lines pandas flags as bad are skipped instead of raising
passwords = pd.read_csv(
    'passwords.csv',
    on_bad_lines='skip',
)
passwords

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


In [None]:
#Count the missing entries in every column
passwords.isnull().sum()

password    1
strength    0
dtype: int64

In [None]:
#Show the rows whose password is missing
passwords.loc[passwords['password'].isna()]

Unnamed: 0,password,strength
367579,,0


In [None]:
#Discard every row holding a missing value - only one row out of ~669k is affected
passwords = passwords.dropna(axis=0, how='any')

In [None]:
#Password strength distribution - histogram of the strength labels with the count written on each bar
fig = px.histogram(passwords, x="strength", nbins=5, text_auto=True).update_layout(bargap=0.5)
fig.show()

In [None]:
#Whole frame as an array - not needed by the extraction below, kept in case other cells read it
password_a = np.array(passwords)

#Password array - selected by column name (not position) without a Python-level loop over every row.
#The str cast keeps any value the CSV parser read as a number as text, which the
#original list-to-array conversion did implicitly.
indep_x = passwords['password'].astype(str).to_numpy()

#Strength array
depend_y = passwords['strength'].to_numpy()

In [None]:
#Function to split each word to character
def split_text(dataset):
    """Return the items of ``dataset`` as a new list.

    For a string this yields its individual characters, which is how the
    vectorizer uses it as a tokenizer.
    """
    return list(dataset)

In [None]:
#Tokenize each password into single characters and transform to a count vector.
#lowercase=False keeps 'A' and 'a' as separate features - the default folds the text to
#lower case before tokenizing, which throws away the upper/lower-case mix of a password.
#token_pattern=None avoids the "token_pattern will not be used" warning that is raised
#when a custom tokenizer is supplied together with the default pattern.
count_vector = CountVectorizer(tokenizer=split_text, lowercase=False, token_pattern=None)
count_word_vector = count_vector.fit_transform(indep_x)

In [None]:
#Inspect the character counts of the first password, highest counts first
count_view = count_word_vector[0]
count_df = pd.DataFrame(
    count_view.T.toarray(),
    index=count_vector.get_feature_names_out(),
    columns=['Count'],
)
count_df.sort_values(by='Count', ascending=False)

Unnamed: 0,Count
5,2
7,2
z,1
k,1
d,1
...,...
\,0
],0
^,0
_,0


In [None]:
#Splitting dataset into train test split for count - 80% train, 20% test.
#random_state makes the split (and every score computed from it) repeatable between runs;
#stratify keeps the proportion of each strength class the same in both parts, which
#matters here because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    count_word_vector,
    depend_y,
    test_size=0.2,
    random_state=42,
    stratify=depend_y,
)

In [None]:
#Compute the 'balanced' weight of each strength class and print them
c_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(depend_y),
    y=depend_y,
)
print(c_weights)

[2.4884115  0.44930063 2.68488158]


In [None]:
#Creating model parameters - each entry is a list of keyword-argument dicts, one model is built per dict.
#All models use class_weight='balanced' to compensate for the imbalanced strength classes.
log_reg_params = [{'class_weight': 'balanced', "max_iter": 10000}]
#random_state pins the randomness of the tree-based models so their scores are repeatable
dec_tree_params = [{'class_weight': 'balanced', 'random_state': 42}]
rand_for_params = [{'class_weight': 'balanced', 'random_state': 42}]
#LinearSVC with its default iteration cap reported "Liblinear failed to converge";
#the cap is raised to the same value the logistic regression uses
l_svc_params = [{'class_weight': 'balanced', 'max_iter': 10000, 'random_state': 42}]

In [None]:
#Candidate models - every entry is [display name, estimator class, list of keyword-argument dicts]
model_list = [
    ["Logistic Regression", LogisticRegression, log_reg_params],
    ["Decision Tree", DecisionTreeClassifier, dec_tree_params],
    ["Random Forest", RandomForestClassifier, rand_for_params],
    ["SVC", LinearSVC, l_svc_params],
]

In [None]:
#Fit every model/parameter combination on the training split and record its accuracy
#on the test split - Count Vector - No Cross Validation
overview = []
for mn, m, p_list in model_list:
    for p in p_list:
        #fit returns the fitted estimator itself, so construction and training chain
        model = m(**p).fit(X_train, y_train)
        score = model.score(X_test, y_test)
        overview += [(mn, m, p, score)]


Liblinear failed to converge, increase the number of iterations.



In [None]:
#Rank the models from most to least accurate (in place) and print name, parameters and score
overview[:] = sorted(overview, key=lambda entry: entry[-1], reverse=True)
for mn, m, p, score in overview:
    print(mn, p, score)

Logistic Regression {'class_weight': 'balanced', 'max_iter': 10000} 0.9997610656472135
SVC {'class_weight': 'balanced'} 0.9988874619198376
Random Forest {'class_weight': 'balanced'} 0.8846693745893316
Decision Tree {'class_weight': 'balanced'} 0.8137581387013918


In [None]:
#Iterate through each model and score it with 5-fold cross validation - Count Vector - Cross Validation.
#The mean accuracy over the folds is stored for each model/parameter combination.
#n_jobs=-1 evaluates the folds in parallel on all available cores; run one fold after
#another this cell took long enough that the recorded run was interrupted.
overview = []
for mn, m, p_list in model_list:
    for p in p_list:
        model = m(**p)
        cross_score = cross_val_score(
            model,
            count_word_vector,
            depend_y,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
        ).mean()
        overview.append((mn, m, p, cross_score))

KeyboardInterrupt: ignored

In [None]:
#Rank the models by mean cross-validated accuracy (in place), best first, and print them
#- Count Vector - Cross Validation
overview[:] = sorted(overview, key=lambda entry: entry[-1], reverse=True)
for mn, m, p, cross_score in overview:
    print(mn, p, cross_score)

Logistic Regression {'class_weight': 'balanced', 'max_iter': 10000} 0.999719251890167
Decision Tree {'class_weight': 'balanced'} 0.8145329040176051


In [None]:
#Grid search over penalty/solver for the logistic regression, scored by accuracy.
#class_weight='balanced' matches the logistic regression used everywhere else in this
#notebook, so the search tunes the same kind of estimator that is trained afterwards.
log_model = LogisticRegression(class_weight='balanced')

#The 'sag' solver does not support the l1 penalty, so each solver gets its own sub-grid;
#a single flat grid would also try sag+l1, and those fits fail.
param_grid = [
    {'solver': ['saga'], 'penalty': ['l1', 'l2'], 'max_iter': [10000]},
    {'solver': ['sag'], 'penalty': ['l2'], 'max_iter': [10000]},
]

grid = GridSearchCV(estimator=log_model, param_grid=param_grid, scoring='accuracy')
g_results = grid.fit(X_train, y_train)

print("Highest Score: ", g_results.best_score_)
print("Highest Scoring Parameters: ", g_results.best_params_)


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge


The max_iter was reached which means the coef_ did not converge



In [None]:
#Logistic Regression Model - Count Vector
#The saga solver shuffles the data while fitting, so random_state is fixed to make the
#trained coefficients (and the score below) repeatable between runs.
log_model = LogisticRegression(
    class_weight='balanced',
    max_iter=10000,
    penalty='l1',
    solver='saga',
    random_state=42,
)
log_model.fit(X_train,y_train)

LogisticRegression(class_weight='balanced', max_iter=10000, penalty='l1',
                   solver='saga')

In [None]:
#Accuracy of the fitted logistic regression on the held-out test split
log_model_acc = accuracy_score(y_test, log_model.predict(X_test))
print(log_model_acc)

0.9966623857595126


In [None]:
#Widgets for the interactive strength prediction:
#an output area, a submit button and a text box for the password
out = widgets.Output()
button = widgets.Button(description='Submit Password')
password_text = widgets.Text(
    description='Password:',
    placeholder='Enter Password',
    disabled=False,
)

In [None]:
#Button function
def on_button_clicked(_):
    """Predict the strength of the typed password and show it as a progress bar.

    The click event argument is ignored. The password is vectorized with the
    fitted ``count_vector`` and classified by ``log_model``; the predicted
    class becomes the value of a progress bar whose range is 0-2.
    """
    #Disable the button before doing any work so repeated clicks cannot queue up
    #further predictions while this one is running
    button.disabled = True
    try:
        with out:
            out.clear_output()
            candidate = password_text.value
            if not candidate:
                #Nothing was typed - ask for input instead of scoring an empty string
                print('Please enter a password.')
                return
            X_manual = count_vector.transform([candidate])
            pr = log_model.predict(X_manual)
            #Progress bar
            prog = widgets.IntProgress(min=0, max=2, style={'bar_color': 'lightblue'})
            #Convert the NumPy scalar to a plain int before handing it to the widget
            prog.value = int(pr[0])
            display(prog)
            #Simple Bruteforce and Dictionary attack prevention: hold the button
            #disabled for 3 seconds after each prediction
            time.sleep(3)
    finally:
        #Always re-enable the button, even if the prediction raised
        button.disabled = False

#Register the click callback on the submit button
button.on_click(on_button_clicked)
#Stack the text box, button and output area vertically; as the cell's last expression the box is rendered
widgets.VBox(children=[password_text, button, out])