In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
#import necessary dependencies

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

In [3]:
password_checker = pd.read_csv("/content/data.csv.zip", on_bad_lines='skip')
print(password_checker.sample(15))

                  password  strength
306464            pasming3         1
501240           w2243s123         1
595490           brdadmin2         1
21741           thjckdb743         1
147687             qwerty3         0
464167     N3E4dYJ4U2mAEKk         2
90531     j0cMXmDA4MgvPY9V         2
488505  taiwo@gbadegeso@10         2
247770           5856797za         1
592464            olevo335         1
663395          arcangel18         1
645877          26106jochy         1
491521           ali512360         1
276839         wkdablpcs43         1
207046             hilari0         0


In [4]:
for column in password_checker.columns:
    print(f"Column '{column}': {password_checker[column].nunique()} unique values")

Column 'password': 669639 unique values
Column 'strength': 3 unique values


The dataset contains two columns: password and strength. In the strength column:

0 indicates a weak password

1 indicates a medium-strength password

2 indicates a strong password

It also has 669639 password samples

In [5]:
password_checker.isnull().sum()

Unnamed: 0,0
password,1
strength,0


In [6]:
print({password_checker.shape[0]})

{669640}


In [7]:
# Drop the row that is empty
password_checker.dropna(subset=['password'], inplace=True)

# To count the characters for each password
password_checker["password"] = password_checker["password"].apply(lambda x: x.strip())
password_checker["password"] = password_checker["password"].apply(lambda x: x.replace(" ", ""))
password_checker["characters"] = password_checker["password"].apply(len)
password_checker.head()

Unnamed: 0,password,strength,characters
0,kzde5577,1,8
1,kino3434,1,8
2,visi7k1yr,1,9
3,megzy123,1,8
4,lamborghin1,1,11


In [8]:
print({password_checker.shape[0]})

{669639}


In [9]:
# Tokenize the dataset before model training so the model can learn from the combinations of digits, letters, and symbols
def word(password):
    character_array = []
    for i in password:
        character_array.append(i)
    return character_array

In [10]:
# Model training

# Define feature and target
X = np.array(password_checker["password"])
y = np.array(password_checker["strength"])

vectorizer = TfidfVectorizer(tokenizer=word)
X = vectorizer.fit_transform(X)

# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [12]:
# Train multiple models to predict password strength

classifiers = {"xgb" : XGBClassifier(),
"mlp" : MLPClassifier(),
"cat" : CatBoostClassifier(),
"rfc" : RandomForestClassifier(),
"dtc" : DecisionTreeClassifier()}

In [13]:
# Determine the optimal model

from sklearn.metrics import accuracy_score, precision_score

accuracy= []
precision= []
for classifier in classifiers.values():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy.append(accuracy_score(y_pred, y_test))
    precision.append(precision_score(y_pred, y_test, average='macro'))



Learning rate set to 0.109018
0:	learn: 0.9947955	total: 890ms	remaining: 14m 49s
1:	learn: 0.9159591	total: 1.42s	remaining: 11m 50s
2:	learn: 0.8561764	total: 1.89s	remaining: 10m 26s
3:	learn: 0.8056954	total: 2.4s	remaining: 9m 58s
4:	learn: 0.7637912	total: 2.94s	remaining: 9m 44s
5:	learn: 0.7298474	total: 3.48s	remaining: 9m 35s
6:	learn: 0.6971999	total: 4.29s	remaining: 10m 9s
7:	learn: 0.6724730	total: 5.35s	remaining: 11m 3s
8:	learn: 0.6501887	total: 6.15s	remaining: 11m 17s
9:	learn: 0.6287470	total: 6.94s	remaining: 11m 27s
10:	learn: 0.6089694	total: 7.43s	remaining: 11m 8s
11:	learn: 0.5886738	total: 8.04s	remaining: 11m 2s
12:	learn: 0.5738624	total: 8.62s	remaining: 10m 54s
13:	learn: 0.5569745	total: 9.13s	remaining: 10m 43s
14:	learn: 0.5424063	total: 9.66s	remaining: 10m 34s
15:	learn: 0.5303366	total: 10.1s	remaining: 10m 21s
16:	learn: 0.5167810	total: 10.7s	remaining: 10m 18s
17:	learn: 0.5057845	total: 11.3s	remaining: 10m 19s
18:	learn: 0.4945938	total: 11.8s	

In [14]:
model_results = pd.DataFrame({"Models":classifiers.keys(), "accuracy_score":accuracy, "precision_score":precision})
model_results


Unnamed: 0,Models,accuracy_score,precision_score
0,xgb,0.981139,0.965408
1,mlp,0.985425,0.979348
2,cat,0.977525,0.960547
3,rfc,0.956656,0.907739
4,dtc,0.926961,0.885919


MLP classifier was the best model in accuracy and precision, so we will be using it.


In [15]:
# testing the MLPclassifier
import getpass
user = getpass.getpass("Enter Password: ")
data = vectorizer.transform([user]).toarray()

# Select the MLPClassifier from the dictionary of classifiers
mlp_classifier_model = classifiers['mlp']
output = mlp_classifier_model.predict(data)

# Map the numeric output to descriptive labels
strength_mapping = {0: "Weak", 1: "Medium", 2: "Strong"}
predicted_strength = strength_mapping[output[0]]

print(f"This password is {predicted_strength}")

Enter Password: ··········
This password is Weak


In [16]:
import joblib

# Save the MLPClassifier model
joblib.dump(mlp_classifier_model, 'mlp_classifier_model.joblib')
print("MLPClassifier model saved as 'mlp_classifier_model.joblib'")

# Save the TfidfVectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
print("TfidfVectorizer saved as 'tfidf_vectorizer.joblib'")

MLPClassifier model saved as 'mlp_classifier_model.joblib'
TfidfVectorizer saved as 'tfidf_vectorizer.joblib'


In [17]:

# test saved model and vectorizer
import joblib

model = joblib.load("/content/mlp_classifier_model.joblib")
vectorizer = joblib.load("/content/tfidf_vectorizer.joblib")

X = vectorizer.transform(["P@ssw0rd123"])
prediction = model.predict(X)

# Map the numeric output to descriptive labels
strength_mapping = {0: "Weak", 1: "Medium", 2: "Strong"}
predicted_strength_loaded = strength_mapping[prediction[0]]

print(f"This password is {predicted_strength_loaded}")

This password is Strong
