In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [16]:
df = pd.read_csv("passwords_dataset.csv",on_bad_lines='skip')
df.head(10)

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
5,AVYq1lDE4MgAZfNt,2
6,u6c8vhow,1
7,v1118714,1
8,universe2908,1
9,as326159,1


In [17]:
##Check if the datase has null values
df.isna().sum()

Unnamed: 0,0
password,1
strength,0


In [18]:
## Delete the null values
df_psw = df.dropna().copy()

In [19]:
## Verify the dataset is free of null values
df_psw.isna().sum()

Unnamed: 0,0
password,0
strength,0


In [20]:
## Check if the data set has duplicate values
df_psw.duplicated().sum()

np.int64(0)

In [21]:
## Mapping the strength column to weak,medium,strong values
df_psw['strength'] = df_psw['strength'].map({0:'Weak',
                                         1:'Medium',
                                         2:'Strong'})


In [22]:
df_psw.head()

Unnamed: 0,password,strength
0,kzde5577,Medium
1,kino3434,Medium
2,visi7k1yr,Medium
3,megzy123,Medium
4,lamborghin1,Medium


In [23]:
## Create a function to tokenize the password
def word(password):
  return [char for char in password]

In [24]:
## create the arrays
X = np.array(df_psw['password'])
y = np.array(df_psw['strength'])

In [25]:
## create the object tdif and fit-transform X
tdif = TfidfVectorizer(tokenizer=word)
X = tdif.fit_transform(X)



In [28]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [34]:
## Training of the model to predict the strength of the password
rf = RandomForestClassifier(n_estimators=150,random_state=42)
rf.fit(X_train,y_train)
print(rf.score(X_test,y_test))

0.9564019473149752


In [60]:
random_psw = ["1234","F3ynM4n725195_","D1m4nT3Hola"]

In [61]:
## Checking the performance of the mocel using 3 random passwords
for psw in random_psw:
  new_psw = tdif.transform([psw]).toarray()
  print(rf.predict(new_psw),psw)

['Weak'] 1234
['Strong'] F3ynM4n725195_
['Medium'] D1m4nT3Hola
