In [36]:
import pandas as pd
import math
from itertools import tee

Import the new CSV file

In [37]:
# Load every password as text. Without an explicit dtype pandas infers the
# column type per parsed chunk, so a chunk made only of numeric-looking
# passwords would be read as numbers and a value such as "000123" would
# silently lose its leading zeros.
# keep_default_na=False keeps strings like "null" or "NA" as literal text
# instead of converting them to NaN.
df = pd.read_csv(
    "rockyou_dataset.csv",
    encoding="utf-8",
    keep_default_na=False,
    dtype={"password": str},
)

In [38]:
# Sanity check: print the first five rows of the freshly loaded frame.
print(df.head(n=5))

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


In [39]:
# Coerce the column to text. Missing values are replaced by "" *before* the
# cast so that they end up empty rather than as a stringified NaN.
password_text = df["password"].fillna("").astype(str)
df["password"] = password_text

# Drop rows whose password is empty and renumber the remaining rows from 0.
df = df.loc[password_text.str.len().gt(0)].reset_index(drop=True)

In [40]:
# Print the first five rows again to confirm the cleaned column looks right.
print(df.head(n=5))

    password
0     123456
1      12345
2  123456789
3   password
4   iloveyou


Extract characteristic features

num_of_features

In [None]:
passwords = df["password"]

# Total number of characters in each password.
df["length"] = passwords.astype(str).str.len()

# Number of characters per ASCII character class. Anything that is not an
# ASCII letter or digit is counted as a special character.
char_class_patterns = {
    "num_letters": r"[A-Za-z]",
    "num_upper": r"[A-Z]",
    "num_lower": r"[a-z]",
    "num_digits": r"[0-9]",
    "num_special_char": r"[^A-Za-z0-9]",
}
for count_col, char_class in char_class_patterns.items():
    df[count_col] = passwords.str.count(char_class)

In [43]:
# Print the first five rows, now including the length and count columns.
print(df.head(n=5))

    password  length  num_letters  num_upper  num_lower  num_digits  \
0     123456       6            0          0          0           6   
1      12345       5            0          0          0           5   
2  123456789       9            0          0          0           9   
3   password       8            8          0          8           0   
4   iloveyou       8            8          0          8           0   

   num_special_char  
0                 0  
1                 0  
2                 0  
3                 0  
4                 0  


Boolean_features

In [45]:
#general
df["has_upper"] = df["num_upper"] > 0
df["has_num"] = df["num_digits"] > 0
df["has_special"] = df["num_special_char"] > 0
# First character
df["first_is_upper"] = df["password"].str.match(r"^[A-Z]")
df["first_is_digit"] = df["password"].str.match(r"^[0-9]")
df["first_is_special"] = df["password"].str.match(r"^[^A-Za-z0-9]")
# Last character
df["last_is_upper"] = df["password"].str[-1].str.match(r"[A-Z]")
df["last_is_digit"] = df["password"].str[-1].str.match(r"[0-9]")
df["last_is_special"] = df["password"].str[-1].str.match(r"[^A-Za-z0-9]")

In [46]:
# Names of the True/False flag columns that are stored as 0/1 integers.
bool_cols = [
    "has_upper", "has_num", "has_special",
    "first_is_upper", "first_is_digit", "first_is_special",
    "last_is_upper", "last_is_digit", "last_is_special",
]

# Cast one column at a time; True becomes 1 and False becomes 0.
for flag_col in bool_cols:
    df[flag_col] = df[flag_col].astype(int)

In [47]:
# Print the first five rows to check the flag columns now hold 0/1 values.
print(df.head(n=5))

    password  length  num_letters  num_upper  num_lower  num_digits  \
0     123456       6            0          0          0           6   
1      12345       5            0          0          0           5   
2  123456789       9            0          0          0           9   
3   password       8            8          0          8           0   
4   iloveyou       8            8          0          8           0   

   num_special_char  has_upper  has_num  has_special  first_is_upper  \
0                 0          0        1            0               0   
1                 0          0        1            0               0   
2                 0          0        1            0               0   
3                 0          0        0            0               0   
4                 0          0        0            0               0   

   first_is_digit  first_is_special  last_is_upper  last_is_digit  \
0               1                 0              0              1   
1 

ratio_features

In [48]:
# Share of the password taken up by each character class: the class count
# divided by the total length.
ratio_sources = {
    "ratio_letters": "num_letters",
    "ratio_uppercase": "num_upper",
    "ratio_lowercase": "num_lower",
    "ratio_digits": "num_digits",
    "ratio_symbols": "num_special_char",
}
for ratio_col, count_col in ratio_sources.items():
    df[ratio_col] = df[count_col] / df["length"]

In [49]:
# Print the first five rows, now including the ratio columns.
print(df.head(n=5))

    password  length  num_letters  num_upper  num_lower  num_digits  \
0     123456       6            0          0          0           6   
1      12345       5            0          0          0           5   
2  123456789       9            0          0          0           9   
3   password       8            8          0          8           0   
4   iloveyou       8            8          0          8           0   

   num_special_char  has_upper  has_num  has_special  ...  first_is_digit  \
0                 0          0        1            0  ...               1   
1                 0          0        1            0  ...               1   
2                 0          0        1            0  ...               1   
3                 0          0        0            0  ...               0   
4                 0          0        0            0  ...               0   

   first_is_special  last_is_upper  last_is_digit  last_is_special  \
0                 0              0      

entropy_features

functions

In [50]:
def shannon_entropy(pwd):
    """Return the Shannon entropy of ``pwd`` in bits per character.

    The entropy is computed from the relative frequency of each distinct
    character in the string.

    Args:
        pwd: The password to score. Any falsy value (for example ``""`` or
            ``None``) is treated as carrying no entropy.

    Returns:
        A non-negative float. ``0.0`` is returned for falsy input and for
        strings made of a single repeated character.
    """
    if not pwd:
        return 0.0

    # Tally characters in one pass instead of calling pwd.count() once per
    # distinct character, which rescans the whole string each time. A dict
    # also iterates in first-seen order, so the summation order no longer
    # depends on set hashing.
    counts = {}
    for ch in pwd:
        counts[ch] = counts.get(ch, 0) + 1

    total = len(pwd)
    entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())

    # When every character is identical the sum is 0.0 and its negation is
    # -0.0; normalise that to a plain 0.0.
    return entropy if entropy > 0 else 0.0

def ngrams(seq, n=2):
    """Return an iterator over the consecutive ``n``-item windows of ``seq``.

    Args:
        seq: Any iterable, for example a password string.
        n: Window size; must be at least 1. Defaults to 2 (bigrams).

    Returns:
        An iterator of ``n``-tuples. It is empty when ``seq`` has fewer than
        ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # One independent iterator per position in the window; the k-th iterator
    # is advanced k items so that zipping them yields sliding windows.
    # The iterators themselves are zipped; unpacking their contents into
    # zip() would treat every single item as its own iterable.
    windows = tee(seq, n)
    for offset, it in enumerate(windows):
        for _ in range(offset):
            next(it, None)
    return zip(*windows)

def bigram_entropy(pwd):
    """Return the Shannon entropy, in bits, of the adjacent character pairs.

    Args:
        pwd: The password to score; it is converted with ``str()`` first.

    Returns:
        A non-negative float. ``0.0`` is returned when the text has fewer
        than two characters or when every pair is identical.
    """
    pwd = str(pwd)
    if len(pwd) < 2:
        return 0.0

    # Tally pairs in one pass instead of calling list.count() once per
    # distinct pair, which rescans the whole list each time.
    counts = {}
    for pair in zip(pwd, pwd[1:]):
        counts[pair] = counts.get(pair, 0) + 1

    total = len(pwd) - 1  # number of adjacent pairs
    entropy = -sum((c / total) * math.log2(c / total) for c in counts.values())

    # When every pair is identical the sum is 0.0 and its negation is -0.0;
    # normalise that to a plain 0.0.
    return entropy if entropy > 0 else 0.0

# Character runs whose 3-character windows count as predictable patterns:
# the alphabet, the digits and the three QWERTY letter rows.
sequences = ["abcdefghijklmnopqrstuvwxyz", "0123456789", "qwertyuiop", "asdfghjkl", "zxcvbnm"]

# The three QWERTY letter rows only.
keyboard_sequences = ["qwertyuiop", "asdfghjkl", "zxcvbnm"]


def _sequence_penalized_entropy(pwd, seqs, window=3, weight=0.2):
    """Shannon entropy of lower-cased ``pwd`` minus a penalty per known run.

    Every ``window``-character slice of every string in ``seqs`` that occurs
    in the lower-cased password adds one penalty point; each point removes
    ``weight`` from the entropy. The result is floored at 0.
    """
    pwd = pwd.lower()
    hits = sum(
        1
        for seq in seqs
        for start in range(len(seq) - window + 1)
        if seq[start:start + window] in pwd
    )
    return max(0, shannon_entropy(pwd) - hits * weight)


def pattern_entropy(pwd):
    """Return the entropy of ``pwd`` penalised for runs found in ``sequences``.

    Args:
        pwd: The password string to score.

    Returns:
        The Shannon entropy of the lower-cased password minus 0.2 for every
        3-character slice of ``sequences`` it contains, never below 0.
    """
    return _sequence_penalized_entropy(pwd, sequences)


def keyboard_entropy(pwd):
    """Return the entropy of ``pwd`` penalised for keyboard-row runs.

    Args:
        pwd: The password string to score.

    Returns:
        The Shannon entropy of the lower-cased password minus 0.2 for every
        3-character slice of ``keyboard_sequences`` it contains, never
        below 0.
    """
    return _sequence_penalized_entropy(pwd, keyboard_sequences)

features

In [51]:
pw = df["password"]

# Per-character entropy, then the same value scaled by the password length.
df["shannon_entropy"] = pw.apply(shannon_entropy)
df["length_adjusted_entropy"] = df["shannon_entropy"] * pw.str.len()

# Remaining entropy variants, each computed row by row from the raw password.
entropy_scorers = (
    ("bigram_entropy", bigram_entropy),
    ("pattern_entropy", pattern_entropy),
    ("keyboard_entropy", keyboard_entropy),
)
for entropy_col, scorer in entropy_scorers:
    df[entropy_col] = pw.apply(scorer)

In [52]:
# Print the first five rows, now including the entropy columns.
print(df.head(n=5))

    password  length  num_letters  num_upper  num_lower  num_digits  \
0     123456       6            0          0          0           6   
1      12345       5            0          0          0           5   
2  123456789       9            0          0          0           9   
3   password       8            8          0          8           0   
4   iloveyou       8            8          0          8           0   

   num_special_char  has_upper  has_num  has_special  ...  ratio_letters  \
0                 0          0        1            0  ...            0.0   
1                 0          0        1            0  ...            0.0   
2                 0          0        1            0  ...            0.0   
3                 0          0        0            0  ...            1.0   
4                 0          0        0            0  ...            1.0   

   ratio_uppercase  ratio_lowercase  ratio_digits  ratio_symbols  \
0              0.0              0.0           1.

PCA combination

In [53]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

# Entropy features that are collapsed into a single combined score.
ent_cols = [
    "shannon_entropy",
    "length_adjusted_entropy",
    "bigram_entropy",
    "pattern_entropy",
    "keyboard_entropy",
]

# Only rows where every entropy feature is present take part in the PCA.
mask = df[ent_cols].notna().all(axis=1)
X = df.loc[mask, ent_cols].astype(float)

# Standardize so that no feature dominates purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto the first principal component.
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_scaled)  # shape (n_rows, 1)

# The sign of a principal component is arbitrary. Orient PC1 so that its
# loadings sum to a positive value, which makes a larger score correspond to
# larger feature values overall instead of depending on the solver's choice.
# The fitted component is flipped too so that pca.transform() stays
# consistent with the stored column.
if pca.components_[0].sum() < 0:
    pca.components_ *= -1
    pc1 = -pc1

# Store the score; rows excluded by the mask stay NaN. Resetting the column
# first keeps the cell idempotent when it is re-run.
df["combined_entropy_pca"] = np.nan
df.loc[mask, "combined_entropy_pca"] = pc1.ravel()

# Min-max normalize to [0, 1]. If every score is identical the range is zero
# and dividing by it would turn the whole column into NaN, so use 0.0 for the
# scored rows instead (NaN rows stay NaN).
pc_min = df["combined_entropy_pca"].min()
pc_range = df["combined_entropy_pca"].max() - pc_min
if pc_range > 0:
    df["combined_entropy_pca_norm"] = (df["combined_entropy_pca"] - pc_min) / pc_range
else:
    df["combined_entropy_pca_norm"] = df["combined_entropy_pca"] * 0.0

In [54]:
# Print the first five rows, now including the combined PCA score columns.
print(df.head(n=5))

    password  length  num_letters  num_upper  num_lower  num_digits  \
0     123456       6            0          0          0           6   
1      12345       5            0          0          0           5   
2  123456789       9            0          0          0           9   
3   password       8            8          0          8           0   
4   iloveyou       8            8          0          8           0   

   num_special_char  has_upper  has_num  has_special  ...  ratio_lowercase  \
0                 0          0        1            0  ...              0.0   
1                 0          0        1            0  ...              0.0   
2                 0          0        1            0  ...              0.0   
3                 0          0        0            0  ...              1.0   
4                 0          0        0            0  ...              1.0   

   ratio_digits  ratio_symbols  shannon_entropy  length_adjusted_entropy  \
0           1.0            0