In [1]:
import pandas as pd
import math
import numpy as np
from itertools import tee

Import the new CSV file

In [2]:
df = pd.read_csv("E:/New SW PR/backend/Ignored datasets/rockyou_dataset_cleaned.csv", encoding="utf-8",keep_default_na=False)

In [3]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


In [4]:
df["password"] = df["password"].fillna("").astype(str)
df = df[df["password"].str.len() > 0].reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


Extract charactaristic features

num_of_features

In [6]:
# total length
df["length"] = df["password"].astype(str).str.len()
#num of letters
df["num_letters"] = df["password"].str.count(r"[A-Za-z]")
df["num_upper"] = df["password"].str.count(r"[A-Z]")
df["num_lower"] = df["password"].str.count(r"[a-z]")
#num of digits 
df["num_digits"] = df["password"].str.count(r"[0-9]")
#num of special characters
df["num_special_char"] = df["password"].str.count(r"[^A-Za-z0-9]")

In [7]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char
0,123456,6,0,0,0,6,0
1,12345,5,0,0,0,5,0
2,123456789,9,0,0,0,9,0
3,password,8,8,0,8,0,0
4,iloveyou,8,8,0,8,0,0


Boolean_features

In [8]:
#general
df["has_upper"] = df["num_upper"] > 0
df["has_num"] = df["num_digits"] > 0
df["has_special"] = df["num_special_char"] > 0
# First character
df["first_is_upper"] = df["password"].str.match(r"^[A-Z]")
df["first_is_digit"] = df["password"].str.match(r"^[0-9]")
df["first_is_special"] = df["password"].str.match(r"^[^A-Za-z0-9]")
# Last character
df["last_is_upper"] = df["password"].str[-1].str.match(r"[A-Z]")
df["last_is_digit"] = df["password"].str[-1].str.match(r"[0-9]")
df["last_is_special"] = df["password"].str[-1].str.match(r"[^A-Za-z0-9]")

In [9]:
# Convert all boolean columns to 0/1
bool_cols = ['has_upper', 'has_num', 'has_special', 
             'first_is_upper', 'first_is_digit',
             'first_is_special', 'last_is_upper', 'last_is_digit', 'last_is_special']

df[bool_cols] = df[bool_cols].astype(int)

In [10]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,first_is_upper,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special
0,123456,6,0,0,0,6,0,0,1,0,0,1,0,0,1,0
1,12345,5,0,0,0,5,0,0,1,0,0,1,0,0,1,0
2,123456789,9,0,0,0,9,0,0,1,0,0,1,0,0,1,0
3,password,8,8,0,8,0,0,0,0,0,0,0,0,0,0,0
4,iloveyou,8,8,0,8,0,0,0,0,0,0,0,0,0,0,0


ratio_features

In [11]:
#letters
df["ratio_letters"] = df["num_letters"] / df["length"]
df["ratio_uppercase"] = df["num_upper"] / df["length"]
df["ratio_lowercase"] = df["num_lower"] / df["length"]
#digits
df["ratio_digits"] = df["num_digits"] / df["length"]
#special
df["ratio_symbols"] = df["num_special_char"] / df["length"]

In [12]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols
0,123456,6,0,0,0,6,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0


entropy_features

functions

In [13]:
def shannon_entropy(pwd):
    if not pwd:
        return 0
    freq = {c: pwd.count(c)/len(pwd) for c in set(pwd)}
    return -sum(p * math.log2(p) for p in freq.values())

def ngrams(seq, n=2):
    a, b = tee(seq)
    for _ in range(1, n):
        b = tee(b)[1]
        next(b, None)
    return zip(*a, *b)

def bigram_entropy(pwd):
    pwd = str(pwd)
    if len(pwd) < 2:
        return 0
    bigrams = list(zip(pwd, pwd[1:]))
    freq = {bg: bigrams.count(bg)/len(bigrams) for bg in set(bigrams)}
    return -sum(p * math.log2(p) for p in freq.values())

sequences = ["abcdefghijklmnopqrstuvwxyz", "0123456789", "qwertyuiop", "asdfghjkl", "zxcvbnm"]
def pattern_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in sequences:
        for i in range(len(seq)-2):
            pattern = seq[i:i+3]
            if pattern in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

keyboard_sequences = ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
def keyboard_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in keyboard_sequences:
        for i in range(len(seq)-2):
            if seq[i:i+3] in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

features

In [14]:
df["shannon_entropy"] = df["password"].apply(shannon_entropy)
df["length_adjusted_entropy"] = df["shannon_entropy"] * df["password"].str.len()
df["bigram_entropy"] = df["password"].apply(bigram_entropy)
df["pattern_entropy"] = df["password"].apply(pattern_entropy)
df["keyboard_entropy"] = df["password"].apply(keyboard_entropy)

In [15]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,2.584963,15.509775,2.321928,1.784963,2.584963
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,2.321928,11.60964,2.0,1.721928,2.321928
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,3.169925,28.529325,3.0,1.769925,3.169925
3,password,8,8,0,8,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75
4,iloveyou,8,8,0,8,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75


PCA combination

In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

ent_cols = [
    "shannon_entropy",
    "length_adjusted_entropy",
    "bigram_entropy",
    "pattern_entropy",
    "keyboard_entropy",
]

#check all num
mask = df[ent_cols].notna().all(axis=1)
X = df.loc[mask, ent_cols].astype(float)

#standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#PCA
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_scaled)  # shape (n_rows, 1)

#add new column

df.loc[mask, "combined_entropy_pca"] = pc1.ravel()
df.loc[~mask, "combined_entropy_pca"] = np.nan

#normaliz
df["combined_entropy_pca_norm"] = (df["combined_entropy_pca"] - df["combined_entropy_pca"].min()) / (
    df["combined_entropy_pca"].max() - df["combined_entropy_pca"].min()
)

In [17]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,1.0,0.0,2.584963,15.509775,2.321928,1.784963,2.584963,-1.984025,0.15298
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,1.0,0.0,2.321928,11.60964,2.0,1.721928,2.321928,-3.093645,0.137083
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,1.0,0.0,3.169925,28.529325,3.0,1.769925,3.169925,0.404705,0.187201
3,password,8,8,0,8,0,0,0,0,0,...,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75,0.166464,0.183788
4,iloveyou,8,8,0,8,0,0,0,0,0,...,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75,0.166464,0.183788


Save PCA values for deployment

In [18]:
import joblib

# Save scaler
joblib.dump(scaler, "entropy_scaler.joblib")

# Save PCA
joblib.dump(pca, "entropy_pca.joblib")

# Save min and max of PC1 for normalization
joblib.dump(pc1.min(), "pca_min.joblib")
joblib.dump(pc1.max(), "pca_max.joblib")

['pca_max.joblib']

Structural Features

Position Features

In [18]:
def calculate_position_spread(pwd):
    """Calculate how spread out each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_spread': 0.0, 'letter_spread': 0.0, 'special_spread': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    special_positions = [i for i, c in enumerate(pwd) if not c.isalnum()]
    
    def spread(positions):
        n = len(positions)
        
        # Edge cases
        if n <= 1:
            return 0.0  # Single or no characters can't be "spread"
        
        # If all characters are of this type, they're clustered
        if n == length:
            return 0.0
        
        # Calculate actual spread
        actual_spread = (max(positions) - min(positions)) / (n - 1)
        
        # Calculate maximum possible spread for n items in length L
        # Maximum spread happens when items are evenly spaced
        max_possible_spread = (length - 1) / (n - 1)
        
        # Normalize to 0-1
        return min(actual_spread / max_possible_spread, 1.0)
    
    return {
        'digit_spread': spread(digit_positions),
        'letter_spread': spread(letter_positions),
        'special_spread': spread(special_positions)
    }

def calculate_position_centered(pwd):
    """Calculate how centered each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_centered': 0.0, 'letter_centered': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    def centered(positions, length):
        if not positions:
            return 0.0
        avg_pos = sum(positions) / len(positions)
        centered_score = 1 - abs(avg_pos/length - 0.5) * 2
        return max(0.0, centered_score)
    
    return {
        'digit_centered': centered(digit_positions, length),
        'letter_centered': centered(letter_positions, length)
    }

In [19]:
position_spread_df = pd.DataFrame(df['password'].apply(calculate_position_spread).tolist())
df = pd.concat([df, position_spread_df], axis=1)

position_centered_df = pd.DataFrame(df['password'].apply(calculate_position_centered).tolist())
df = pd.concat([df, position_centered_df], axis=1)

In [20]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm,digit_spread,letter_spread,special_spread,digit_centered,letter_centered
0,123456,6,0,0,0,6,0,0,1,0,...,2.321928,1.784963,2.584963,-1.984025,0.15298,0.0,0.0,0.0,0.833333,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,2.0,1.721928,2.321928,-3.093645,0.137083,0.0,0.0,0.0,0.8,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,3.0,1.769925,3.169925,0.404705,0.187201,0.0,0.0,0.0,0.888889,0.0
3,password,8,8,0,8,0,0,0,0,0,...,2.807355,2.75,2.75,0.166464,0.183788,0.0,0.0,0.0,0.0,0.875
4,iloveyou,8,8,0,8,0,0,0,0,0,...,2.807355,2.75,2.75,0.166464,0.183788,0.0,0.0,0.0,0.0,0.875


Consecutive Character Features

In [21]:
def max_consecutive_counts(pwd):
    """Calculate maximum consecutive runs for all character types."""
    import re
    
    pwd = str(pwd)
    
    patterns = {
        'digit': r'\d+',
        'letter': r'[A-Za-z]+',
        'upper': r'[A-Z]+',
        'lower': r'[a-z]+',
        'special': r'[^A-Za-z0-9]+'
    }
    
    result = {}
    for name, pattern in patterns.items():
        matches = re.findall(pattern, pwd)
        result[f'max_consecutive_{name}'] = max([len(m) for m in matches], default=0)
        result[f'num_consecutive_{name}_runs'] = len(matches)
        if matches:
            result[f'avg_consecutive_{name}_run'] = sum(len(m) for m in matches) / len(matches)
        else:
            result[f'avg_consecutive_{name}_run'] = 0.0
    
    return result


In [22]:
consecutive_df = pd.DataFrame(df['password'].apply(max_consecutive_counts).tolist())
df = pd.concat([df, consecutive_df], axis=1)

In [23]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,avg_consecutive_letter_run,max_consecutive_upper,num_consecutive_upper_runs,avg_consecutive_upper_run,max_consecutive_lower,num_consecutive_lower_runs,avg_consecutive_lower_run,max_consecutive_special,num_consecutive_special_runs,avg_consecutive_special_run
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,0,0,0.0,0,0,0.0,0,0,0.0
3,password,8,8,0,8,0,0,0,0,0,...,8.0,0,0,0.0,8,1,8.0,0,0,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,8.0,0,0,0.0,8,1,8.0,0,0,0.0


Transition Features 

In [24]:
def get_character_type(char):
    """Helper: classify character type."""
    if char.isdigit():
        return 'digit'
    elif char.isalpha():
        return 'upper' if char.isupper() else 'lower'
    else:
        return 'special'

def calculate_transitions(pwd):
    """Calculate all transition-related features."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length < 2:
        return {
            'num_transitions': 0,
            'transition_frequency': 0.0,
            'letter_to_digit': 0,
            'digit_to_letter': 0,
            'to_special': 0,
            'from_special': 0,
            'alternating_pattern_score': 0.0
        }
    
    # Basic transitions
    transitions = 0
    prev_type = get_character_type(pwd[0])
    
    # Specific transition counters
    letter_to_digit = 0
    digit_to_letter = 0
    to_special = 0
    from_special = 0
    
    for char in pwd[1:]:
        current_type = get_character_type(char)
        
        if current_type != prev_type:
            transitions += 1
            
            # Count specific transitions
            if prev_type in ['upper', 'lower'] and current_type == 'digit':
                letter_to_digit += 1
            elif prev_type == 'digit' and current_type in ['upper', 'lower']:
                digit_to_letter += 1
            if current_type == 'special':
                to_special += 1
            if prev_type == 'special':
                from_special += 1
        
        prev_type = current_type
    
    # Alternating pattern score (like a1b2c3)
    alternating_patterns = 0
    for i in range(length - 2):
        type1 = get_character_type(pwd[i])
        type2 = get_character_type(pwd[i+1])
        type3 = get_character_type(pwd[i+2])
        
        if type1 == type3 and type1 != type2:
            alternating_patterns += 1
    
    alternating_score = alternating_patterns / (length - 2) if length > 2 else 0.0
    
    return {
        'num_transitions': transitions,
        'transition_frequency': transitions / (length - 1),
        'letter_to_digit': letter_to_digit,
        'digit_to_letter': digit_to_letter,
        'to_special': to_special,
        'from_special': from_special,
        'alternating_pattern_score': alternating_score
    }

In [25]:
transitions_df = pd.DataFrame(df['password'].apply(calculate_transitions).tolist())
df = pd.concat([df, transitions_df], axis=1)

df['transitions_to_length_ratio'] = df['num_transitions'] / df['length']
df['transitions_to_length_ratio'] = df['transitions_to_length_ratio'].fillna(0)

In [26]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_consecutive_special_runs,avg_consecutive_special_run,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio
0,123456,6,0,0,0,6,0,0,1,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0.0,0,0,0,0,0.0,0.0


longest_char streak and didgit letter mixing Features

In [27]:
def longest_same_char_streak(pwd):
    """Find longest streak of the same character."""
    pwd = str(pwd)
    if not pwd:
        return 0
    
    max_streak = 1
    current_streak = 1
    
    for i in range(1, len(pwd)):
        if pwd[i] == pwd[i-1]:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
    
    return max_streak

def digit_letter_mixing_score(pwd):
    """Calculate how mixed digits and letters are (0-1)."""
    pwd = str(pwd)
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    if not digit_positions or not letter_positions:
        return 0.0
    
    # Check if digits and letters are interspersed
    min_digit = min(digit_positions)
    max_digit = max(digit_positions)
    min_letter = min(letter_positions)
    max_letter = max(letter_positions)
    
    # Calculate overlap
    overlap_start = max(min_digit, min_letter)
    overlap_end = min(max_digit, max_letter)
    overlap = max(0, overlap_end - overlap_start)
    
    if overlap <= 0:  # No overlap - completely separated
        return 0.0
    else:  # Some overlap - mixed
        total_span = max(max_digit, max_letter) - min(min_digit, min_letter)
        return overlap / total_span if total_span > 0 else 0.0

In [28]:
df['longest_same_char_streak'] = df['password'].apply(longest_same_char_streak)
df['digit_letter_mixing_score'] = df['password'].apply(digit_letter_mixing_score)

In [29]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score
0,123456,6,0,0,0,6,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0,0,0,0.0,0.0,2,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0


Dict features

In [30]:
import ahocorasick

#Convert “leet speak” passwords to normal letters so dictionary matching works.
LEET_MAP = str.maketrans({
    '0': 'o', '1': 'i', '3': 'e', '4': 'a',
    '5': 's', '7': 't', '@': 'a', '$': 's'
})

def normalize_password(pw):
    return str(pw).lower().translate(LEET_MAP)

#fast string-search structure
def build_automaton(word_set):
    A = ahocorasick.Automaton()
    for w in word_set:
        A.add_word(w, w)
    A.make_automaton()
    return A


with open("Dict/words_alpha.txt") as f:
    ENGLISH_WORDS = {line.strip().lower() for line in f if line.strip()}
    
WORD_AUTOMATON = build_automaton(ENGLISH_WORDS)  


with open("Dict/names.txt") as f:
    NAMES = {line.strip().lower() for line in f if line.strip()}
    
NAME_AUTOMATON = build_automaton(NAMES)         

In [31]:
import re

YEAR_RE = re.compile(r"(19[5-9]\d|20[0-2]\d)")

In [32]:
def contains_dictionary_word(pw):
    pw_norm = normalize_password(pw)
    for _, _ in WORD_AUTOMATON.iter(pw_norm):
        return 1
    return 0

def longest_dictionary_word_length(pw):
    pw_norm = normalize_password(pw)
    max_len = 0
    for _, w in WORD_AUTOMATON.iter(pw_norm):
        max_len = max(max_len, len(w))
    return max_len

def dictionary_coverage_ratio(pw):
    pw_norm = normalize_password(pw)
    total_chars = 0
    for _, w in WORD_AUTOMATON.iter(pw_norm):
        total_chars += len(w)
    return total_chars / max(len(pw), 1)

def num_dictionary_words(pw):
    pw_norm = normalize_password(pw)
    words = set()
    for _, w in WORD_AUTOMATON.iter(pw_norm):
        words.add(w)
    return len(words)

In [33]:
df['contains_dictionary_word'] = df['password'].apply(contains_dictionary_word)
df['longest_dictionary_word_length'] = df['password'].apply(longest_dictionary_word_length)
df['dictionary_coverage_ratio'] = df['password'].apply(dictionary_coverage_ratio)
df['num_dictionary_words'] = df['password'].apply(num_dictionary_words)

In [34]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score,contains_dictionary_word,longest_dictionary_word_length,dictionary_coverage_ratio,num_dictionary_words
0,123456,6,0,0,0,6,0,0,1,0,...,0,0,0.0,0.0,1,0.0,1,2,1.333333,6
1,12345,5,0,0,0,5,0,0,1,0,...,0,0,0.0,0.0,1,0.0,1,2,1.6,6
2,123456789,9,0,0,0,9,0,0,1,0,...,0,0,0.0,0.0,1,0.0,1,2,1.0,7
3,password,8,8,0,8,0,0,0,0,0,...,0,0,0.0,0.0,2,0.0,1,8,6.5,21
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0,0.0,0.0,1,0.0,1,5,4.0,15


In [35]:
def contains_name(pw):
    pw_norm = normalize_password(pw)
    for _, _ in NAME_AUTOMATON.iter(pw_norm):
        return 1
    return 0

def contains_common_name(pw):
    pw_norm = normalize_password(pw)
    for _, n in NAME_AUTOMATON.iter(pw_norm):
        if len(n) >= 4:  # define “common” as >=4 chars
            return 1
    return 0

In [36]:
df['contains_name'] = df['password'].apply(contains_name)
df['contains_common_name'] = df['password'].apply(contains_common_name)

In [37]:
# print(df[['contains_name','contains_common_name','year_position']].head())
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score,contains_dictionary_word,longest_dictionary_word_length,dictionary_coverage_ratio,num_dictionary_words,contains_name,contains_common_name
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,0.0,1,0.0,1,2,1.333333,6,0,0
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,0.0,1,0.0,1,2,1.6,6,0,0
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,0.0,1,0.0,1,2,1.0,7,0,0
3,password,8,8,0,8,0,0,0,0,0,...,0.0,0.0,2,0.0,1,8,6.5,21,1,0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0.0,0.0,1,0.0,1,5,4.0,15,0,0


In [38]:
def contains_year(pw):
    return int(bool(YEAR_RE.search(str(pw))))

def contains_birth_year(pw):
    m = YEAR_RE.search(str(pw))
    if not m:
        return 0
    year = int(m.group())
    return int(1980 <= year <= 2010)

def year_position(pw):
    m = YEAR_RE.search(str(pw))
    if not m:
        return "none"
    start, end = m.start(), m.end()
    L = len(pw)
    if start == 0:
        return "start"
    elif end == L:
        return "end"
    else:
        return "middle"

In [39]:
df['contains_year'] = df['password'].apply(contains_year)
df['contains_birth_year'] = df['password'].apply(contains_birth_year)
df['year_position'] = df['password'].apply(year_position)

In [40]:
# print(df[['contains_year','contains_birth_year','year_position']].head())
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,digit_letter_mixing_score,contains_dictionary_word,longest_dictionary_word_length,dictionary_coverage_ratio,num_dictionary_words,contains_name,contains_common_name,contains_year,contains_birth_year,year_position
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,1,2,1.333333,6,0,0,0,0,none
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,1,2,1.6,6,0,0,0,0,none
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,1,2,1.0,7,0,0,0,0,none
3,password,8,8,0,8,0,0,0,0,0,...,0.0,1,8,6.5,21,1,0,0,0,none
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0.0,1,5,4.0,15,0,0,0,0,none


Check all features until now

In [41]:
for col in df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

Save dataset

In [42]:
df.to_csv("E:/New SW PR/backend/Ignored datasets/Final Full/full_before_zxcvbn.csv", index=False)

zxcvbn

In [3]:
zxcvbn_df = pd.read_csv(
    "E:/New SW PR/backend/Ignored datasets/rockyou_zxcvbn_features.csv",
    encoding="utf-8",
    keep_default_na=False
)

  zxcvbn_df = pd.read_csv(


In [4]:
zxcvbn_cols = [
    'zxcvbn_score',
    'zxcvbn_guesses',
    'zxcvbn_crack_time_seconds',
    'zxcvbn_log10_guesses'
]

for col in zxcvbn_cols:
    zxcvbn_df[col] = pd.to_numeric(zxcvbn_df[col], errors='coerce')

df = df.merge(
    zxcvbn_df,
    on="password",
    how="left",      # keep all passwords
    validate="m:m"
)

In [5]:
print(df[zxcvbn_cols].isna().mean())

zxcvbn_score                 0.000052
zxcvbn_guesses               0.000052
zxcvbn_crack_time_seconds    0.000052
zxcvbn_log10_guesses         0.000052
dtype: float64


In [6]:
# print(df[['password','zxcvbn_score','zxcvbn_guesses','zxcvbn_crack_time_seconds','zxcvbn_log10_guesses']].head(10))
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_dictionary_words,contains_name,contains_common_name,contains_year,contains_birth_year,year_position,zxcvbn_score,zxcvbn_guesses,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses
0,123456,6,0,0,0,6,0,0,1,0,...,6,0,0,0,0,none,0.0,2.0,0.0002,0.30103
1,12345,5,0,0,0,5,0,0,1,0,...,6,0,0,0,0,none,0.0,7.0,0.0007,0.845098
2,123456789,9,0,0,0,9,0,0,1,0,...,7,0,0,0,0,none,0.0,6.0,0.0006,0.778151
3,password,8,8,0,8,0,0,0,0,0,...,21,1,0,0,0,none,0.0,3.0,0.0003,0.477121
4,iloveyou,8,8,0,8,0,0,0,0,0,...,15,0,0,0,0,none,0.0,48.0,0.0048,1.681241


save dataset

In [7]:
df.to_csv("E:/New SW PR/backend/Ignored datasets/Final Full/full_after_zxcvbn.csv", index=False)

PCFG & OMEN

Merge the new PCFG OMEN csv

In [9]:
pcfg_omen_df = pd.read_csv("E:/New SW PR/backend/Ignored datasets/rockyou_dataset_cleaned_pcfg_scores.csv")

pcfg_omen_cols = ['password', 'PCFG_probability', 'OMEN_level']

df = df.merge(
    pcfg_omen_df[pcfg_omen_cols],
    on='password',
    how='left',                 # keep all passwords in df
    validate='m:m'       # fail fast if duplicates exist
)


In [10]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,contains_common_name,contains_year,contains_birth_year,year_position,zxcvbn_score,zxcvbn_guesses,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses,PCFG_probability,OMEN_level
0,123456,6,0,0,0,6,0,0,1,0,...,0,0,0,none,0.0,2.0,0.0002,0.30103,0.000191,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0,0,0,none,0.0,7.0,0.0007,0.845098,8.8e-05,2.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0,0,0,none,0.0,6.0,0.0006,0.778151,0.000167,3.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0,0,none,0.0,3.0,0.0003,0.477121,4.7e-05,4.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0,0,none,0.0,48.0,0.0048,1.681241,5.2e-05,4.0


In [11]:
df.to_csv("E:/New SW PR/backend/Ignored datasets/full_after_PCFG_OMEN_merge.csv", index=False)

Omen based features

In [14]:
# Handle missing values
df['OMEN_level'] = df['OMEN_level'].fillna(0).astype(int)

# OMEN numeric features
max_omen = df['OMEN_level'].max() if 'OMEN_level' in df else 25
df['omen_level_norm'] = df['OMEN_level'] / float(max_omen)
df['omen_log10'] = np.log10(df['OMEN_level'] + 1)  # add 1 to avoid log(0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [15]:
# print(merged_df[['password','OMEN_level', 'omen_level_norm', 'omen_log10']].head())
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,contains_birth_year,year_position,zxcvbn_score,zxcvbn_guesses,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses,PCFG_probability,OMEN_level,omen_level_norm,omen_log10
0,123456,6,0,0,0,6,0,0,1,0,...,0,none,0.0,2.0,0.0002,0.30103,0.000191,0,0.0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0,none,0.0,7.0,0.0007,0.845098,8.8e-05,2,0.028169,0.477121
2,123456789,9,0,0,0,9,0,0,1,0,...,0,none,0.0,6.0,0.0006,0.778151,0.000167,3,0.042254,0.60206
3,password,8,8,0,8,0,0,0,0,0,...,0,none,0.0,3.0,0.0003,0.477121,4.7e-05,4,0.056338,0.69897
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,none,0.0,48.0,0.0048,1.681241,5.2e-05,4,0.056338,0.69897


PCFG based features

In [16]:
# Handle missing values
df['PCFG_probability'] = df['PCFG_probability'].fillna(1e-50)  # to avoid log(0)

# PCFG log features
df['pcfg_log10_prob'] = np.log10(df['PCFG_probability'])
df['pcfg_neglog10_prob'] = -df['pcfg_log10_prob']  # “surprisal” measure

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [17]:
# print(merged_df[['password', 'PCFG_probability', 'pcfg_log10_prob', 'pcfg_neglog10_prob',]].head())
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,zxcvbn_score,zxcvbn_guesses,zxcvbn_crack_time_seconds,zxcvbn_log10_guesses,PCFG_probability,OMEN_level,omen_level_norm,omen_log10,pcfg_log10_prob,pcfg_neglog10_prob
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,2.0,0.0002,0.30103,0.000191,0,0.0,0.0,-3.719983,3.719983
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,7.0,0.0007,0.845098,8.8e-05,2,0.028169,0.477121,-4.055246,4.055246
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,6.0,0.0006,0.778151,0.000167,3,0.042254,0.60206,-3.776698,3.776698
3,password,8,8,0,8,0,0,0,0,0,...,0.0,3.0,0.0003,0.477121,4.7e-05,4,0.056338,0.69897,-4.328364,4.328364
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0.0,48.0,0.0048,1.681241,5.2e-05,4,0.056338,0.69897,-4.281302,4.281302


In [18]:
for col in df.columns: print(col)

password
length
num_letters
num_upper
num_lower
num_digits
num_special_char
has_upper
has_num
has_special
first_is_upper
first_is_digit
first_is_special
last_is_upper
last_is_digit
last_is_special
ratio_letters
ratio_uppercase
ratio_lowercase
ratio_digits
ratio_symbols
shannon_entropy
length_adjusted_entropy
bigram_entropy
pattern_entropy
keyboard_entropy
combined_entropy_pca
combined_entropy_pca_norm
digit_spread
letter_spread
special_spread
digit_centered
letter_centered
max_consecutive_digit
num_consecutive_digit_runs
avg_consecutive_digit_run
max_consecutive_letter
num_consecutive_letter_runs
avg_consecutive_letter_run
max_consecutive_upper
num_consecutive_upper_runs
avg_consecutive_upper_run
max_consecutive_lower
num_consecutive_lower_runs
avg_consecutive_lower_run
max_consecutive_special
num_consecutive_special_runs
avg_consecutive_special_run
num_transitions
transition_frequency
letter_to_digit
digit_to_letter
to_special
from_special
alternating_pattern_score
transitions_to_leng

Save dataset before adding target column

In [None]:
df.to_csv("E:/New SW PR/backend/Ignored datasets/full_before_target_label.csv.csv", index=False)

Target label

In [21]:
# df.head()

In [None]:
# for col in df.columns: print(col)

In [None]:
# # Save updated dataset
# df.to_csv("E:/New SW PR/backend/Ignored datasets/full_with_target_label.csv.csv", index=False)