In [None]:
import pandas as pd
import math
from itertools import tee

Import the new CSV file

In [57]:
df = pd.read_csv("rockyou_dataset.csv", encoding="utf-8",keep_default_na=False)

In [58]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


In [59]:
df["password"] = df["password"].fillna("").astype(str)
df = df[df["password"].str.len() > 0].reset_index(drop=True)

In [60]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


Extract charactaristic features

num_of_features

In [61]:
# total length
df["length"] = df["password"].astype(str).str.len()
#num of letters
df["num_letters"] = df["password"].str.count(r"[A-Za-z]")
df["num_upper"] = df["password"].str.count(r"[A-Z]")
df["num_lower"] = df["password"].str.count(r"[a-z]")
#num of digits 
df["num_digits"] = df["password"].str.count(r"[0-9]")
#num of special characters
df["num_special_char"] = df["password"].str.count(r"[^A-Za-z0-9]")

In [62]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char
0,123456,6,0,0,0,6,0
1,12345,5,0,0,0,5,0
2,123456789,9,0,0,0,9,0
3,password,8,8,0,8,0,0
4,iloveyou,8,8,0,8,0,0


Boolean_features

In [63]:
#general
df["has_upper"] = df["num_upper"] > 0
df["has_num"] = df["num_digits"] > 0
df["has_special"] = df["num_special_char"] > 0
# First character
df["first_is_upper"] = df["password"].str.match(r"^[A-Z]")
df["first_is_digit"] = df["password"].str.match(r"^[0-9]")
df["first_is_special"] = df["password"].str.match(r"^[^A-Za-z0-9]")
# Last character
df["last_is_upper"] = df["password"].str[-1].str.match(r"[A-Z]")
df["last_is_digit"] = df["password"].str[-1].str.match(r"[0-9]")
df["last_is_special"] = df["password"].str[-1].str.match(r"[^A-Za-z0-9]")

In [64]:
# Convert all boolean columns to 0/1
bool_cols = ['has_upper', 'has_num', 'has_special', 
             'first_is_upper', 'first_is_digit',
             'first_is_special', 'last_is_upper', 'last_is_digit', 'last_is_special']

df[bool_cols] = df[bool_cols].astype(int)

In [65]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,first_is_upper,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special
0,123456,6,0,0,0,6,0,0,1,0,0,1,0,0,1,0
1,12345,5,0,0,0,5,0,0,1,0,0,1,0,0,1,0
2,123456789,9,0,0,0,9,0,0,1,0,0,1,0,0,1,0
3,password,8,8,0,8,0,0,0,0,0,0,0,0,0,0,0
4,iloveyou,8,8,0,8,0,0,0,0,0,0,0,0,0,0,0


ratio_features

In [66]:
df["ratio_letters"] = df["num_letters"] / df["length"]
df["ratio_uppercase"] = df["num_upper"] / df["length"]
df["ratio_lowercase"] = df["num_lower"] / df["length"]

df["ratio_digits"] = df["num_digits"] / df["length"]

df["ratio_symbols"] = df["num_special_char"] / df["length"]

In [67]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,first_is_digit,first_is_special,last_is_upper,last_is_digit,last_is_special,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols
0,123456,6,0,0,0,6,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,1,0,0,1,0,0.0,0.0,0.0,1.0,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0


entropy_features

functions

In [68]:
def shannon_entropy(pwd):
    if not pwd:
        return 0
    freq = {c: pwd.count(c)/len(pwd) for c in set(pwd)}
    return -sum(p * math.log2(p) for p in freq.values())

def ngrams(seq, n=2):
    a, b = tee(seq)
    for _ in range(1, n):
        b = tee(b)[1]
        next(b, None)
    return zip(*a, *b)

def bigram_entropy(pwd):
    pwd = str(pwd)
    if len(pwd) < 2:
        return 0
    bigrams = list(zip(pwd, pwd[1:]))
    freq = {bg: bigrams.count(bg)/len(bigrams) for bg in set(bigrams)}
    return -sum(p * math.log2(p) for p in freq.values())

sequences = ["abcdefghijklmnopqrstuvwxyz", "0123456789", "qwertyuiop", "asdfghjkl", "zxcvbnm"]
def pattern_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in sequences:
        for i in range(len(seq)-2):
            pattern = seq[i:i+3]
            if pattern in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

keyboard_sequences = ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
def keyboard_entropy(pwd):
    pwd = pwd.lower()
    penalty = 0
    for seq in keyboard_sequences:
        for i in range(len(seq)-2):
            if seq[i:i+3] in pwd:
                penalty += 1
    return max(0, shannon_entropy(pwd) - penalty*0.2)

features

In [69]:
df["shannon_entropy"] = df["password"].apply(shannon_entropy)
df["length_adjusted_entropy"] = df["shannon_entropy"] * df["password"].str.len()
df["bigram_entropy"] = df["password"].apply(bigram_entropy)
df["pattern_entropy"] = df["password"].apply(pattern_entropy)
df["keyboard_entropy"] = df["password"].apply(keyboard_entropy)

In [70]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_letters,ratio_uppercase,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,2.584963,15.509775,2.321928,1.784963,2.584963
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,2.321928,11.60964,2.0,1.721928,2.321928
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,3.169925,28.529325,3.0,1.769925,3.169925
3,password,8,8,0,8,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75
4,iloveyou,8,8,0,8,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75


PCA combination

In [71]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

ent_cols = [
    "shannon_entropy",
    "length_adjusted_entropy",
    "bigram_entropy",
    "pattern_entropy",
    "keyboard_entropy",
]

#check all num
mask = df[ent_cols].notna().all(axis=1)
X = df.loc[mask, ent_cols].astype(float)

#standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#PCA
pca = PCA(n_components=1)
pc1 = pca.fit_transform(X_scaled)  # shape (n_rows, 1)

#add new column

df.loc[mask, "combined_entropy_pca"] = pc1.ravel()
df.loc[~mask, "combined_entropy_pca"] = np.nan

#normaliz
df["combined_entropy_pca_norm"] = (df["combined_entropy_pca"] - df["combined_entropy_pca"].min()) / (
    df["combined_entropy_pca"].max() - df["combined_entropy_pca"].min()
)

In [72]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,ratio_lowercase,ratio_digits,ratio_symbols,shannon_entropy,length_adjusted_entropy,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm
0,123456,6,0,0,0,6,0,0,1,0,...,0.0,1.0,0.0,2.584963,15.509775,2.321928,1.784963,2.584963,-1.984025,0.15298
1,12345,5,0,0,0,5,0,0,1,0,...,0.0,1.0,0.0,2.321928,11.60964,2.0,1.721928,2.321928,-3.093645,0.137083
2,123456789,9,0,0,0,9,0,0,1,0,...,0.0,1.0,0.0,3.169925,28.529325,3.0,1.769925,3.169925,0.404705,0.187201
3,password,8,8,0,8,0,0,0,0,0,...,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75,0.166464,0.183788
4,iloveyou,8,8,0,8,0,0,0,0,0,...,1.0,0.0,0.0,2.75,22.0,2.807355,2.75,2.75,0.166464,0.183788


Structural Features

Position Features

In [73]:
def calculate_position_spread(pwd):
    """Calculate how spread out each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_spread': 0.0, 'letter_spread': 0.0, 'special_spread': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    special_positions = [i for i, c in enumerate(pwd) if not c.isalnum()]
    
    def spread(positions):
        if len(positions) < 2:
            return 0.0
        return (max(positions) - min(positions)) / (len(positions) - 1)
    
    return {
        'digit_spread': spread(digit_positions),
        'letter_spread': spread(letter_positions),
        'special_spread': spread(special_positions)
    }

def calculate_position_centered(pwd):
    """Calculate how centered each character type is (0-1 scale)."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length == 0:
        return {'digit_centered': 0.0, 'letter_centered': 0.0}
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    def centered(positions, length):
        if not positions:
            return 0.0
        avg_pos = sum(positions) / len(positions)
        centered_score = 1 - abs(avg_pos/length - 0.5) * 2
        return max(0.0, centered_score)
    
    return {
        'digit_centered': centered(digit_positions, length),
        'letter_centered': centered(letter_positions, length)
    }

In [74]:
position_spread_df = pd.DataFrame(df['password'].apply(calculate_position_spread).tolist())
df = pd.concat([df, position_spread_df], axis=1)

position_centered_df = pd.DataFrame(df['password'].apply(calculate_position_centered).tolist())
df = pd.concat([df, position_centered_df], axis=1)

In [75]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,bigram_entropy,pattern_entropy,keyboard_entropy,combined_entropy_pca,combined_entropy_pca_norm,digit_spread,letter_spread,special_spread,digit_centered,letter_centered
0,123456,6,0,0,0,6,0,0,1,0,...,2.321928,1.784963,2.584963,-1.984025,0.15298,1.0,0.0,0.0,0.833333,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,2.0,1.721928,2.321928,-3.093645,0.137083,1.0,0.0,0.0,0.8,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,3.0,1.769925,3.169925,0.404705,0.187201,1.0,0.0,0.0,0.888889,0.0
3,password,8,8,0,8,0,0,0,0,0,...,2.807355,2.75,2.75,0.166464,0.183788,0.0,1.0,0.0,0.0,0.875
4,iloveyou,8,8,0,8,0,0,0,0,0,...,2.807355,2.75,2.75,0.166464,0.183788,0.0,1.0,0.0,0.0,0.875


Consecutive Character Features

In [76]:
def max_consecutive_counts(pwd):
    """Calculate maximum consecutive runs for all character types."""
    import re
    
    pwd = str(pwd)
    
    patterns = {
        'digit': r'\d+',
        'letter': r'[A-Za-z]+',
        'upper': r'[A-Z]+',
        'lower': r'[a-z]+',
        'special': r'[^A-Za-z0-9]+'
    }
    
    result = {}
    for name, pattern in patterns.items():
        matches = re.findall(pattern, pwd)
        result[f'max_consecutive_{name}'] = max([len(m) for m in matches], default=0)
        result[f'num_consecutive_{name}_runs'] = len(matches)
        if matches:
            result[f'avg_consecutive_{name}_run'] = sum(len(m) for m in matches) / len(matches)
        else:
            result[f'avg_consecutive_{name}_run'] = 0.0
    
    return result


In [78]:
consecutive_df = pd.DataFrame(df['password'].apply(max_consecutive_counts).tolist())
df = pd.concat([df, consecutive_df], axis=1)

MemoryError: Unable to allocate 109. MiB for an array with shape (14329769,) and data type uint64

In [None]:
df.head()

Transition Features 

In [79]:
def get_character_type(char):
    """Helper: classify character type."""
    if char.isdigit():
        return 'digit'
    elif char.isalpha():
        return 'upper' if char.isupper() else 'lower'
    else:
        return 'special'

def calculate_transitions(pwd):
    """Calculate all transition-related features."""
    pwd = str(pwd)
    length = len(pwd)
    
    if length < 2:
        return {
            'num_transitions': 0,
            'transition_frequency': 0.0,
            'letter_to_digit': 0,
            'digit_to_letter': 0,
            'to_special': 0,
            'from_special': 0,
            'alternating_pattern_score': 0.0
        }
    
    # Basic transitions
    transitions = 0
    prev_type = get_character_type(pwd[0])
    
    # Specific transition counters
    letter_to_digit = 0
    digit_to_letter = 0
    to_special = 0
    from_special = 0
    
    for char in pwd[1:]:
        current_type = get_character_type(char)
        
        if current_type != prev_type:
            transitions += 1
            
            # Count specific transitions
            if prev_type in ['upper', 'lower'] and current_type == 'digit':
                letter_to_digit += 1
            elif prev_type == 'digit' and current_type in ['upper', 'lower']:
                digit_to_letter += 1
            if current_type == 'special':
                to_special += 1
            if prev_type == 'special':
                from_special += 1
        
        prev_type = current_type
    
    # Alternating pattern score (like a1b2c3)
    alternating_patterns = 0
    for i in range(length - 2):
        type1 = get_character_type(pwd[i])
        type2 = get_character_type(pwd[i+1])
        type3 = get_character_type(pwd[i+2])
        
        if type1 == type3 and type1 != type2:
            alternating_patterns += 1
    
    alternating_score = alternating_patterns / (length - 2) if length > 2 else 0.0
    
    return {
        'num_transitions': transitions,
        'transition_frequency': transitions / (length - 1),
        'letter_to_digit': letter_to_digit,
        'digit_to_letter': digit_to_letter,
        'to_special': to_special,
        'from_special': from_special,
        'alternating_pattern_score': alternating_score
    }

In [80]:
transitions_df = pd.DataFrame(df['password'].apply(calculate_transitions).tolist())
df = pd.concat([df, transitions_df], axis=1)

df['transitions_to_length_ratio'] = df['num_transitions'] / df['length']
df['transitions_to_length_ratio'] = df['transitions_to_length_ratio'].fillna(0)

In [81]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,digit_centered,letter_centered,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio
0,123456,6,0,0,0,6,0,0,1,0,...,0.833333,0.0,0,0.0,0,0,0,0,0.0,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0.8,0.0,0,0.0,0,0,0,0,0.0,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0.888889,0.0,0,0.0,0,0,0,0,0.0,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0.0,0.875,0,0.0,0,0,0,0,0.0,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0.0,0.875,0,0.0,0,0,0,0,0.0,0.0


longest_char streak and didgit letter mixing Features

In [82]:
def longest_same_char_streak(pwd):
    """Find longest streak of the same character."""
    pwd = str(pwd)
    if not pwd:
        return 0
    
    max_streak = 1
    current_streak = 1
    
    for i in range(1, len(pwd)):
        if pwd[i] == pwd[i-1]:
            current_streak += 1
            max_streak = max(max_streak, current_streak)
        else:
            current_streak = 1
    
    return max_streak

def digit_letter_mixing_score(pwd):
    """Calculate how mixed digits and letters are (0-1)."""
    pwd = str(pwd)
    
    digit_positions = [i for i, c in enumerate(pwd) if c.isdigit()]
    letter_positions = [i for i, c in enumerate(pwd) if c.isalpha()]
    
    if not digit_positions or not letter_positions:
        return 0.0
    
    # Check if digits and letters are interspersed
    min_digit = min(digit_positions)
    max_digit = max(digit_positions)
    min_letter = min(letter_positions)
    max_letter = max(letter_positions)
    
    # Calculate overlap
    overlap_start = max(min_digit, min_letter)
    overlap_end = min(max_digit, max_letter)
    overlap = max(0, overlap_end - overlap_start)
    
    if overlap <= 0:  # No overlap - completely separated
        return 0.0
    else:  # Some overlap - mixed
        total_span = max(max_digit, max_letter) - min(min_digit, min_letter)
        return overlap / total_span if total_span > 0 else 0.0

In [83]:
df['longest_same_char_streak'] = df['password'].apply(longest_same_char_streak)
df['digit_letter_mixing_score'] = df['password'].apply(digit_letter_mixing_score)

In [84]:
df.head()

Unnamed: 0,password,length,num_letters,num_upper,num_lower,num_digits,num_special_char,has_upper,has_num,has_special,...,num_transitions,transition_frequency,letter_to_digit,digit_to_letter,to_special,from_special,alternating_pattern_score,transitions_to_length_ratio,longest_same_char_streak,digit_letter_mixing_score
0,123456,6,0,0,0,6,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
1,12345,5,0,0,0,5,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
2,123456789,9,0,0,0,9,0,0,1,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
3,password,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0,0,0,0.0,0.0,2,0.0
4,iloveyou,8,8,0,8,0,0,0,0,0,...,0,0.0,0,0,0,0,0.0,0.0,1,0.0
