In [3]:
import pandas as pd
import numpy as np

In [7]:
#importing raw datset
df = pd.read_table(
    'E:/major_project/datasets/rockyou.txt', 
    names=['password'], 
    encoding='latin-1', 
    header=None,
    dtype={'password': str},  # <--- Crucial fix
    on_bad_lines='skip'       # Skips corrupted lines preventing crashes
)

In [6]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


In [13]:
#removing unrecognizable characters
import re

def clean_password(p):
    if not isinstance(p,str):
        return None
    #for non-printable characters
    p = re.sub(r'[\x00-\x1F\x7F-\x9F]','',p)
    #for whitespace stripping
    p=p.strip()
    #for empty strings
    return p if len(p)>0 else None

df['password']=df['password'].apply(clean_password)
df=df.dropna()

In [15]:
df.tail(20)

Unnamed: 0,password,strength
14344077,no,0
14344078,maka,0
14344079,jupanu,0
14344080,ciocolatax,0
14344081,angelica,0
14344082,1990,0
14344083,1111,0
14344084,pepe,0
14344085,markinho,0
14344086,mara,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14344094 entries, 0 to 14344096
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   password  object
 1   strength  int64 
dtypes: int64(1), object(1)
memory usage: 328.3+ MB


In [18]:
#missing values
df.isna().sum()

password    0
strength    0
dtype: int64

In [28]:
import pandas as pd
import numpy as np
from zxcvbn_rs_py import zxcvbn 
from pandarallel import pandarallel

# 1. Setup Speed
pandarallel.initialize(progress_bar=True)

# 2. Define Fast Strength Function
def get_strength(pwd):
    try:
        # zxcvbn-rs is super fast
        return int(zxcvbn(str(pwd)).score)
    except:
        return 0

# 3. Calculate Strength for the whole 1.4M (Should take ~2-5 mins)
print("Calculating Strength for 1.4M passwords...")
df['strength'] = df['password'].parallel_apply(get_strength)

# 4. Stratified Sampling (The "Gold Standard")
print("Balancing the dataset...")

# We want roughly 100k passwords per class (500k total)
# You can increase this to 150000 if your RAM handles it well
TARGET_PER_CLASS = 100000 

balanced_samples = []

for strength_score in range(5):
    # Get all passwords for this specific score
    class_group = df[df['strength'] == strength_score]
    
    count = len(class_group)
    print(f"Class {strength_score}: Found {count} passwords")
    
    if count >= TARGET_PER_CLASS:
        # If we have enough, take a random sample
        balanced_samples.append(class_group.sample(TARGET_PER_CLASS, random_state=42))
    else:
        # If we have fewer (likely Class 4), take ALL of them
        print(f"  -> Taking all {count} from Class {strength_score} (Rare Class)")
        balanced_samples.append(class_group)

# 5. Create Final DataFrame
df_balanced = pd.concat(balanced_samples).reset_index(drop=True)

print(f"Final Dataset Size: {len(df_balanced)} rows")

# 6. Save (So you never have to run this again)
df_balanced.to_csv('E:/major_project/datasets/rockyou_balanced.csv', index=False)
print("Saved to 'rockyou_balanced.csv'")

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
Calculating Strength for 1.4M passwords...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3586024), Label(value='0 / 3586024…

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/
Calculating Strength for 1.4M passwords...


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3586024), Label(value='0 / 3586024…

Balancing the dataset...
Class 0: Found 40455 passwords
  -> Taking all 40455 from Class 0 (Rare Class)
Class 1: Found 5175459 passwords
Class 2: Found 5161741 passwords
Class 3: Found 3019536 passwords
Class 4: Found 946903 passwords
Final Dataset Size: 440455 rows
Saved to 'rockyou_balanced.csv'


In [29]:
df['strength'].value_counts()

strength
1    5175459
2    5161741
3    3019536
4     946903
0      40455
Name: count, dtype: int64

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14344094 entries, 0 to 14344096
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   password  object
 1   strength  int64 
dtypes: int64(1), object(1)
memory usage: 328.3+ MB


Now in next, we will work with all 14 million data