# importing pandas 


In [2]:
import pandas as pd

# loading rockyou.txt dataset

In [4]:
df = pd.read_table('E:/major_project/dataset/data/rockyou.txt',names=['password'],encoding='latin-1',header=None)

In [5]:
df

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou
...,...
14344092,"xCvBnM,"
14344093,ie168
14344094,abygurl69
14344095,a6_123


# brief overlook on the dataset

In [37]:
#the type fo data in the dataset.Only passwords have been extracted in this phase
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14344097 entries, 0 to 14344096
Data columns (total 1 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   password  object
dtypes: object(1)
memory usage: 109.4+ MB


In [39]:
df.head()

Unnamed: 0,password
0,123456
1,12345
2,123456789
3,password
4,iloveyou


In [45]:
#last 5 values of the dataset
df.tail()

Unnamed: 0,password
14344092,"xCvBnM,"
14344093,ie168
14344094,abygurl69
14344095,a6_123
14344096,*7Â¡Vamos!


In [55]:
import re

def clean_password(p):
    if not isinstance(p,str):
        return None
    #for non-printable characters
    p = re.sub(r'[\x00-\x1F\x7F-\x9F]','',p)
    #for whitespace stripping
    p=p.strip()
    #for empty strings
    return p if len(p)>0 else None

df['password']=df['password'].apply(clean_password)
df=df.dropna()

checking if the weird values have been removed

In [61]:
df.tail(20)

Unnamed: 0,password
14344077,no
14344078,maka
14344079,jupanu
14344080,ciocolatax
14344081,angelica
14344082,1990
14344083,1111
14344084,pepe
14344085,markinho
14344086,mara


In [57]:
#missing values
df.isna().sum()

password    0
dtype: int64

# sampling data

for sampling data, we need to sample with precision, so majority types of data will be included (length of the data). random sampling may exclude passwords with higher length or lower length. so to make it equal, we first check the length of each password, then add it to continuous intervals. after this, we can decide the number of samples per each interval, or bucket in this case.
this also has to be done due to the nature of the rock you dataset, which has fewer long passwords, and more medium length passwords.

In [69]:
# adding a length column to sample passwords of different length accurately

df['length']=df['password'].astype(str).str.len()

In [73]:
#checking columnn
df

Unnamed: 0,password,length
0,123456,6
1,12345,5
2,123456789,9
3,password,8
4,iloveyou,8
...,...,...
14344092,"xCvBnM,",7
14344093,ie168,5
14344094,abygurl69,9
14344095,a6_123,6


In [81]:
#creating length buckets

bins=[0,4,8,12,16,100]
labels=["0-4","5-8", "9-12", "13-16", "17+"]

df["bucket"]=pd.cut(df["length"],bins=bins, labels=labels, include_lowest=True)
df

Unnamed: 0,password,length,bucket
0,123456,6,5-8
1,12345,5,5-8
2,123456789,9,9-12
3,password,8,5-8
4,iloveyou,8,5-8
...,...,...,...
14344092,"xCvBnM,",7,5-8
14344093,ie168,5,5-8
14344094,abygurl69,9,9-12
14344095,a6_123,6,5-8


In [85]:
#deciding the number of samples per interval
#creating a dictionary with required samples from each category to stimulate the rock you set
target_sizes={
    "0-4": 10000,
    "5-8": 140000,
    "9-12": 110000,
    "13-16": 30000,
    "17+": 10000
}

In [87]:
samples = []

for bucket, size in target_sizes.items():
    group = df[df["bucket"] == bucket]

    if len(group) >= size:
        samples.append(group.sample(size, random_state=42))
    else:
        samples.append(group)

sampled_df = pd.concat(samples).reset_index(drop=True)


In [89]:
sampled_df

Unnamed: 0,password,length,bucket
0,agus,4,0-4
1,roar,4,0-4
2,malz,4,0-4
3,cuba,4,0-4
4,pcms,4,0-4
...,...,...,...
299995,changepassword___,17,17+
299996,stupidmotherfucker,18,17+
299997,luistekelounresto,17,17+
299998,themontles@yahoo.com,20,17+


# adding strength label using zxcvbn

We will be now using a python library zxcvbn to add strength to each password we have sampled. Since the sampling we have done is 300k, we haved used another library know as tqdm to know the rate of progress.

In [16]:
pip install zxcvbn

Note: you may need to restart the kernel to use updated packages.


In [91]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [93]:
from zxcvbn import zxcvbn
from tqdm import tqdm

In [95]:
tqdm.pandas()

def get_strength(password):
    try:
        result = zxcvbn(password)
        return result["score"]
    except:
        return None

sampled_df["strength"]=sampled_df["password"].progress_apply(get_strength)

100%|████████████████████████████████████████████████████████████████████████| 300000/300000 [02:42<00:00, 1848.58it/s]


In [98]:
sampled_df

Unnamed: 0,password,length,bucket,strength
0,agus,4,0-4,1.0
1,roar,4,0-4,1.0
2,malz,4,0-4,1.0
3,cuba,4,0-4,1.0
4,pcms,4,0-4,1.0
...,...,...,...,...
299995,changepassword___,17,17+,3.0
299996,stupidmotherfucker,18,17+,2.0
299997,luistekelounresto,17,17+,4.0
299998,themontles@yahoo.com,20,17+,4.0


In [108]:
sampled_df["strength"].value_counts()

strength
1.0    102678
2.0     98216
3.0     63857
4.0     32454
0.0      2790
Name: count, dtype: int64

# saving file as csv

In [113]:
sampled_df.to_csv(r"E:\major_project\dataset\sampled_df.csv",index=False)