# reading dataset

In [2]:
import pandas as pd
import re

In [3]:
df = pd.read_csv(r"E:\major_project\dataset\datasets\sampled_df.csv")

In [4]:
df

Unnamed: 0,password,length,bucket,strength
0,agus,4,0-4,1.0
1,roar,4,0-4,1.0
2,malz,4,0-4,1.0
3,cuba,4,0-4,1.0
4,pcms,4,0-4,1.0
...,...,...,...,...
299995,changepassword___,17,17+,3.0
299996,stupidmotherfucker,18,17+,2.0
299997,luistekelounresto,17,17+,4.0
299998,themontles@yahoo.com,20,17+,4.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   password  299997 non-null  object 
 1   length    300000 non-null  int64  
 2   bucket    300000 non-null  object 
 3   strength  299995 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 9.2+ MB


In [6]:
df.describe()

Unnamed: 0,length,strength
count,300000.0,299995.0
mean,9.146027,2.068358
std,3.312255,1.010141
min,1.0,0.0
25%,7.0,1.0
50%,8.5,2.0
75%,10.0,3.0
max,96.0,4.0


while cleaning the data sets, we have to look for duplicates, NaN values, empty value, unredeable characters, and valid length fo passwords.

# removing duplicates and empty passwords

In [9]:
#checking for duplicates
df.duplicated().sum()

27

In [10]:
df[df.duplicated()]

Unnamed: 0,password,length,bucket,strength
967,stfu,4,0-4,1.0
1002,bill,4,0-4,0.0
4506,jack,4,0-4,0.0
5849,333,3,0-4,0.0
6007,7,1,0-4,0.0
6068,love,4,0-4,0.0
6359,jonh,4,0-4,1.0
6543,,3,0-4,0.0
7639,fuck,4,0-4,0.0
7933,123,3,0-4,0.0


Here, we found NaN in the duplicated data. the amount of duplicated data we have is very minute in comparison to the total dataset we have. so we will drop all the duplicates, which will not affect our training.


In [12]:
#inplace is true since we dont need a new df
df.drop_duplicates(inplace =True)

In [13]:
df.duplicated().sum()

0

In [14]:
df.isnull().sum()

password    1
length      0
bucket      0
strength    5
dtype: int64

In [15]:
#ensuring there are no empty passwords in df
df=df[df['password'].str.len() > 0].copy()

#copy() is used to work in same df, else separate df is formed

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 299972 entries, 0 to 299999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   password  299972 non-null  object 
 1   length    299972 non-null  int64  
 2   bucket    299972 non-null  object 
 3   strength  299967 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 11.4+ MB


In [17]:
#dropping NaN password and strength found previously
df=df.dropna(subset=['password'])

In [18]:
df=df.dropna(subset=['strength'])

In [19]:
#checking for NaN again
df.isnull().sum()

password    0
length      0
bucket      0
strength    0
dtype: int64

# cleaaning unredeable characters

In [21]:
mask_unreadable = df['password'].str.contains(r'[^\x20-\x7E]', na=False)
mask_unreadable.sum()

584

In [22]:
#checking the type of values we found
df[mask_unreadable].tail(50)

Unnamed: 0,password,length,bucket,strength
298392,à¸à¸-à¸ à¸à¸ /à¸à¹à¸,20,17+,4.0
298410,à¸/-à¸¸à¸à¸à¸¶à¸à¸¶à¸à¸¸à¸à¹à¸,30,17+,4.0
298412,à¸¢à¸à¸³à¸à¸à¸²à¸·à¸µà¸·,24,17+,4.0
298458,à¸¢à¸¸à¸à¸ /à¸ à¸à¸¸,20,17+,4.0
298472,à¸à¸ à¸à¸à¸à¸à¸¶à¸à¸¶,21,17+,4.0
298565,à¸£à¸ªà¸à¸­à¸³à¸ªà¸³à¸,22,17+,4.0
298582,05377178831komikazebarÄ±s,25,17+,4.0
298683,à¸ªà¸à¸­à¸³à¸à¸µà¸à¸à¸·à¸ªà¸±,29,17+,4.0
298686,Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²Ã²,48,17+,1.0
298717,miamammaÃ¨isterica,18,17+,4.0


here we can see that characters such as"â¬, Â¬, Ã¸" are corrupted. But some are unicode of languages such as thai, arabic etc. so we will try to only remove the corrupt passwords, since it will be difficult and unaccurate to guess the original form of them.

In [24]:
#we make a list of corrupted characters produced from UTF-8 to latin-1
corrupted_char = [    'Ã', 'Â', 'â', 'Å', 'Ä', 'Æ', 
    'Ø', 'Ö', 'Ô', 'Õ', 
    'Û', 'Ù', 'Ú', 'Ü', 'Ý', 'Ÿ',
    'þ', 'ÿ','Ã±', 'Ã¡', 'Ã©', 'Ã³', 'Ãº', 'Ã¼', 'Ã§', 'Ã¨',
    'Â±', 'Â¬', 'Â£', 'Â®', 'Â©', 'Â¢',
    'â™', 'âœ', 'â€“', 'â€',]

#~ is used to negate the value
mask =~df['password'].str.contains('|'.join(map(re.escape,corrupted_char)), na=False)

#saving in a separate df
df_clean=df[mask].copy()

In [25]:
#checking again
mask_unreadable_v2 = df_clean['password'].str.contains(r'[^\x20-\x7E]', na=False)
mask_unreadable_v2.sum()

306

In [26]:
df_clean[mask_unreadable_v2]

Unnamed: 0,password,length,bucket,strength
2221,à¹à¹,4,0-4,0.0
6274,××××,4,0-4,0.0
39658,à¹/--/à¹,8,5-8,2.0
42111,montaña,7,5-8,2.0
85437,× ××¢×,6,5-8,1.0
...,...,...,...,...
299859,à¸à¸¶à¸à¸à¸¶à¸¸à¹à¸à¹,21,17+,4.0
299882,à¸ à¸¶-à¸à¸¸à¸à¹à¸à¸à¸,22,17+,4.0
299928,à¸à¸à¸ à¸¶à¸ à¸à¸//à¸¸,22,17+,4.0
299940,à¸à¸¢à¸à¸ªà¹à¸à¸à¸à¸²à¸à¸2,26,17+,4.0


# filtering lengths above 50 since they are unrealistic

In [28]:
df_clean = df_clean[df_clean['length']<=50].copy()
df_clean

Unnamed: 0,password,length,bucket,strength
0,agus,4,0-4,1.0
1,roar,4,0-4,1.0
2,malz,4,0-4,1.0
3,cuba,4,0-4,1.0
4,pcms,4,0-4,1.0
...,...,...,...,...
299995,changepassword___,17,17+,3.0
299996,stupidmotherfucker,18,17+,2.0
299997,luistekelounresto,17,17+,4.0
299998,themontles@yahoo.com,20,17+,4.0


# Addition of new features


for doing exploratary data analysis, we should begin with adding a few extra columns, which will help in our EDA, and understanding how strength has been added. 

Addition of 11 new features will be done sequentially

In [32]:
import pandas as pd
import numpy as np
import re
import math

# adding character counts

In [34]:
#adding lowercase feature
df_clean["lowercase_count"]=df_clean["password"].str.count(r'[a-z]')

#counting uppercase characters
df_clean["uppercase_count"]=df_clean["password"].str.count(r'[A-Z]')

#counting special characters
df_clean["special_count"]=df_clean["password"].str.count(r'[^a-zA-Z0-9]')

#counting digits
df_clean["digit_count"]=df_clean["password"].str.count(r'[a-z]')

In [35]:
#checking new columns
df_clean

Unnamed: 0,password,length,bucket,strength,lowercase_count,uppercase_count,special_count,digit_count
0,agus,4,0-4,1.0,4,0,0,4
1,roar,4,0-4,1.0,4,0,0,4
2,malz,4,0-4,1.0,4,0,0,4
3,cuba,4,0-4,1.0,4,0,0,4
4,pcms,4,0-4,1.0,4,0,0,4
...,...,...,...,...,...,...,...,...
299995,changepassword___,17,17+,3.0,14,0,3,14
299996,stupidmotherfucker,18,17+,2.0,18,0,0,18
299997,luistekelounresto,17,17+,4.0,17,0,0,17
299998,themontles@yahoo.com,20,17+,4.0,18,0,2,18


In [36]:
df_clean['special_count'].value_counts()

special_count
0     275800
1      15277
2       5183
3       1881
4        693
5        172
6        151
7         71
8         59
9         31
18        30
20        26
19        25
10        23
22        21
15        21
12        20
21        19
16        18
17        17
13        16
23        16
14        14
11        12
25        10
24         9
26         7
29         5
27         5
30         5
31         4
35         4
33         4
32         2
28         2
47         1
41         1
37         1
44         1
46         1
34         1
38         1
39         1
Name: count, dtype: int64

# adding flags, char_diversity and unique_chars

In [38]:
#if there are special characters, value is 1 else 0
df_clean["has_special"]=(df_clean['special_count']>0).astype(int)

#if there are digits,value is 1 else 0
df_clean["has_digit"]=(df_clean['digit_count']>0).astype(int)

df_clean

Unnamed: 0,password,length,bucket,strength,lowercase_count,uppercase_count,special_count,digit_count,has_special,has_digit
0,agus,4,0-4,1.0,4,0,0,4,0,1
1,roar,4,0-4,1.0,4,0,0,4,0,1
2,malz,4,0-4,1.0,4,0,0,4,0,1
3,cuba,4,0-4,1.0,4,0,0,4,0,1
4,pcms,4,0-4,1.0,4,0,0,4,0,1
...,...,...,...,...,...,...,...,...,...,...
299995,changepassword___,17,17+,3.0,14,0,3,14,1,1
299996,stupidmotherfucker,18,17+,2.0,18,0,0,18,0,1
299997,luistekelounresto,17,17+,4.0,17,0,0,17,0,1
299998,themontles@yahoo.com,20,17+,4.0,18,0,2,18,1,1


In [39]:
#unique_chars means the no. of unique values in a password
#for e.g. "a!122" has 4 unique values (a,!,1,2)

df_clean["unique_chars"] = df_clean['password'].apply(lambda x:len(set(x)))

#char_diversity combines lowercase,uppercase,digit and special chars
#if one of them is present, value of char_diversity increases

df_clean["char_diversity"]=(
    (df_clean['lowercase_count']>0).astype(int)+
    (df_clean['uppercase_count']>0).astype(int)+
    (df_clean['digit_count']>0).astype(int)+
    (df_clean['special_count']>0).astype(int)
)


In [40]:
df_clean

Unnamed: 0,password,length,bucket,strength,lowercase_count,uppercase_count,special_count,digit_count,has_special,has_digit,unique_chars,char_diversity
0,agus,4,0-4,1.0,4,0,0,4,0,1,4,2
1,roar,4,0-4,1.0,4,0,0,4,0,1,3,2
2,malz,4,0-4,1.0,4,0,0,4,0,1,4,2
3,cuba,4,0-4,1.0,4,0,0,4,0,1,4,2
4,pcms,4,0-4,1.0,4,0,0,4,0,1,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...
299995,changepassword___,17,17+,3.0,14,0,3,14,1,1,13,3
299996,stupidmotherfucker,18,17+,2.0,18,0,0,18,0,1,14,2
299997,luistekelounresto,17,17+,4.0,17,0,0,17,0,1,10,2
299998,themontles@yahoo.com,20,17+,4.0,18,0,2,18,1,1,13,3


# adding entropy,maxm repeat sequence and sequential patterns with their functions

In [42]:
#functions

#for entropy
def pass_entropy(x):
    if not x:
        return 0
    #counting the frequency of unique characters
    freq = pd.Series(list(x)).value_counts()
    prob = freq/len(x)
    #shanon entropy formula
    return -np.sum(prob*np.log2(prob))

# for sequential patterns ( such as abc, qwerty)
def has_sequence(x):
    x_lower=x.lower()
    sequence = [
        'abcdefghijklmnopqrstuvwxyz',
        '0123456789',
        'qwertyuiopasdfghjklzxcvbnm'
    ]
    for seq in sequence:
        for i in range(len(seq) - 2):
            if seq[i:i+3] in x_lower:
                return 1
    return 0

#for maximum repeated character in a password
def max_repeated(x):
    if not x:
        return 0
    return max((len(m.group(0)) for m in re.finditer(r'(.)\1*', x)))
    

In [43]:
#entropy
df_clean["entropy"] = df_clean['password'].apply(pass_entropy)

#sequential patterns
df_clean['seq_letters'] = df_clean['password'].apply(has_sequence)

#maximum repeated character count in a password
df_clean['max_repeat'] = df_clean['password'].apply(max_repeated)


KeyboardInterrupt: 

In [None]:
df_clean

In [None]:
df_clean.to_csv(r"E:\major_project\dataset\datasets\password_features.csv")