# 1. Read data from the SQLite database

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sqlite3

In [3]:
# Open the SQLite file holding the password data set.
# The literal is a raw string, so single backslashes are already literal;
# doubling them put "\\" separators into the path itself.
con = sqlite3.connect(r"R:\Data Science\ZDS\Projects\Predict Password Strength\password_data.sqlite")

In [4]:
data = pd.read_sql_query("SELECT * FROM Users" , con)

In [5]:
data.shape

In [6]:
data.head(4)

# 2. Basic data cleaning

In [7]:
data.columns

In [8]:
data.drop(["index"] ,axis=1 , inplace=True)

In [9]:
data.head(4)

### Check duplicate rows

In [10]:
data.duplicated().sum()

### Checking missing values in columns

In [11]:
data.isnull().any()

In [12]:
data.isnull().any().sum()

### Check the data types of every feature

In [13]:
data.dtypes

### Checking whether the "strength" feature contains irrelevant values

In [14]:
data["strength"]

In [15]:
# unique() lists the distinct values, which makes irrelevant/unexpected labels easy to spot
data["strength"].unique()

# 3. Performing Semantic Analysis 

In [16]:
data.columns

In [17]:
type(data["password"][0])

### A. How many passwords consist only of numeric characters

In [18]:
data["password"].str.isnumeric()

In [19]:
data[data["password"].str.isnumeric()]

In [20]:
data[data["password"].str.isnumeric()].shape

### B. How many passwords consist only of upper-case characters

In [21]:
data[data["password"].str.isupper()]

### C. How many passwords consist only of alphabetic characters

In [22]:
data[data["password"].str.isupper()].shape

In [23]:
data[data["password"].str.isalpha()]

In [24]:
data[data["password"].str.isalpha()].shape

### D. How many passwords consist only of alphanumeric characters

In [25]:
data[data["password"].str.isalnum()]

In [26]:
data[data["password"].str.isalnum()].shape

### E. How many passwords are written in title case

In [27]:
data[data["password"].str.istitle()]

In [28]:
data[data["password"].str.istitle()].shape

### F. How many passwords contain at least one special character

In [29]:
data["password"]

In [30]:
import string

In [31]:
string.punctuation

In [32]:
def find_semantics(row):
    """Flag whether a password contains at least one punctuation character.

    Parameters
    ----------
    row : str
        A single password string.

    Returns
    -------
    int
        1 if any character of ``row`` is in ``string.punctuation``, otherwise 0.
    """
    # Always return an int: the previous version fell off the end of the loop
    # and returned None for passwords without punctuation, which produced a
    # mixed 1/None column when used with ``Series.apply``.
    return int(any(char in string.punctuation for char in row))

In [33]:
data["password"].apply(find_semantics)==1

In [34]:
data[data["password"].apply(find_semantics)==1]

In [35]:
data[data["password"].apply(find_semantics)==1].shape

# 4. Applying Feature Engineering

### length of every Password

In [36]:
data["password"][0]

In [37]:
len(data["password"][0])

In [38]:
data["password"].str.len()

In [39]:
data["length"] = data["password"].str.len()

### Frequency of Lowercase Characters

In [40]:
password = "Shan99"

In [41]:
[char for char in password if char.islower()]

In [42]:
len([char for char in password if char.islower()])

In [43]:
len([char for char in password if char.islower()])/len(password)

### Frequency of lowercase Characters 

In [44]:
def freq_lowercase(row):
    """Return the fraction of characters in ``row`` that are lower-case.

    Parameters
    ----------
    row : str
        A single password string.

    Returns
    -------
    float
        Number of lower-case characters divided by ``len(row)``;
        0.0 for an empty string.
    """
    # Guard against ZeroDivisionError on an empty password.
    if not row:
        return 0.0
    return sum(char.islower() for char in row) / len(row)

### Frequency of Uppercase Characters

In [45]:
def freq_uppercase(row):
    """Return the fraction of characters in ``row`` that are upper-case.

    Parameters
    ----------
    row : str
        A single password string.

    Returns
    -------
    float
        Number of upper-case characters divided by ``len(row)``;
        0.0 for an empty string.
    """
    # Guard against ZeroDivisionError on an empty password.
    if not row:
        return 0.0
    return sum(char.isupper() for char in row) / len(row)

### Frequency of Numeric Characters 

In [46]:
def freq_numerical_case(row):
    """Return the fraction of characters in ``row`` that are digits.

    Parameters
    ----------
    row : str
        A single password string.

    Returns
    -------
    float
        Number of characters for which ``str.isdigit()`` is true divided by
        ``len(row)``; 0.0 for an empty string.
    """
    # Guard against ZeroDivisionError on an empty password.
    if not row:
        return 0.0
    return sum(char.isdigit() for char in row) / len(row)

In [47]:
# Add one character-class ratio column per helper, each rounded to 3 decimals.
for ratio_column, ratio_func in (
    ("lowercase_freq", freq_lowercase),
    ("uppercase_freq", freq_uppercase),
    ("digit_freq", freq_numerical_case),
):
    data[ratio_column] = np.round(data["password"].apply(ratio_func), 3)

In [48]:
data.head(3)

### Frequency of Special-case Characters

In [49]:
def freq_special_case(row):
    """Count the characters in ``row`` that are neither letters nor digits.

    Despite the name, this returns a raw count rather than a ratio; the
    caller divides by the password length afterwards.

    Parameters
    ----------
    row : str
        A single password string.

    Returns
    -------
    int
        Number of characters for which both ``isalpha()`` and ``isdigit()``
        are false (whitespace therefore counts as special).
    """
    return sum(1 for symbol in row if not (symbol.isalpha() or symbol.isdigit()))

In [50]:
data["special_char_freq"] = np.round(data["password"].apply(freq_special_case) , 3)

In [51]:
data.head(5)

In [52]:
data["special_char_freq"]/data["length"]

In [53]:
# Normalise the special-character count by password length.
# Recomputed from the passwords (instead of dividing the column by itself in
# place) so re-running this cell cannot normalise twice, and rounded to
# 3 decimals like the other *_freq columns.
data["special_char_freq"] = np.round(data["password"].apply(freq_special_case) / data["length"], 3)

In [54]:
data.head(5)

# 5. Performing Descriptive Statistics

In [55]:
data.columns

In [56]:
data[['length' , 'strength']]

In [57]:
data[['length' , 'strength']].groupby(['strength']).agg(["min" , "max", "mean" , "median"])

In [58]:
# Print min/max/mean/median of every engineered feature per strength class.
cols = [
    'length',
    'lowercase_freq',
    'uppercase_freq',
    'digit_freq',
    'special_char_freq',
]
summary_stats = ["min", "max", "mean", "median"]
for col in cols:
    per_class = data[[col, 'strength']].groupby(['strength']).agg(summary_stats)
    # Feature name, its table, then a blank-line separator.
    print(col, per_class, '\n', sep='\n')

In [59]:
data.columns

In [60]:
# Draw one box plot per engineered feature, grouped by password strength,
# on a 3x2 grid of axes. No figsize is passed, so the default figure size is used.
fig , ((ax1 , ax2) , (ax3 , ax4) , (ax5 , ax6)) = plt.subplots(3 , 2)

sns.boxplot(x="strength" , y='length' , hue="strength" , ax=ax1 , data=data)
sns.boxplot(x="strength" , y='lowercase_freq' , hue="strength" , ax=ax2 , data=data)
sns.boxplot(x="strength" , y='uppercase_freq' , hue="strength" , ax=ax3 , data=data)
sns.boxplot(x="strength" , y='digit_freq' , hue="strength" , ax=ax4 , data=data)
# Five features on six axes: ax6 receives no plot.
sns.boxplot(x="strength" , y='special_char_freq' , hue="strength" , ax=ax5 , data=data)

In [61]:
# 3x2 grid of per-strength box plots on a 15x7 canvas with extra row spacing.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, figsize=(15, 7))

feature_axes = (
    ('length', ax1),
    ('lowercase_freq', ax2),
    ('uppercase_freq', ax3),
    ('digit_freq', ax4),
    ('special_char_freq', ax5),
)
for feature_name, axis in feature_axes:
    sns.boxplot(x="strength", y=feature_name, hue="strength", ax=axis, data=data)

# ax6 stays empty: only five features are plotted.
plt.subplots_adjust(hspace=0.6)

# 6. Feature Importance (how to figure out the important features)

In [62]:
data.columns

In [63]:
  def get_dist(data , feature):
      """Plot how ``feature`` is distributed within each password-strength class.

      Draws a two-panel figure: a violin plot of ``feature`` per strength
      class on the left, and one density curve per class (0, 1, 2) overlaid
      on the right.

      Parameters
      ----------
      data : pandas.DataFrame
          Frame containing a ``strength`` column and the ``feature`` column.
      feature : str
          Name of the column to visualise.

      Returns
      -------
      None
          The figure is shown with ``plt.show()``.
      """
      plt.figure(figsize=(10,8))

      # Left panel: distribution of the feature per strength class.
      plt.subplot(1,2,1)
      sns.violinplot(x='strength' , y=feature , data=data)

      # Right panel: overlaid density curves, one per strength class.
      # sns.distplot is deprecated; kdeplot draws the density curve that
      # distplot(hist=False) used to produce.
      plt.subplot(1,2,2)
      for strength_label, curve_color in ((0, "red"), (1, "blue"), (2, "orange")):
          class_values = data.loc[data['strength'] == strength_label, feature]
          sns.kdeplot(x=class_values, color=curve_color, label=str(strength_label))
      plt.legend()
      plt.show()

In [64]:
import warnings
from warnings import filterwarnings
filterwarnings("ignore")

In [65]:
get_dist(data , "length" )

In [66]:
data.columns

In [67]:
get_dist(data , 'lowercase_freq')

In [68]:
get_dist(data , 'uppercase_freq')

In [69]:
get_dist(data , 'digit_freq')

In [70]:
get_dist(data , 'special_char_freq')

# 7. Applying TF-IDF on the data

In [71]:
data.head(4)

In [72]:
data

In [73]:
dataframe = data.sample(frac=1)

In [74]:
dataframe

In [75]:
x = list(dataframe["password"])

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [77]:
vectorizer = TfidfVectorizer(analyzer="char")

In [78]:
X = vectorizer.fit_transform(x)

In [79]:
X.shape

In [80]:
dataframe["password"].shape

In [81]:
X

In [82]:
X.toarray()

In [83]:
X.toarray()[0]

In [84]:
dataframe["password"]

In [85]:
len(vectorizer.get_feature_names_out())

In [86]:
vectorizer.get_feature_names_out()

In [87]:
df2 = pd.DataFrame(X.toarray() , columns=vectorizer.get_feature_names_out())

In [88]:
df2

# 8. Applying Machine Learning algorithm

In [89]:
# Attach the hand-crafted features to the TF-IDF matrix.
# df2 was built from X.toarray() and therefore has a fresh 0..n-1 index, while
# `dataframe` still carries the shuffled original row labels. Assigning a
# Series aligns on labels, which would pair every TF-IDF row with the features
# of a different password; assign the underlying values positionally instead.
df2["length"] = dataframe["length"].to_numpy()
df2["lowercase_freq"] = dataframe["lowercase_freq"].to_numpy()

In [90]:
df2

In [91]:
y = dataframe["strength"]

### split data into train & test

In [92]:
 from sklearn.model_selection import train_test_split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(df2, y, test_size=0.20)

In [94]:
X_train.shape

In [95]:
y_train.shape

In [96]:
from sklearn.linear_model import LogisticRegression

In [97]:
# The `multi_class` argument is deprecated in recent scikit-learn releases.
# With the default lbfgs solver a multinomial model is already fitted when
# there are three or more classes, so the argument is simply omitted.
clf = LogisticRegression()

In [98]:
clf.fit(X_train , y_train)

In [99]:
y_pred = clf.predict(X_test)

In [100]:
y_pred

In [101]:
from collections import Counter

In [102]:
Counter(y_pred)

# 9. Doing prediction on sample data (user-entered input)

In [103]:
password = "%@123abcd"

In [104]:
sample_array = np.array([password])

In [105]:
sample_matrix = vectorizer.transform(sample_array)

In [106]:
sample_matrix.toarray()

In [107]:
sample_matrix.toarray().shape

In [108]:
password

In [109]:
len(password)

In [110]:
[char for char in password if char.islower()]

In [111]:
len([char for char in password if char.islower()])/len(password)

In [112]:
np.append(sample_matrix.toarray() ,(9,0.444))

In [113]:
np.append(sample_matrix.toarray() ,(9,0.444)).shape

In [114]:
np.append(sample_matrix.toarray() ,(9,0.444)).reshape(1,101)

In [115]:
np.append(sample_matrix.toarray() ,(9,0.444)).reshape(1,101).shape

In [116]:
# Build the model input for `password`: TF-IDF columns followed by the two
# hand-crafted features (length, lower-case ratio rounded to 3 decimals as in
# training). The values are derived from `password` rather than typed in by
# hand, and reshape(1, -1) avoids hard-coding the vocabulary size.
new_matrix = np.append(
    sample_matrix.toarray(),
    (
        len(password),
        np.round(len([char for char in password if char.islower()]) / len(password), 3),
    ),
).reshape(1, -1)

In [117]:
clf.predict(new_matrix)

In [118]:
def predict(password=None):
    """Classify the strength of a password with the trained model.

    Uses the module-level ``vectorizer`` (fitted TF-IDF vectorizer) and
    ``clf`` (fitted classifier).

    Parameters
    ----------
    password : str, optional
        Password to classify. When omitted, the user is prompted for one,
        which is the original interactive behaviour.

    Returns
    -------
    str
        "Password is weak", "Password is normal" or "password is strong".
    """
    if password is None:
        password = input("Enter a password : ")

    # An empty password has nothing to score and would cause a division by
    # zero in the lower-case ratio below; treat it as weak.
    if not password:
        return "Password is weak"

    sample_array = np.array([password])
    sample_matrix = vectorizer.transform(sample_array)

    length_pass = len(password)
    length_normalised_lowercase = sum(char.islower() for char in password) / length_pass

    # TF-IDF columns followed by (length, lower-case ratio), the same column
    # order used for training. reshape(1, -1) keeps this independent of the
    # vocabulary size instead of hard-coding 101 columns.
    new_matrix2 = np.append(
        sample_matrix.toarray(), (length_pass, length_normalised_lowercase)
    ).reshape(1, -1)

    # clf.predict returns a one-element array; compare its scalar label.
    result = clf.predict(new_matrix2)[0]

    if result == 0:
        return "Password is weak"
    if result == 1:
        return "Password is normal"
    return "password is strong"

In [119]:
predict()

Enter a password : ssssss


In [120]:
predict()

Enter a password : sssssssssssss


# 10. Model evaluation

In [125]:
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report

In [126]:
accuracy_score(y_test , y_pred)

In [127]:
confusion_matrix(y_test , y_pred)

In [128]:
print(classification_report(y_test , y_pred))