In [2]:
# Import the Libraries
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Loading the dataset

data = pd.read_excel(r"C:\Users\Shashi\Model Building\Ensemble Models\Ensemble_Password_Strength.xlsx")

In [5]:
# Checking the Head of data

data.head()

Unnamed: 0,characters,characters_strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [6]:
# Importing the Libraries

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Checking the Value Count

data['characters'].value_counts()

kzde5577         1
anit1213         1
eqiyabm41        1
saifur34         1
x1wrw29bw1ojh    1
                ..
94311163nobp     1
hamqrc6          1
luthien123       1
alodise603       1
akosi091692      1
Name: characters, Length: 1999, dtype: int64

In [8]:
# Checking the duplicates

data.duplicated().sum()

0

In [9]:
# Checking the Missing Values

data.isna().sum()

characters             0
characters_strength    0
dtype: int64

In [10]:
data

Unnamed: 0,characters,characters_strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
1994,patososo2012,1
1995,sf2004824,1
1996,marco90,0
1997,jebekk1,0


In [11]:
# Checking the data types

data.dtypes

characters             object
characters_strength     int64
dtype: object

In [14]:
data = data.astype('string')

In [15]:
# Encoder - LabelEncoder

from sklearn.preprocessing import LabelEncoder

In [16]:
enc = LabelEncoder()

data['characters'] = enc.fit_transform(data['characters'])

In [17]:
data = data.astype('float')

In [18]:
# Standardization - Scaling Min = 0, Max = 1

def norm_fun(i):
    x = (i - i.min())/(i.max() - i.min())
    
    return x

In [19]:
data_norm = norm_fun(data)
data = pd.DataFrame(data_norm)
data.describe()

Unnamed: 0,characters,characters_strength
count,1999.0,1999.0
mean,0.5,0.857929
std,0.288892,0.34921
min,0.0,0.0
25%,0.25,1.0
50%,0.5,1.0
75%,0.75,1.0
max,1.0,1.0


In [20]:
# Checking the data Correlation

data.corr()

Unnamed: 0,characters,characters_strength
characters,1.0,-0.087937
characters_strength,-0.087937,1.0


In [21]:
# Spliting the data

X = data.drop('characters_strength', axis = 1)
Y = data.characters_strength

In [22]:
# Importing the Train Test Split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [23]:
# Creating the Model

from sklearn import tree

model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

# Checking the score of
model.score(X, Y)

0.959479739869935

In [24]:
# Prediction on Test Data
y_pred = model.predict(x_test)
pd.crosstab(y_pred, y_test)

np.mean(y_pred == y_test) # Test Data Accuracy 

0.7975

In [25]:
# Bagging

from sklearn import tree
clftree = tree.DecisionTreeClassifier()
from sklearn.ensemble import BaggingClassifier


bag_clf = BaggingClassifier(base_estimator = clftree, n_estimators = 1000,
                            bootstrap = True, n_jobs = 1, random_state = 42)

bag_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluation on Testing Data
confusion_matrix(y_test, bag_clf.predict(x_test))
accuracy_score(y_test, bag_clf.predict(x_test))

# Evaluation on Training Data
confusion_matrix(y_train, bag_clf.predict(x_train))
accuracy_score(y_train, bag_clf.predict(x_train))

1.0

In [26]:
# Ada Boosting

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(learning_rate = 0.02, n_estimators = 5000)

ada_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluation on Testing Data
confusion_matrix(y_test, ada_clf.predict(x_test))
accuracy_score(y_test, ada_clf.predict(x_test))

# Evaluation on Training Data
accuracy_score(y_train, ada_clf.predict(x_train))


0.8530331457160726

In [27]:
# Ada Boosting - model 2

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(learning_rate = 0.5, n_estimators=5000, random_state=0)

ada_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluation on Testing Data
confusion_matrix(y_test, ada_clf.predict(x_test))
accuracy_score(y_test, ada_clf.predict(x_test))

# Evaluation on Training Data
accuracy_score(y_train, ada_clf.predict(x_train))

0.8686679174484052

In [28]:
# Gradient Boosting

from sklearn.ensemble import GradientBoostingClassifier

boost_clf = GradientBoostingClassifier()

boost_clf.fit(x_train, y_train)

from sklearn.metrics import accuracy_score, confusion_matrix

confusion_matrix(y_test, boost_clf.predict(x_test))
accuracy_score(y_test, boost_clf.predict(x_test))

0.8725

In [31]:
# xgboost - want output to be numeric

import xgboost as xgb

xgb_clf = xgb.XGBClassifier(max_depths = 5, n_estimators = 10000, learning_rate = 0.3, n_jobs = -1)

# n_jobs – Number of parallel threads used to run xgboost.
# learning_rate (float) – Boosting learning rate (xgb’s “eta”)


xgb_clf.fit(x_train, y_train)

Parameters: { "max_depths" } are not used.



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.3, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_depths=5, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=10000,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', ...)

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluation on Testing Data
confusion_matrix(y_test, xgb_clf.predict(x_test))
accuracy_score(y_test, xgb_clf.predict(x_test))


0.77

In [33]:
# GridsearchCV
xgb_clf = xgb.XGBClassifier(n_estimators = 500, learning_rate = 0.1, random_state = 42)

param_test1 = {'max_depth': range(3,10,2), 'gamma': [0.1, 0.2, 0.3],
               'subsample': [0.8, 0.9], 'colsample_bytree': [0.8, 0,9],
               'reg_alpha': [1e-2, 0.1, 1]}

# Grid Search
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(xgb_clf, param_test1, n_jobs = -1, cv = 5, scoring = 'accuracy')

grid_search.fit(x_train, y_train)

grid_search.best_params_
cv_xg_clf = grid_search.best_estimator_

# Evaluation on Testing Data with model with hyperparameter
accuracy_score(y_test, cv_xg_clf.predict(x_test))



0.8775

In [34]:
### k-Nearest Neighbors (k-NN) with GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

params_knn = {'n_neighbors': np.arange(1, 25)}

knn_gs = GridSearchCV(knn, params_knn, cv = 5)

knn_gs.fit(x_train, y_train)
knn_best = knn_gs.best_estimator_
knn_best

KNeighborsClassifier(n_neighbors=19)

In [35]:
### Random Forest Classifier with GridSearchCV

from sklearn import datasets, linear_model, svm, neighbors, naive_bayes, ensemble

rf = ensemble.RandomForestClassifier(random_state = 0)

params_rf = {'n_estimators': [50, 100, 200]}

rf_gs = GridSearchCV(rf, params_rf, cv = 5)

rf_gs.fit(x_train, y_train)
rf_best = rf_gs.best_estimator_

In [36]:
### Logistic Regression with GridSearchCV
log_reg = linear_model.LogisticRegression(random_state = 123, solver = "liblinear", 
                                          penalty = "l2", max_iter = 5000)
C = np.logspace(1, 4, 10)
params_lr = dict(C = C)

lr_gs = GridSearchCV(log_reg, params_lr, cv = 5, verbose = 0)

lr_gs.fit(x_train, y_train)
lr_best = lr_gs.best_estimator_

In [37]:
# Combine all three Voting Ensembles

from sklearn.ensemble import VotingClassifier

estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', lr_best)]

# Instantiate the voting classifier
ensemble_H = VotingClassifier(estimators, voting = "hard")

# Fit classifier with the training data
hard_voting = ensemble_H.fit(x_train, y_train)

In [38]:
# Save the voting classifier

import pickle

pickle.dump(hard_voting, open('hard_voting.pkl', 'wb'))

# Loading a saved model
model = pickle.load(open('hard_voting.pkl', 'rb'))
model

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=19)),
                             ('rf',
                              RandomForestClassifier(n_estimators=50,
                                                     random_state=0)),
                             ('log_reg',
                              LogisticRegression(C=10.0, max_iter=5000,
                                                 random_state=123,
                                                 solver='liblinear'))])

In [39]:
print("knn_gs.score: ", knn_best.score(x_test, y_test))
# Output: knn_gs.score:

print("rf_gs.score: ", rf_best.score(x_test, y_test))
# Output: rf_gs.score:

print("log_reg.score: ", lr_best.score(x_test, y_test))
# Output: log_reg.score:

knn_gs.score:  0.8825
rf_gs.score:  0.8
log_reg.score:  0.8825


In [40]:
print("ensemble.score: ", ensemble_H.score(x_test, y_test))
# Output: ensemble.score: # Majority

ensemble.score:  0.8825


In [41]:
# Soft Voting

# Combine all 3 models using VotingClassifier with voting = "soft" parameter
estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', lr_best)]

ensemble_S = VotingClassifier(estimators, voting = "soft")

soft_voting = ensemble_S.fit(x_train, y_train)

In [42]:
# Soft Voting

# Save model
pickle.dump(soft_voting, open('soft_voting.pkl', 'wb'))


# Load the saved model
model = pickle.load(open('soft_voting.pkl', 'rb'))
model

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=19)),
                             ('rf',
                              RandomForestClassifier(n_estimators=50,
                                                     random_state=0)),
                             ('log_reg',
                              LogisticRegression(C=10.0, max_iter=5000,
                                                 random_state=123,
                                                 solver='liblinear'))],
                 voting='soft')

In [43]:
print("knn_gs.score: ", knn_gs.score(x_test, y_test))
# Output: knn_gs.score:

print("rf_gs.score: ", rf_gs.score(x_test, y_test))
# Output: rf_gs.score:

print("log_reg.score: ", lr_gs.score(x_test, y_test))
# Output: log_reg.score:

knn_gs.score:  0.8825
rf_gs.score:  0.8
log_reg.score:  0.8825


In [44]:
print("ensemble.score: ", ensemble_S.score(x_test, y_test))
# Output: ensemble.score: AVG or WT.AVG

ensemble.score:  0.8825


In [45]:
# Import Libraries

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [46]:
# Stacking

# Base estimators

estimators = [('rf', RandomForestClassifier(n_estimators = 10, random_state = 42)),
              ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state = 42)))]

In [47]:
# Meta Model stacked on top of base estimators

clf = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression())

In [48]:
# Fit the model on traing data

stacking = clf.fit(x_train, y_train)

In [49]:
# Accuracy

stacking.score(x_test, y_test)

0.8825

In [50]:
# Save the Stacking model 
pickle.dump(stacking, open('stacking_CharStrength.pkl', 'wb'))

In [51]:
# Load the saved model

model = pickle.load(open('stacking_CharStrength.pkl', 'rb'))
model

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=42)),
                               ('svr',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('linearsvc',
                                                 LinearSVC(random_state=42))]))],
                   final_estimator=LogisticRegression())

In [52]:
pred = model.predict(x_test)

In [53]:
pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [54]:
y_test

405     1.0
1189    1.0
674     1.0
1358    1.0
810     1.0
       ... 
1453    1.0
360     1.0
1809    1.0
769     1.0
563     1.0
Name: characters_strength, Length: 400, dtype: float64

## Result : This model score is more than 87%, By Performing Boosting, Bagging, Voting, Staking with Grid Search Cv, identifying the best score and best parameter, concluding by saying this model is right fit