This for the data set which is downloaded from online. from the website "https://www.cs.cmu.edu/~keystroke/"

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the merged dataset
merged_df = pd.read_csv('DSL-StrongPasswordData.csv')

# Separate features and target
X = merged_df.drop(columns=['subject', 'sessionIndex', 'rep'])  # Features
y = merged_df['subject']  # Target

In [2]:
print("Data Frame dimensions:", merged_df.shape)
print("X_train dimensions:", X.shape)
print("y_train dimensions:", y.shape)

Data Frame dimensions: (20400, 34)
X_train dimensions: (20400, 31)
y_train dimensions: (20400,)


In [3]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [4]:
from sklearn.utils import shuffle

# Shuffle the training and test sets
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

In [5]:
# Print dimensions of the datasets
print("X_train dimensions:", X_train.shape)
print("y_train dimensions:", y_train.shape)
print("X_test dimensions:", X_test.shape)
print("y_test dimensions:", y_test.shape)

X_train dimensions: (14280, 31)
y_train dimensions: (14280,)
X_test dimensions: (6120, 31)
y_test dimensions: (6120,)


## Random Forest

In [6]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [7]:
# Calculate metrics for training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')
train_confusion = confusion_matrix(y_train, y_train_pred)
train_classification_report = classification_report(y_train, y_train_pred)

# Calculate metrics for test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')
test_confusion = confusion_matrix(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)

In [8]:
# Print training metrics
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1-Score: {train_f1:.4f}")
print("Confusion Matrix:\n", train_confusion)
print("Classification Report:\n", train_classification_report)

Training Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Confusion Matrix:
 [[280   0   0 ...   0   0   0]
 [  0 280   0 ...   0   0   0]
 [  0   0 280 ...   0   0   0]
 ...
 [  0   0   0 ... 280   0   0]
 [  0   0   0 ...   0 280   0]
 [  0   0   0 ...   0   0 280]]
Classification Report:
               precision    recall  f1-score   support

        s002       1.00      1.00      1.00       280
        s003       1.00      1.00      1.00       280
        s004       1.00      1.00      1.00       280
        s005       1.00      1.00      1.00       280
        s007       1.00      1.00      1.00       280
        s008       1.00      1.00      1.00       280
        s010       1.00      1.00      1.00       280
        s011       1.00      1.00      1.00       280
        s012       1.00      1.00      1.00       280
        s013       1.00      1.00      1.00       280
        s015       1.00      1.00      1.00       280
        s016       1.00      1.

In [9]:
# Print test metrics
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_classification_report)


Test Metrics:
Accuracy: 0.9292
Precision: 0.9302
Recall: 0.9292
F1-Score: 0.9287
Confusion Matrix:
 [[ 97   0   0 ...   0   0   0]
 [  0 109   1 ...   0   0   0]
 [  0   0 108 ...   0   0   0]
 ...
 [  1   0   0 ... 115   2   0]
 [  0   0   0 ...   0 109   1]
 [  0   0   0 ...   0   0  97]]
Classification Report:
               precision    recall  f1-score   support

        s002       0.82      0.81      0.82       120
        s003       0.96      0.91      0.94       120
        s004       0.90      0.90      0.90       120
        s005       0.90      0.95      0.93       120
        s007       0.88      0.82      0.85       120
        s008       0.91      0.89      0.90       120
        s010       0.94      0.97      0.96       120
        s011       0.97      0.87      0.92       120
        s012       0.93      0.93      0.93       120
        s013       0.97      0.95      0.96       120
        s015       0.95      0.90      0.92       120
        s016       0.90      0.98 

import joblib

Save the model 
joblib.dump(rf_model, 'rf_modeldata.joblib')

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import shuffle
import joblib

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=3, 
                           scoring='accuracy', 
                           n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_test_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

# Calculate metrics for training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, average='macro')
train_recall = recall_score(y_train, y_train_pred, average='macro')
train_f1 = f1_score(y_train, y_train_pred, average='macro')
train_confusion = confusion_matrix(y_train, y_train_pred)
train_classification_report = classification_report(y_train, y_train_pred)

# Calculate metrics for test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')
test_confusion = confusion_matrix(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
201 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\satvi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\satvi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\satvi\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\satvi\AppData\Local\Programs\Python\Python310\l

In [11]:
# Print training metrics
print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1-Score: {train_f1:.4f}")
print("Confusion Matrix:\n", train_confusion)
print("Classification Report:\n", train_classification_report)

Training Metrics:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
Confusion Matrix:
 [[280   0   0 ...   0   0   0]
 [  0 280   0 ...   0   0   0]
 [  0   0 280 ...   0   0   0]
 ...
 [  0   0   0 ... 280   0   0]
 [  0   0   0 ...   0 280   0]
 [  0   0   0 ...   0   0 280]]
Classification Report:
               precision    recall  f1-score   support

        s002       1.00      1.00      1.00       280
        s003       1.00      1.00      1.00       280
        s004       1.00      1.00      1.00       280
        s005       1.00      1.00      1.00       280
        s007       1.00      1.00      1.00       280
        s008       1.00      1.00      1.00       280
        s010       1.00      1.00      1.00       280
        s011       1.00      1.00      1.00       280
        s012       1.00      1.00      1.00       280
        s013       1.00      1.00      1.00       280
        s015       1.00      1.00      1.00       280
        s016       1.00      1.

In [12]:
# Print test metrics
print("\nTest Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_classification_report)


Test Metrics:
Accuracy: 0.9351
Precision: 0.9363
Recall: 0.9351
F1-Score: 0.9346
Confusion Matrix:
 [[ 95   0   0 ...   0   0   0]
 [  0 110   2 ...   0   0   0]
 [  0   0 108 ...   0   0   0]
 ...
 [  2   0   0 ... 115   2   0]
 [  0   0   0 ...   0 109   0]
 [  0   0   0 ...   0   0 102]]
Classification Report:
               precision    recall  f1-score   support

        s002       0.83      0.79      0.81       120
        s003       0.96      0.92      0.94       120
        s004       0.88      0.90      0.89       120
        s005       0.93      0.95      0.94       120
        s007       0.88      0.88      0.88       120
        s008       0.95      0.88      0.92       120
        s010       0.94      0.98      0.96       120
        s011       0.98      0.88      0.93       120
        s012       0.90      0.95      0.93       120
        s013       0.98      0.97      0.97       120
        s015       0.93      0.91      0.92       120
        s016       0.90      0.97 

## KNN

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Define the parameter grid for grid search
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15,17,19,21,25]}

# Initialize the KNN model
knn = KNeighborsClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Save the best model
best_knn = grid_search.best_estimator_
# joblib.dump(best_knn, 'knn_best.joblib')

# Make predictions using the best model
y_train_pred_best = best_knn.predict(X_train)
y_test_pred_best = best_knn.predict(X_test)

# Calculate metrics for training set (best model)
train_accuracy_best = accuracy_score(y_train, y_train_pred_best)
train_precision_best = precision_score(y_train, y_train_pred_best, average='macro')
train_recall_best = recall_score(y_train, y_train_pred_best, average='macro')
train_f1_best = f1_score(y_train, y_train_pred_best, average='macro')
# train_confusion_best = confusion_matrix(y_train, y_train_pred_best)
train_classification_report_best = classification_report(y_train, y_train_pred_best)

# Calculate metrics for test set (best model)
test_accuracy_best = accuracy_score(y_test, y_test_pred_best)
test_precision_best = precision_score(y_test, y_test_pred_best, average='macro')
test_recall_best = recall_score(y_test, y_test_pred_best, average='macro')
test_f1_best = f1_score(y_test, y_test_pred_best, average='macro')
# test_confusion_best = confusion_matrix(y_test, y_test_pred_best)
test_classification_report_best = classification_report(y_test, y_test_pred_best)

# Print best parameters and metrics
print("Best Parameters:", grid_search.best_params_)
print("\nTraining Metrics (Best Model):")
print(f"Accuracy: {train_accuracy_best:.4f}")
print(f"Precision: {train_precision_best:.4f}")
print(f"Recall: {train_recall_best:.4f}")
print(f"F1-Score: {train_f1_best:.4f}")
# print("Confusion Matrix:\n", train_confusion_best)
print("Classification Report:\n", train_classification_report_best)

print("\nTest Metrics (Best Model):")
print(f"Accuracy: {test_accuracy_best:.4f}")
print(f"Precision: {test_precision_best:.4f}")
print(f"Recall: {test_recall_best:.4f}")
print(f"F1-Score: {test_f1_best:.4f}")
# print("Confusion Matrix:\n", test_confusion_best)
print("Classification Report:\n", test_classification_report_best)


Best Parameters: {'n_neighbors': 5}

Training Metrics (Best Model):
Accuracy: 0.8245
Precision: 0.8368
Recall: 0.8245
F1-Score: 0.8251
Classification Report:
               precision    recall  f1-score   support

        s002       0.51      0.85      0.64       280
        s003       0.79      0.89      0.84       280
        s004       0.68      0.82      0.74       280
        s005       0.77      0.92      0.84       280
        s007       0.75      0.82      0.79       280
        s008       0.75      0.80      0.78       280
        s010       0.84      0.93      0.88       280
        s011       0.94      0.85      0.90       280
        s012       0.91      0.87      0.89       280
        s013       0.82      0.84      0.83       280
        s015       0.82      0.78      0.80       280
        s016       0.62      0.90      0.74       280
        s017       0.88      0.95      0.91       280
        s018       0.89      0.82      0.86       280
        s019       0.86      0

## SVM

In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import joblib

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel types
    'gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
}

# Initialize the SVM model
svm = SVC()

# Perform grid search with cross-validation
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svm.fit(X_train, y_train)

# # Save the best SVM model
best_svm = grid_search_svm.best_estimator_
# joblib.dump(best_svm, 'svm_best.joblib')

# Make predictions using the best SVM model
y_train_pred_best_svm = best_svm.predict(X_train)
y_test_pred_best_svm = best_svm.predict(X_test)

# Calculate metrics for the training set (best SVM model)
train_accuracy_best_svm = accuracy_score(y_train, y_train_pred_best_svm)
train_precision_best_svm = precision_score(y_train, y_train_pred_best_svm, average='macro')
train_recall_best_svm = recall_score(y_train, y_train_pred_best_svm, average='macro')
train_f1_best_svm = f1_score(y_train, y_train_pred_best_svm, average='macro')
train_confusion_best_svm = confusion_matrix(y_train, y_train_pred_best_svm)
train_classification_report_best_svm = classification_report(y_train, y_train_pred_best_svm)

# Calculate metrics for the test set (best SVM model)
test_accuracy_best_svm = accuracy_score(y_test, y_test_pred_best_svm)
test_precision_best_svm = precision_score(y_test, y_test_pred_best_svm, average='macro')
test_recall_best_svm = recall_score(y_test, y_test_pred_best_svm, average='macro')
test_f1_best_svm = f1_score(y_test, y_test_pred_best_svm, average='macro')
test_confusion_best_svm = confusion_matrix(y_test, y_test_pred_best_svm)
test_classification_report_best_svm = classification_report(y_test, y_test_pred_best_svm)

# Print best parameters and metrics
print("Best Parameters for SVM:", grid_search_svm.best_params_)
print("\nTraining Metrics (Best SVM Model):")
print(f"Accuracy: {train_accuracy_best_svm:.4f}")
print(f"Precision: {train_precision_best_svm:.4f}")
print(f"Recall: {train_recall_best_svm:.4f}")
print(f"F1-Score: {train_f1_best_svm:.4f}")
print("Confusion Matrix:\n", train_confusion_best_svm)
print("Classification Report:\n", train_classification_report_best_svm)

print("\nTest Metrics (Best SVM Model):")
print(f"Accuracy: {test_accuracy_best_svm:.4f}")
print(f"Precision: {test_precision_best_svm:.4f}")
print(f"Recall: {test_recall_best_svm:.4f}")
print(f"F1-Score: {test_f1_best_svm:.4f}")
print("Confusion Matrix:\n", test_confusion_best_svm)
print("Classification Report:\n", test_classification_report_best_svm)


Best Parameters for SVM: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}

Training Metrics (Best SVM Model):
Accuracy: 0.9912
Precision: 0.9913
Recall: 0.9912
F1-Score: 0.9912
Confusion Matrix:
 [[276   0   0 ...   0   0   0]
 [  0 278   1 ...   0   0   0]
 [  2   1 271 ...   0   0   0]
 ...
 [  0   0   0 ... 276   3   0]
 [  0   0   0 ...   2 276   0]
 [  0   0   0 ...   0   0 274]]
Classification Report:
               precision    recall  f1-score   support

        s002       0.95      0.99      0.97       280
        s003       0.98      0.99      0.99       280
        s004       0.99      0.97      0.98       280
        s005       1.00      1.00      1.00       280
        s007       0.96      0.96      0.96       280
        s008       0.99      0.96      0.97       280
        s010       0.99      1.00      0.99       280
        s011       0.99      0.99      0.99       280
        s012       1.00      0.99      0.99       280
        s013       1.00      0.99      0.99       

In [15]:
## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# Encode labels as integers if they are not already
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Define the parameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']  # 'liblinear' is good for small datasets, 'saga' can handle large datasets
}

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Perform grid search with cross-validation
grid_search_log_reg = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_log_reg.fit(X_train, y_train)

# Get the best model from grid search
best_log_reg = grid_search_log_reg.best_estimator_

# Make predictions on the training set using the best model
y_train_pred_best = best_log_reg.predict(X_train)

# Make predictions on the test set using the best model
y_test_pred_best = best_log_reg.predict(X_test)

# Calculate metrics for the training set
train_accuracy_best = accuracy_score(y_train, y_train_pred_best)
train_precision_best = precision_score(y_train, y_train_pred_best, average='macro')
train_recall_best = recall_score(y_train, y_train_pred_best, average='macro')
train_f1_best = f1_score(y_train, y_train_pred_best, average='macro')
train_confusion_best = confusion_matrix(y_train, y_train_pred_best)
train_classification_report_best = classification_report(y_train, y_train_pred_best)

# Calculate metrics for the test set
test_accuracy_best = accuracy_score(y_test, y_test_pred_best)
test_precision_best = precision_score(y_test, y_test_pred_best, average='macro')
test_recall_best = recall_score(y_test, y_test_pred_best, average='macro')
test_f1_best = f1_score(y_test, y_test_pred_best, average='macro')
test_confusion_best = confusion_matrix(y_test, y_test_pred_best)
test_classification_report_best = classification_report(y_test, y_test_pred_best)

# Print best parameters and metrics
print("Best Parameters:", grid_search_log_reg.best_params_)
print("\nTraining Metrics (Best Model):")
print(f"Accuracy: {train_accuracy_best:.4f}")
print(f"Precision: {train_precision_best:.4f}")
print(f"Recall: {train_recall_best:.4f}")
print(f"F1-Score: {train_f1_best:.4f}")
print("Confusion Matrix:\n", train_confusion_best)
print("Classification Report:\n", train_classification_report_best)

print("\nTest Metrics (Best Model):")
print(f"Accuracy: {test_accuracy_best:.4f}")
print(f"Precision: {test_precision_best:.4f}")
print(f"Recall: {test_recall_best:.4f}")
print(f"F1-Score: {test_f1_best:.4f}")
print("Confusion Matrix:\n", test_confusion_best)
print("Classification Report:\n", test_classification_report_best)


Best Parameters: {'C': 100, 'solver': 'liblinear'}

Training Metrics (Best Model):
Accuracy: 0.8267
Precision: 0.8271
Recall: 0.8266
F1-Score: 0.8247
Confusion Matrix:
 [[232   6   2 ...   0   1   0]
 [  7 258  14 ...   0   1   0]
 [  6   7 259 ...   0   0   0]
 ...
 [  0   0   0 ... 309   6   3]
 [  5   0   2 ...   3 259   5]
 [  0   0   0 ...   0   6 215]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.71      0.70       328
           1       0.83      0.78      0.80       331
           2       0.80      0.79      0.80       327
           3       0.83      0.86      0.85       327
           4       0.81      0.66      0.73       346
           5       0.78      0.70      0.74       313
           6       0.89      0.90      0.90       303
           7       0.86      0.83      0.85       325
           8       0.86      0.76      0.81       307
           9       0.85      0.84      0.84       328
          10       0