# Steps

1. Prepare data
2. 

In [12]:
import torch
import torch.nn.functional as F
import numpy as np

In [2]:
n_epochs = 5
lr = 0.01
n_folds = 5
lstm_input_size = 32
hidden_state_size = 256
n_layers = 2
dropout = 0.125
bidirectional = True
batch_size = 64
num_sequence_layers = 2
output_dim = 2                       # !!!!!!!!!!!!!!!!!!!!!!!!
num_time_steps = 30                 # !!!!!!!!!!!!!!
rnn_type = 'LSTM'

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device("cpu")


In [3]:
from LSTM import Bi_RNN
from build_datasets import CharacterDataset
from train import accuracy

In [6]:
model = Bi_RNN(lstm_input_size, hidden_state_size, batch_size=batch_size, output_dim=output_dim, num_layers=num_sequence_layers, rnn_type=rnn_type)
model.load_state_dict(torch.load("LSTM_trained_15000words_8epochs"))
model.eval()

Bi_RNN(
  (embedding): Embedding(101, 32, padding_idx=100)
  (lstm): LSTM(32, 256, num_layers=2, batch_first=True, dropout=0.125, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=8, bias=True)
  (linear3): Linear(in_features=8, out_features=2, bias=True)
  (tanh): Tanh()
  (maxpool): MaxPool1d(kernel_size=32, stride=32, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.125, inplace=False)
)

In [7]:
from readdata import read_passwords

In [8]:
def pytorch_predict(model, test_loader, device):
    model.eval()
    
    all_outputs = torch.tensor([], device=device)
    
    # deactivate autograd engine to reduce memory usage and increase speed
    with torch.no_grad():
        for data in test_loader:
            inputs = [i.to(device) for i in data[:-1]]
            
            outputs = model(*inputs)
            all_outputs = torch.cat((all_outputs, outputs), 0)
    #print(all_outputs)
    _, y_pred = torch.max(all_outputs, 1)
    y_pred = y_pred.cpu().numpy()
    y_pred_prob = F.softmax(all_outputs, dim=1).cpu().numpy()
    
    return y_pred, y_pred_prob

In [9]:
filename = '10-million-password-list-top-1000000.txt'

x, _ = read_passwords(filename, 0, 10000)
y = torch.zeros(len(x), 2)
data = CharacterDataset(x, y)
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=False, drop_last = True)
y_pred, y_pred_prob = pytorch_predict(model, data_loader, device)

In [10]:
print(y_pred)
print(y_pred_prob)

[0 1 0 ... 1 1 1]
[[9.9979848e-01 2.0151827e-04]
 [4.3495941e-01 5.6504065e-01]
 [9.9994445e-01 5.5513217e-05]
 ...
 [4.6404362e-01 5.3595638e-01]
 [1.9711031e-01 8.0288965e-01]
 [3.4972817e-01 6.5027183e-01]]


In [13]:
np.mean(y_pred)

0.42357772435897434

In [34]:
from readdata import loaddata

languages = ['English', 'Spanish', 'Dutch', 'Arabic', 'Chinese']
lang_files = {
    'English' : "eng_news_2020_1M-words.txt",
    'Spanish' : "spa_news_2020_1M-words.txt"
}

lang_split = {
    'English' : 1.0,
    #'Spanish' : 0.0
}

pw_filename = "10-million-password-list-top-1000000.txt"
text_filename = "eng_news_2020_1M-words.txt"
comp_nr_lines = 10
nr_lines = 10000

passwords, words, comparison_pw = loaddata(pw_filename, languages, lang_files, lang_split, comp_nr_lines, nr_lines)
all_words = passwords + words

def create_feature_labels(passwords, words):
    return np.concatenate((np.ones(len(passwords)), np.zeros(len(words))), axis=0)

def create_nn_labels(passwords, words):
    return torch.hstack((torch.concatenate((torch.ones((1,len(passwords))), torch.zeros(1,len(words))), axis=0),
                         torch.concatenate((torch.zeros((1,len(passwords))), torch.ones((1,len(words)))), axis=0)))

labels_nn = create_nn_labels(passwords, words)
labels_features = create_feature_labels(passwords, words)

Nr of passwords: 10000
Nr of words: 10000


In [41]:
batch_size-(nr_lines*2)%batch_size

32

In [35]:
x = np.array(all_words).flatten()
y = torch.transpose(labels_nn, 0, 1)
data = CharacterDataset(x, y)
data_loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=False, drop_last = True)
y_pred, y_pred_prob = pytorch_predict(model, data_loader, device)

In [46]:
from sklearn.metrics import f1_score, accuracy_score
slic = -((nr_lines*2)%batch_size)
print(f1_score(y_pred, labels_features[:slic]))
print(accuracy_score(y_pred, labels_features[:slic]))

0.35518644352166434
0.2301181891025641
