# Setup
[Required] Download the pre-trained models and put them in './keras_models'.
You should have something like this:
```
keras_models
├── baseline
│   └── password_model.h5
├── UNCM_medium
│   ├── conf_encoder.h5
│   └── password_model.h5
└── UNCM_medium_8096con_2048pm
    ├── conf_encoder.h5
    └── password_model.h5
```

In [1]:
import os

import sys, importlib, pprint
import tensorflow as tf
import numpy as np

from input_pipeline import make_dataset
from tester import Tester
from inference import ancestral_sampling


# Run name of the pre-trained UNCM (presumably the matching sub-directory of
# './keras_models') and the dotted path of its configuration module.
name_run = 'UNCM_medium_8096con_2048pm'
setup_module = 'confings.UNCM_medium_8096con_2048pm'

# The configuration module exposes its hyper-parameters as `hparams`.
setup = importlib.import_module(setup_module)
hparams = setup.hparams

# The model class stored in the hyper-parameters provides both the input
# function factory and the loader for the pre-trained weights.
uncm_model_class = hparams['model_class']
input_fn = uncm_model_class.make_get_input_tensors(hparams)

# Load the pre-trained UNCM; the loader returns a pair that is unpacked into
# the configuration encoder and the conditional password model.
uncm = uncm_model_class.import_models(hparams, name_run)
conf_encoder_uncm, cpassmodel_uncm = uncm

# Tester bundles both models with the input function; `t` is reused by the
# cells below.
t = Tester(
    conf_encoder_uncm,
    cpassmodel_uncm,
    input_fn,
    hparams,
)



# How to compute probabilities of plaintext passwords

The "compute_probability_from_file" function takes as input the path to a file with the following format:

Each row holds one (email address, password) pair, split into 4 fields separated by '\t':

* email username
* email provider (without '@')
* email top domain (without the initial '.')
* password

For instance, the entry *(dario.pasquini@gmail.com, password123)* becomes:

*"dario.pasquini\tgmail\tcom\tpassword123"*

An example is provided in *examples/fakeleak.txt*.

___

Given the input file, the function automatically computes the seed from the email addresses and then assigns a probability to each password.

In [2]:
# Leak to score: one tab-separated (username, provider, top domain, password)
# row per user, as described above.
path = 'examples/findfriendz.com__NOHASH__Social.txt'

# With return_X=True the call returns a second value next to the
# (passwords, probabilities) pair; it is kept as `seed` (presumably the seed
# computed from the email addresses -- confirm against Tester).
(password, probability), seed = t.compute_probability_from_file(path, return_X=True)

# Rank on the exact probabilities, most likely first, and round only afterwards
# for display. Sorting values that were already rounded to 6 decimals would
# leave every group of passwords that agree to 6 decimals (in particular the
# whole tail that rounds to 0) in input-file order instead of ranked order.
ranked_uncm = sorted(zip(np.asarray(probability), password), key=lambda x: -x[0])
pp_uncm = [(np.round(prob, 6), pwd) for prob, pwd in ranked_uncm]
print(*pp_uncm[:100], sep='\n')

Actual number of users sampled for SEED computation:  8192 

(0.00076, 'password')
(0.000397, 'indian')
(0.000351, 'welcome')
(0.000339, 'qwerty')
(0.000315, 'krishna')
(0.000292, 'chinnu')
(0.000273, 'saibaba')
(0.000272, 'hanuman')
(0.00027, 'pakistan')
(0.000266, 'sairam')
(0.000241, 'iloveyou')
(0.000226, 'aditya')
(0.000224, 'abhishek')
(0.000211, 'jaimatadi')
(0.000209, 'manisha')
(0.000208, 'abc123')
(0.000202, 'sanjay')
(0.000198, 'mahesh')
(0.000197, 'anuradha')
(0.000194, 'loveyou')
(0.00019, 'bharat')
(0.000187, 'computer')
(0.000186, 'sharma')
(0.000184, 'bismillah')
(0.000181, 'information')
(0.000179, 'zxcvbnm')
(0.000177, 'mylove')
(0.000172, 'vikram')
(0.000167, 'london')
(0.000165, 'chirag')
(0.000165, 'mother')
(0.000162, 'sachin')
(0.000158, 'aaaaaa')
(0.000155, 'lovely')
(0.000153, 'karthik')
(0.000152, 'sweety')
(0.000151, 'abcdefgh')
(0.000151, 'prakash')
(0.00015, 'india123')
(0.00015, 'santosh')
(0.000149, 'ganesh')
(0.000149, 'anusha')
(0.000148, 'success')
(0.

# For non-UNCM
The same goes for standard models. The only difference is that the model does not exploit the email addresses to adapt to the target.

In [3]:
# Switch to the baseline (non-UNCM) configuration. `setup_module`, `name_run`,
# `setup` and `input_fn` are rebound here, so from this cell on they refer to
# the baseline rather than to the UNCM loaded earlier.
name_run = 'baseline'
setup_module = 'confings.baseline'

# The configuration module exposes its hyper-parameters as `hparams`.
setup = importlib.import_module(setup_module)
hparams_baseline = setup.hparams

# The model class stored in the hyper-parameters provides both the input
# function factory and the loader for the pre-trained weights.
baseline_model_class = hparams_baseline['model_class']
input_fn = baseline_model_class.make_get_input_tensors(hparams_baseline)

# Load the pre-trained baseline password model; the first element returned by
# the loader is not needed and is discarded.
_, passmodel_baseline = baseline_model_class.import_models(hparams_baseline, name_run)

# The Tester is built with None in place of a configuration encoder.
t_baseline = Tester(
    None,
    passmodel_baseline,
    input_fn,
    hparams_baseline,
)



In [4]:
# Score the same leak with the baseline model so the two rankings are comparable.
path = 'examples/findfriendz.com__NOHASH__Social.txt'

# The second value returned with return_X=True is not used for the baseline.
(password, probability), _ = t_baseline.compute_probability_from_file(path, return_X=True)

# Rank on the exact probabilities, most likely first, and round only afterwards
# for display. Sorting values that were already rounded to 6 decimals would
# leave every group of passwords that agree to 6 decimals (in particular the
# whole tail that rounds to 0) in input-file order instead of ranked order.
ranked_baseline = sorted(zip(np.asarray(probability), password), key=lambda x: -x[0])
pp_baseline = [(np.round(prob, 6), pwd) for prob, pwd in ranked_baseline]
print(*pp_baseline[:100], sep='\n')

(0.000491, 'qwerty')
(0.000275, 'password')
(0.000125, 'azerty')
(0.00012, 'qwerty123')
(0.000109, 'abc123')
(0.000102, 'loulou')
(0.0001, 'daniel')
(9.8e-05, 'r123456')
(9.1e-05, 'india123')
(8.9e-05, 'welcome123')
(8.8e-05, 'iloveyou')
(8.7e-05, 'ferrari')
(8.4e-05, 'abcd1234')
(8.2e-05, 'killer')
(7.9e-05, 'zxcvbnm')
(7.6e-05, 'dragon')
(7.4e-05, 'martin')
(7.3e-05, 'fuckyou')
(7.2e-05, 'alexander')
(6.6e-05, 'aaaaaa')
(6.5e-05, 'master')
(6.5e-05, 'robert')
(6.4e-05, 'samsung')
(6.1e-05, 'computer')
(6.1e-05, 'asdfghjkl')
(5.9e-05, 'thomas')
(5.7e-05, 'freedom')
(5.6e-05, 'junior')
(5.6e-05, 'liverpool')
(5.6e-05, 'family')
(5.6e-05, 'q1w2e3r4')
(5.6e-05, 'welcome')
(5.3e-05, 'michelle')
(5.2e-05, '123qwe')
(5e-05, 'christian')
(4.9e-05, 'princess')
(4.8e-05, 'sunshine')
(4.7e-05, 'anthony')
(4.7e-05, 'qwertyuiop')
(4.7e-05, 'asdf1234')
(4.6e-05, 'rachel')
(4.6e-05, 'victor')
(4.4e-05, 'a123456789')
(4.4e-05, 'arsenal')
(4.2e-05, 'shadow')
(4.2e-05, 'a1b2c3d4')
(3.9e-05, 'flower')


# Comparison of the two rankings

In [5]:
import pandas as pd

# Lift pandas' row-display limit so all 100 rows requested below are rendered.
pd.set_option('display.max_rows', None)

# Side-by-side rankings: row i holds the i-th ranked password of each model.
# Only the password (second tuple element) of every ranked pair is kept.
df = pd.DataFrame({
    'baseline': [pwd for _prob, pwd in pp_baseline],
    'UNCM': [pwd for _prob, pwd in pp_uncm],
})
df.head(100)

Unnamed: 0,baseline,UNCM
0,qwerty,password
1,password,indian
2,azerty,welcome
3,qwerty123,qwerty
4,abc123,krishna
5,loulou,chinnu
6,daniel,saibaba
7,r123456,hanuman
8,india123,pakistan
9,welcome123,sairam
