In [None]:
#from utils.read_data import Dynamic_Dataset, Processing_Dataset
%run 09_utils.ipynb
from utils.vectorize_sentence import Embeddings
#all_util

<h2 id="Dynamic_Dataset" class="doc_header"><code>class</code> <code>Dynamic_Dataset</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>Dynamic_Dataset</code>(**`ground_truth`**, **`path`**, **`isZip`**)

This class efficiently 'stores' a dataset. Only a list of filenames and
mappings to their ground truth values are stored in memory. The file
contents are only brought into memory when requested.

This class supports indexing, slicing, and iteration.

A user can treat an instance of this class exactly as they would a list.
Indexing an instance of this class will return a tuple consisting of
the ground truth value and the file content of the filename at that index.

A user can request the filename at an index with get_id(index)

Example:

        dataset = Dynamic_Dataset(ground_truth, path, isZip)

        print(dataset.get_id(0))
                -> gitlab_79.txt

        print(dataset[0])
                -> ('(1,0)', 'The currently used Rails version, in the stable ...

        for x in dataset[2:4]:
                print(x)
                        -> ('(1,0)', "'In my attempt to add 2 factor authentication ...
                        -> ('(1,0)', 'We just had an admin accidentally push to a ...

In [None]:
## Testing for Processing_Dataset and Dynamic_Dataset

In [None]:
def test_get_gt():
    """Verify that the ground-truth text document is parsed correctly.

    Method under test: ``get_ground_truth()``.
    The fixture directory is expected to produce a single
    filename -> label entry.
    """
    parsed = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    assert parsed == {'gitlab_79.txt': '(1,0)'}

#util
test_get_gt()

In [None]:
def test_get_gt_dup_error():
    """Verify that a duplicate entry in the ground-truth text raises KeyError.

    Method under test: ``get_ground_truth()``.
    Any exception other than KeyError propagates and fails the test.
    """
    loader = Processing_Dataset("../test/test_gt_dup/")
    try:
        loader.get_ground_truth()
    except KeyError:
        # Expected outcome: the duplicate was rejected.
        pass
    else:
        # Parsing succeeded although the fixture contains a duplicate.
        assert False

#util
test_get_gt_dup_error()

In [None]:
# Tests if we are able to retrieve the data as labeled
# method tested: __getitem__
def test_dd_get():
    """Check that integer indexing returns the expected (label, content) tuples.

    The ground truth is read from the ``test_gt_multiple`` fixture and the
    dataset is built with "../test/" and True as its second and third
    arguments. Items 0 and 1 are compared against hard-coded tuples whose
    second element is a bytes literal.
    """
    path = "../test/test_gt_multiple/"
    process_unit = Processing_Dataset(path)
    ground_truth = process_unit.get_ground_truth()
    dataset = Dynamic_Dataset(ground_truth, "../test/", True)
    # Expected value for the first item: label string plus raw bytes content.
    expected = ('(1,0)', b'The currently used Rails version, in the stable branch, is insecure\n\nYou should update the Gemfile.lock to hotfix this.\n\nhttp://weblog.rubyonrails.org/2014/2/18/Rails_3_2_17_4_0_3_and_4_1_0_beta2_have_been_released/')
    assert(dataset[0]==expected)
    # Expected value for the second item.
    # NOTE(review): this literal appears to continue onto a second physical line
    # after "for a given user account. " -- confirm the notebook source keeps it
    # on one line (or escapes the break), otherwise the cell will not parse.
    expected = ('(1,0)', b"'This is a useful security improvement, that I recommend gets integrated into gitlab. It protects users, in the event that their passwords get stolen from other sites, etc. I found a good gem for this: http://rubydoc.info/github/mdp/rotp/master/frames, however, given that it appears Gitlab uses Devise for auth, we should probably use this plugin: https://github.com/wmlele/devise-otp\n\nI intend to submit a Merge Request for this, so I'll outline my design for the system here (in case anyone has feedback/wants to help):\n\n### OTP Strategy\nI'm going with time-based (TOTP). Its requires no storage implications, per-user (other than a 32bit secret key). Time-based keys are very common, Google uses this strategy to protect GMail/Apps customers.\n\n### Database Augmentation\n**NOTE:** Given the existence of devise-otp, this may no longer be necessary.\n\nI will add new table, with a foreign key reference to a `user_id` column,  and `totp_secret` column. The existence of a row implies that this feature is enabled for a user. This table could be enhanced further down the road to support other types of otp strategies, if need be. This would also make future data migrations, in the event of further enhancement, easier to manage.\n\n### UI Augmentation\n#### User Account Settings\nWe'll add a simple checkbox that a user must toggle to enable this feature. Once the checkbox is toggled, a modal will appear, displaying a QR code that the user will then scan with their mobile device, to start generating OTP codes. There will also be a box for the user to provide a newly generated OTP code to verify the service is working properly, for their account. Users will also need the ability to also reset the secret, in case they lose their phone etc.\n\n#### Admin Settings\nWe'll need to allow admins to toggle if this feature is enabled, for a given user account. 
Assumed use case would be to contact an admin to disable OTP codes so you can log back in, re-enable it, and setup a new secret for yourself.\n\n#### Sign In\nOnce the user has provided a proper username/password pair, if the flag is enabled, they will be redirected to a page that asks them to enter an OTP code, before they can proceed into the protected areas of the site.\n\n------\n\n**QUESTION: What would be the best course of action to manage the scenario where a user has lost their phone, and can no longer regenerate OTP codes to access their account? How can we let them back in to reset their OTP secret?** So far, my assumption is that the user would contact their gitlab administrators and they would disable OTP for them. However, one potential issue with this is that the attacker, who may have the user's password, may also have access to their e-mail. This would allow them to ask the administrator to disable OTP, and gain access to their data. Likely the verification protocol for admins should be org-specific, and not in scope of this work. Unsure how gitlab cloud staff wants to manage this, for their users. \n\n**UPDATE:** Its worth noting that using devise-otp provides a list of emergency HTOP recovery tokens that can be used, if we expose that functionality.")
    assert(dataset[1]==expected)
#util
test_dd_get()

In [None]:
# Tests if slicing works on Dynamic_Dataset
# method tested: __getitem__ (slice argument)
def test_dd_slice():
    """Check that slicing a Dynamic_Dataset returns the expected items.

    The ``test_gt_multiple`` fixture holds three entries, so ``dataset[1:]``
    must contain two items that line up with ``dataset[1]`` and
    ``dataset[2]``.
    """
    path = "../test/test_gt_multiple/"
    process_unit = Processing_Dataset(path)
    ground_truth = process_unit.get_ground_truth()
    dataset = Dynamic_Dataset(ground_truth, "../test/", True)
    assert len(dataset) == 3
    sliced = dataset[1:]
    assert len(sliced) == 2
    # dataset[1:] drops the first item, so the slice begins at dataset[1].
    # Comparing against dataset[0] (the dropped item) could never succeed
    # for a correct slice implementation.
    assert sliced[0] == dataset[1]
    assert sliced[1] == dataset[2]
#util
test_dd_slice()

TypeError: __init__() missing 2 required positional arguments: 'path' and 'isZip'

In [None]:
def test_dd_get_error():
    """Verify that indexing past the ground-truth entries raises IndexError.

    Method under test: ``__getitem__``.
    The ``test_gt_good`` fixture is used and index 1 is requested, which is
    expected to be out of range.
    """
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    try:
        dataset[1]
    except IndexError:
        # Expected outcome: the out-of-range index was rejected.
        pass
    else:
        assert False
#util
test_dd_get_error()

In [None]:
def test_dd_get_id():
    """Verify that only the id (filename) of the item at an index can be fetched.

    Method under test: ``get_id()``.
    """
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    assert dataset.get_id(0) == 'gitlab_79.txt'
#util
test_dd_get_id()

In [None]:
def test_dd_get_id_error():
    """Verify that ``get_id()`` raises IndexError for an index beyond the ground truth.

    Method under test: ``get_id()``.
    """
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    try:
        dataset.get_id(1)
    except IndexError:
        # Expected outcome: the out-of-range index was rejected.
        pass
    else:
        assert False
#util
test_dd_get_id_error()

In [None]:
def test_dd_len():
    """Verify that ``len()`` on a Dynamic_Dataset reports the number of entries.

    Method under test: ``__len__``.
    Two fixtures are checked in order: one with a single entry and one with
    three entries.
    """
    fixtures = (
        ("../test/test_gt_good/", 1),
        ("../test/test_gt_multiple/", 3),
    )
    for gt_dir, expected_size in fixtures:
        labels = Processing_Dataset(gt_dir).get_ground_truth()
        dataset = Dynamic_Dataset(labels, "../test/", True)
        assert len(dataset) == expected_size
#util
test_dd_len()

In [None]:
def test_dd_iter():
    """Verify that iterating a Dynamic_Dataset yields the same items as indexing.

    Method under test: ``__iter__``.
    ``len()`` and integer indexing are assumed to be implemented correctly
    and serve as the reference.
    """
    labels = Processing_Dataset("../test/test_gt_multiple/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    # Reference sequence built through indexing.
    by_index = [dataset[pos] for pos in range(len(dataset))]
    # Sequence produced by the iteration protocol.
    by_iteration = [item for item in dataset]
    assert by_index == by_iteration
#util
test_dd_iter()

In [None]:
def test_dd_set_item_error():
    """Verify that a Dynamic_Dataset is immutable.

    Method under test: ``__setitem__``.
    Assigning to an index is expected to raise ValueError.
    """
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    try:
        dataset[0] = "asdf"
    except ValueError:
        # Expected outcome: the assignment was rejected.
        pass
    else:
        assert False
#util
test_dd_set_item_error()

In [None]:
def test_get_test_training_value_error():
    """Verify that malformed data in the ground-truth document raises ValueError.

    Method under test: ``get_test_and_training``.
    Both the parsing call and the split call sit inside the guarded region,
    so a ValueError from either one satisfies the test.
    """
    loader = Processing_Dataset("../test/test_gt_bad/")
    try:
        labels = loader.get_ground_truth()
        loader.get_test_and_training(labels)
    except ValueError:
        # Expected outcome: the malformed data was rejected.
        pass
    else:
        assert False
#util
test_get_test_training_value_error()

In [None]:
## TODO remake once refactoring team fixes this method
def test_get_issue():
    """Smoke-test fetching the contents of an issue.

    Method under test: ``get_issue(filename)``.
    No assertion is made on the result; the call only has to complete.
    """
    Processing_Dataset("../test/test_gt_bad/").get_issue("test")
#util
test_get_issue()

FileNotFoundError: [Errno 2] No such file or directory: 'combined_dataset/issues/test'

In [None]:
# Tests if the test/training split respects the requested test ratio
# method tested: get_test_and_training
def test_get_train_test_split():
    """Check that ``get_test_and_training`` honours the requested ``test_ratio``.

    For each requested ratio the test share is measured as a fraction of the
    whole split (test / (test + training)) and must lie within ``tolerance``
    of the request.
    """
    tolerance = 0.02
    path = "../data/augmented_dataset/"
    process_unit = Processing_Dataset(path)
    ground_truth = process_unit.get_ground_truth()
    for ratio in (0.1, 0.5):
        # NOTE(review): the first returned collection is treated as the test
        # split, as the method name and the recorded run suggest (first/second
        # was ~0.111 for test_ratio=0.1, i.e. 0.1/0.9) -- confirm against
        # get_test_and_training.
        test, train = process_unit.get_test_and_training(ground_truth, test_ratio = ratio, isZip = True)
        total = len(test) + len(train)
        assert total > 0, "get_test_and_training returned no data"
        # Measure against the whole split; dividing one split by the other
        # gives 1.0 for a 50/50 split and can never match test_ratio=0.5.
        actual_ratio = len(test) / total
        assert abs(actual_ratio - ratio) <= tolerance, (
            "requested test_ratio=%s but measured %s" % (ratio, actual_ratio)
        )
#util
test_get_train_test_split()

0.11110898478614487
0.9999655540628983


AssertionError: 