In [None]:
#from utils.read_data import Dynamic_Dataset, Processing_Dataset
from securereqnet.utils import Dynamic_Dataset, Processing_Dataset, Embeddings
#all_util

In [None]:
## Testing for Processing_Dataset and Dynamic_Dataset

In [None]:
# Tests if ground truth text document is parsed correctly
# method tested: get_ground_truth()
def test_get_gt():
    """Check that ``get_ground_truth()`` returns the expected mapping for the ``test_gt_good`` fixture."""
    reader = Processing_Dataset("../test/test_gt_good/")
    parsed = reader.get_ground_truth()
    # The fixture is expected to hold exactly one labelled issue.
    assert parsed == {'gitlab_79.txt': '(1,0)'}

#util
test_get_gt()

In [None]:
# Tests if an error is correctly raised if ground truth text
# contains a duplicate
# method tested: get_ground_truth()
def test_get_gt_dup_error():
    """Check that ``get_ground_truth()`` raises ``KeyError`` for the ``test_gt_dup`` fixture."""
    reader = Processing_Dataset("../test/test_gt_dup/")
    try:
        reader.get_ground_truth()
    except KeyError:
        # Expected outcome: the duplicate entry is rejected.
        pass
    else:
        # Reaching this branch means no error was raised.
        assert False

#util
test_get_gt_dup_error()

In [None]:
# Tests if we are able to retrieve the data as labeled
# method tested: __getitem__
def test_dd_get():
    """Check that indexing a ``Dynamic_Dataset`` returns the expected ``(label, raw bytes)`` pairs.

    Items 0 and 1 of the dataset built from the ``test_gt_multiple`` ground
    truth are compared against hard-coded expectations.
    """
    path = "../test/test_gt_multiple/"
    process_unit = Processing_Dataset(path)
    ground_truth = process_unit.get_ground_truth()
    dataset = Dynamic_Dataset(ground_truth, "../test/", True)
    # Expected first item: label string plus the issue body as bytes.
    expected = ('(1,0)', b'The currently used Rails version, in the stable branch, is insecure\n\nYou should update the Gemfile.lock to hotfix this.\n\nhttp://weblog.rubyonrails.org/2014/2/18/Rails_3_2_17_4_0_3_and_4_1_0_beta2_have_been_released/')
    assert(dataset[0]==expected)
    # NOTE(review): the bytes literal below is physically split across two lines
    # (after "for a given user account. "). This looks like a control character
    # that was turned into a line break when the notebook was exported; as written
    # the literal is unterminated. TODO confirm the exact byte against the fixture
    # file before changing it.
    expected = ('(1,0)', b"'This is a useful security improvement, that I recommend gets integrated into gitlab. It protects users, in the event that their passwords get stolen from other sites, etc. I found a good gem for this: http://rubydoc.info/github/mdp/rotp/master/frames, however, given that it appears Gitlab uses Devise for auth, we should probably use this plugin: https://github.com/wmlele/devise-otp\n\nI intend to submit a Merge Request for this, so I'll outline my design for the system here (in case anyone has feedback/wants to help):\n\n### OTP Strategy\nI'm going with time-based (TOTP). Its requires no storage implications, per-user (other than a 32bit secret key). Time-based keys are very common, Google uses this strategy to protect GMail/Apps customers.\n\n### Database Augmentation\n**NOTE:** Given the existence of devise-otp, this may no longer be necessary.\n\nI will add new table, with a foreign key reference to a `user_id` column,  and `totp_secret` column. The existence of a row implies that this feature is enabled for a user. This table could be enhanced further down the road to support other types of otp strategies, if need be. This would also make future data migrations, in the event of further enhancement, easier to manage.\n\n### UI Augmentation\n#### User Account Settings\nWe'll add a simple checkbox that a user must toggle to enable this feature. Once the checkbox is toggled, a modal will appear, displaying a QR code that the user will then scan with their mobile device, to start generating OTP codes. There will also be a box for the user to provide a newly generated OTP code to verify the service is working properly, for their account. Users will also need the ability to also reset the secret, in case they lose their phone etc.\n\n#### Admin Settings\nWe'll need to allow admins to toggle if this feature is enabled, for a given user account. 
Assumed use case would be to contact an admin to disable OTP codes so you can log back in, re-enable it, and setup a new secret for yourself.\n\n#### Sign In\nOnce the user has provided a proper username/password pair, if the flag is enabled, they will be redirected to a page that asks them to enter an OTP code, before they can proceed into the protected areas of the site.\n\n------\n\n**QUESTION: What would be the best course of action to manage the scenario where a user has lost their phone, and can no longer regenerate OTP codes to access their account? How can we let them back in to reset their OTP secret?** So far, my assumption is that the user would contact their gitlab administrators and they would disable OTP for them. However, one potential issue with this is that the attacker, who may have the user's password, may also have access to their e-mail. This would allow them to ask the administrator to disable OTP, and gain access to their data. Likely the verification protocol for admins should be org-specific, and not in scope of this work. Unsure how gitlab cloud staff wants to manage this, for their users. \n\n**UPDATE:** Its worth noting that using devise-otp provides a list of emergency HTOP recovery tokens that can be used, if we expose that functionality.")
    assert(dataset[1]==expected)
#util
test_dd_get()

In [None]:
# Tests if slicing works on Dynamic_Dataset
def test_dd_slice():
    """Check that slicing a ``Dynamic_Dataset`` from index 1 drops only the first item."""
    labels = Processing_Dataset("../test/test_gt_multiple/").get_ground_truth()
    full = Dynamic_Dataset(labels, "../test/", True)
    assert len(full) == 4
    tail = full[1:]
    assert len(tail) == 3
    # The slice must start at what was item 1 of the full dataset.
    assert tail[0] == full[1]
#util
test_dd_slice()

In [None]:
# Tests if we are only indexing items according to ground truth txt
# method tested: __getitem__
def test_dd_get_error():
    """Check that indexing past the single ground-truth entry raises ``IndexError``."""
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    single = Dynamic_Dataset(labels, "../test/", True)
    try:
        single[1]
    except IndexError:
        # Expected: index 1 is out of range for this fixture.
        pass
    else:
        assert False
#util
test_dd_get_error()

In [None]:
# Tests if we are able to retrieve just the id of the data at specific index
# method tested: get_id()
def test_dd_get_id():
    """Check that ``get_id(0)`` returns the file name of the first ground-truth entry."""
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    single = Dynamic_Dataset(labels, "../test/", True)
    assert single.get_id(0) == 'gitlab_79.txt'
#util
test_dd_get_id()

In [None]:
# Tests if we are only indexing items according to ground_truth
# method tested: get_id()
def test_dd_get_id_error():
    """Check that ``get_id`` raises ``IndexError`` for an index beyond the ground truth."""
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    single = Dynamic_Dataset(labels, "../test/", True)
    try:
        single.get_id(1)
    except IndexError:
        # Expected: index 1 is out of range for this fixture.
        pass
    else:
        assert False
#util
test_dd_get_id_error()

In [None]:
# Tests if the length method is properly implemented
# method tested: len()
def test_dd_len():
    """Check ``len()`` of a ``Dynamic_Dataset`` against the expected size of two fixtures."""
    cases = (
        ("../test/test_gt_good/", 1),
        ("../test/test_gt_multiple/", 4),
    )
    for fixture_dir, expected_len in cases:
        labels = Processing_Dataset(fixture_dir).get_ground_truth()
        built = Dynamic_Dataset(labels, "../test/", True)
        assert len(built) == expected_len
#util
test_dd_len()

In [None]:
# Tests if iteration is properly implemented
# method tested: __iter__
def test_dd_iter():
    """Check that iterating a ``Dynamic_Dataset`` yields the same items as indexing it."""
    labels = Processing_Dataset("../test/test_gt_multiple/").get_ground_truth()
    dataset = Dynamic_Dataset(labels, "../test/", True)
    # Reference sequence; relies on len() and indexing being correct.
    expected = [dataset[position] for position in range(len(dataset))]
    actual = [item for item in dataset]
    assert expected == actual
#util
test_dd_iter()

In [None]:
# Tests if Dynamic_Dataset is immutable
# method tested: __setitem__
def test_dd_set_item_error():
    """Check that assigning to an item of a ``Dynamic_Dataset`` raises ``ValueError``."""
    labels = Processing_Dataset("../test/test_gt_good/").get_ground_truth()
    frozen = Dynamic_Dataset(labels, "../test/", True)
    try:
        frozen[0] = "asdf"
    except ValueError:
        # Expected: item assignment is rejected.
        pass
    else:
        assert False
#util
test_dd_set_item_error()

In [None]:
# Tests if an error is correctly raised if malformed
# data is detected in document
# method tested: get_test_and_training
def test_get_test_training_value_error():
    """Check that the ``test_gt_bad`` fixture triggers a ``ValueError``.

    The error may come from either ``get_ground_truth()`` or
    ``get_test_and_training()``; both calls sit inside the guarded region.
    """
    reader = Processing_Dataset("../test/test_gt_bad/")
    try:
        labels = reader.get_ground_truth()
        reader.get_test_and_training(labels)
    except ValueError:
        pass
    else:
        assert False
#util
test_get_test_training_value_error()

In [None]:
# Tests if an error is raised when the requested issue file does not exist
# method tested: get_issue(filename)

def test_get_issue_bad():
    """Check that ``get_issue`` raises ``FileNotFoundError`` for the name ``"test"``."""
    reader = Processing_Dataset("../test/test_gt_bad/")
    try:
        reader.get_issue("test")
    except FileNotFoundError:
        # Expected: no such issue file in the fixture directory.
        pass
    else:
        assert False
#util
test_get_issue_bad()

In [None]:
# Tests if we can get contents of an issue
# method tested: get_issue(filename)

def test_get_issue():
    """Check that ``get_issue`` can read ``full_ground_truth.txt`` without ``FileNotFoundError``."""
    reader = Processing_Dataset("../test/test_gt_good/")
    try:
        reader.get_issue("full_ground_truth.txt")
    except FileNotFoundError:
        # The file is part of the fixture, so a missing-file error is a failure.
        assert False
#util
test_get_issue()

In [None]:
# Tests if the split produced for a requested test ratio
# has (approximately) that ratio
# method tested: get_test_and_training
def test_get_train_test_split():
    """Check that ``get_test_and_training`` honours the requested ``test_ratio`` within 0.02.

    The split is requested for ratios 0.1 and 0.5 and the size fraction of the
    first returned collection is compared against the requested ratio.
    """
    reader = Processing_Dataset("../data/augmented_dataset/")
    labels = reader.get_ground_truth()
    for ratio in (0.1, 0.5):
        # NOTE(review): the fraction measured here belongs to the FIRST returned
        # collection, which was previously bound to the name `train` while being
        # compared against `test_ratio` - confirm the return order of
        # get_test_and_training.
        first, second = reader.get_test_and_training(labels, test_ratio = ratio, isZip = True)
        measured = len(first)/(len(second)+len(first))

        assert ratio-.02 <= measured <= ratio+0.02
#util
test_get_train_test_split()

In [None]:
# Test vectorizing 
import numpy as np

# set up for testing: module-level fixtures shared by the vectorize tests
# Embeddings instance whose vectorize() method is exercised by the tests.
embeddings = Embeddings()
embed_path = '../test/test_embeddings/test_embeddings.csv'
# Lookup passed to vectorize(); presumably maps words to vectors - verify
# against Embeddings.get_embeddings_dict.
embeddings_dict = embeddings.get_embeddings_dict(embed_path)

# Expected embedding (20 components) that the tests associate with "user".
user_vector = [-4.6527834,
                1.1271917,
                -5.386773,
                -2.9345105,
                12.707992,
                -3.5409136,
                2.0961823,
                -0.42728585,
                -1.3112166,
                0.34084892,
                -6.431007,
                0.106831096,
                1.8986382,
                -2.3929365,
                2.5768406,
                2.744601,
                -1.8507555,
                0.09059698,
                -0.2394328,
                0.66318494]

# Expected embedding (20 components) that the tests associate with "use".
use_vector = [-1.38487,
                4.447382,
                -0.97873485,
                2.3377173,
                4.7804713,
                -2.8270018,
                0.26988912,
                2.7355337,
                0.5191395,
                1.0389539,
                -1.2465893,
                0.13766454,
                1.3388132,
                -3.7388134,
                1.8178437,
                -1.1611614,
                3.4868627,
                -0.4555853,
                1.4885712,
                20.297823]

# Expected embedding (20 components) that the tests associate with "version".
version_vector = [-7.0424953,
                7.978198,
                -1.5168871,
                -6.562944,
                19.298594,
                1.7695183,
                2.5408025,
                -3.7058382,
                -0.82634467,
                -4.577317,
                -2.4452372,
                -5.119848,
                1.5269793,
                1.1844287,
                0.2566476,
                -4.926136,
                -3.5850575,
                -3.8257978,
                2.7975578,
                6.4273405]

In [None]:
# tests if we can map the word user to its vector from the dictionary
def test_vectorize_one_word_user():
    """Check that vectorizing ``"user"`` matches the expected ``user_vector`` row."""
    expected = np.array([user_vector])
    actual = embeddings.vectorize("user", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_one_word_user()

In [None]:
# tests if we can clean the word users then map to user vector from the dictionary
def test_vectorize_one_word_users_cleaned_to_user():
    """Check that vectorizing ``"users"`` yields the row expected for ``user``."""
    expected = np.array([user_vector])
    actual = embeddings.vectorize("users", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_one_word_users_cleaned_to_user()

In [None]:
# tests if we can clean the word use then map to use vector from the dictionary
def test_vectorize_one_word_use():
    """Check that vectorizing ``"use"`` matches the expected ``use_vector`` row."""
    expected = np.array([use_vector])
    actual = embeddings.vectorize("use", embeddings_dict)
    assert np.allclose(actual, expected)

test_vectorize_one_word_use()

In [None]:
# tests if we can clean the word uses then map to use vector from the dictionary
def test_vectorize_one_word_uses_cleaned_to_use():
    """Check that vectorizing ``"uses"`` yields the row expected for ``use``."""
    expected = np.array([use_vector])
    actual = embeddings.vectorize("uses", embeddings_dict)
    assert np.allclose(actual, expected)

test_vectorize_one_word_uses_cleaned_to_use()

In [None]:
# tests if we can clean the word used then map to use vector from the dictionary
def test_vectorize_one_word_used_cleaned_to_use():
    """Check that vectorizing ``"used"`` yields the row expected for ``use``."""
    expected = np.array([use_vector])
    actual = embeddings.vectorize("used", embeddings_dict)
    assert np.allclose(actual, expected)

test_vectorize_one_word_used_cleaned_to_use()

In [None]:
# tests if we can map to version vector from the dictionary
def test_vectorize_one_word_version():
    """Check that vectorizing ``"version"`` matches the expected ``version_vector`` row."""
    expected = np.array([version_vector])
    actual = embeddings.vectorize("version", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_one_word_version()

In [None]:
# tests if we can clean the word versions then map to version vector from the dictionary
def test_vectorize_one_word_versions_cleaned_to_version():
    """Check that vectorizing ``"versions"`` yields the row expected for ``version``."""
    expected = np.array([version_vector])
    actual = embeddings.vectorize("versions", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_one_word_versions_cleaned_to_version()

In [None]:
# tests if we can clean the word versioned then map to version vector from the dictionary
def test_vectorize_one_word_versioned_cleaned_to_version():
    """Check that vectorizing ``"versioned"`` yields the row expected for ``version``."""
    expected = np.array([version_vector])
    actual = embeddings.vectorize("versioned", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_one_word_versioned_cleaned_to_version()

In [None]:
# test empty case for vectorization - should be an empty vector
def test_vectorize_no_words():
    """Check that vectorizing an empty sentence produces no embedding values.

    Emptiness is checked through the element count instead of
    ``np.allclose(result, np.array([]))``: ``allclose`` broadcasts its
    operands, so a comparison against a zero-length array is vacuously true
    for a scalar or an ``(n, 1)`` result and would let non-empty output pass.
    """
    sentence = ""
    vectorized = embeddings.vectorize(sentence, embeddings_dict)
    # np.size accepts any array-like and counts elements across all axes.
    assert np.size(vectorized) == 0

test_vectorize_no_words()

In [None]:
# test vectorizing non dictionary words - should return empty vector
def test_vectorize_non_dict_words():
    """Check that a sentence without dictionary words produces no embedding values.

    Emptiness is checked through the element count instead of
    ``np.allclose(result, np.array([]))``: ``allclose`` broadcasts its
    operands, so a comparison against a zero-length array is vacuously true
    for a scalar or an ``(n, 1)`` result and would let non-empty output pass.
    """
    sentence = "donuts are tasty"
    vectorized = embeddings.vectorize(sentence, embeddings_dict)
    # np.size accepts any array-like and counts elements across all axes.
    assert np.size(vectorized) == 0

test_vectorize_non_dict_words()

In [None]:
# test vectorizing on two words, vector will have last matched word in sentence first
def test_vectorize_two_words_clean():
    """Check ``"user use"``: the expected rows are ``use`` first, then ``user``."""
    expected = np.array([use_vector, user_vector])
    actual = embeddings.vectorize("user use", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_two_words_clean()

In [None]:
# test vectorizing on three words
def test_vectorize_three_words_clean():
    """Check ``"user use version"``: the expected rows are ``version``, ``use``, ``user``."""
    expected = np.array([version_vector, use_vector, user_vector])
    actual = embeddings.vectorize("user use version", embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_three_words_clean()

In [None]:
# test vectorizing on two words mixed with words not in our mapping
def test_vectorize_two_words_mixed():
    """Check a sentence where only ``user`` and ``version`` are expected to contribute rows."""
    text = "user is provided this new version of the product to do things"
    expected = np.array([version_vector, user_vector])
    actual = embeddings.vectorize(text, embeddings_dict)
    assert np.allclose(actual, expected)
    
test_vectorize_two_words_mixed()

In [None]:
# test vectorizing on two words mixed with words not in our mapping and cleaning users to user
def test_vectorize_two_words_mixed_with_cleaning():
    """Check a sentence where ``users`` and ``use`` are expected to yield the ``use`` and ``user`` rows."""
    text = "users will use our product to do things"
    expected = np.array([use_vector, user_vector])
    actual = embeddings.vectorize(text, embeddings_dict)
    assert np.allclose(actual, expected)

test_vectorize_two_words_mixed_with_cleaning()

In [None]:
# test duplicated words
def test_vectorize_one_word_duplicated_three_times():
    """Check that three occurrences of the same word yield three ``user`` rows.

    Because all expected rows are identical, ``np.allclose`` alone would also
    accept a single-row result (it broadcasts its operands). The shape is
    therefore asserted explicitly so that de-duplicated output is detected.
    """
    sentence = "users user user"
    corresponding_vector = np.array([user_vector, user_vector, user_vector])
    vectorized = embeddings.vectorize(sentence, embeddings_dict)
    # Pin the row count first; allclose cannot distinguish 1 row from 3 here.
    assert np.shape(vectorized) == corresponding_vector.shape
    assert np.allclose(vectorized, corresponding_vector)

test_vectorize_one_word_duplicated_three_times()

In [None]:
# three words, users will clean to user and map to user
def test_three_words_mixed_with_cleaning():
    """Check a sentence expected to yield the ``version``, ``use`` and ``user`` rows, in that order."""
    text = "users will use this version of our product"
    expected = np.array([version_vector, use_vector, user_vector])
    actual = embeddings.vectorize(text, embeddings_dict)
    assert np.allclose(actual, expected)
    
test_three_words_mixed_with_cleaning()

In [None]:
# test multiple words, with duplicate words, cleaning, and non dictionary words
def test_multi_words_with_dupes_with_cleaning():
    """Check a two-sentence input with repeated, inflected and unknown words.

    Five rows are expected: ``user``, ``use``, ``version``, ``user``, ``version``.
    """
    # The backslash continuation keeps the second line's leading spaces in the string.
    text = "Our product features several versions that the user can choose from. \
                Depending on which version they choose to use, the user will receive different features."
    expected = np.array([user_vector, use_vector, version_vector, user_vector, version_vector])
    actual = embeddings.vectorize(text, embeddings_dict)
    assert np.allclose(actual, expected)
    
test_multi_words_with_dupes_with_cleaning()