In [31]:
%run utils.ipynb 
%run models.ipynb

In [6]:
# Sample paragraphs, keyed by text-area selector strings.
input_paras = dict([
    ('#text-area-1', "This is a paragraph that uses every single letter in the alphabet. Now, that does not mean this can be a paragraph with no story, but it does mean that every single letter is used. You can make it as generic or fanciful as you would like. You can talk about anything from quilts to jets to xylophones. Oh yeah, and you can use whatever language you want, from Afrikaans to Zulu."),
    ('#text-area-2', "One day, a zebra found a xylophone on the sidewalk. He quickly ran over, picked it up, and gave it to his pet mule. Just then, he found another xylophone. He kept that one for himself."),
    ('#text-area-3', "This is me trying to create a pangram for myself. A pangram is a sentence that contains all the characters from A-Z. Experimenting around such sentences might be useful for our project."),
    ('#text-area-test', "Well now that you are here already let me ask you this, do you think that the above paragraph is a pangram?"),
])

# Combined character count across all paragraphs (shown as the cell output).
sum(len(paragraph) for paragraph in input_paras.values())

854

## Initialize 

- File paths and the respective filename patterns used to read the set of all **input files**
- Which platforms to ignore, the final set of feature columns, and the row-count threshold below which a user's data is ignored

In [7]:
# Input locations (glob style) mapped to the pattern used when reading files;
# both locations share the same pattern.
path_pattern = dict.fromkeys(
    ('../../git/KeystrokeDynamics/data/*', '../Data/*'),
    r'dump\d+_\d+',
)

# Platform strings to ignore.
known_invalid_platforms = [
    'Linux aarch64',
    'Linux armv8l',
]

# Columns kept in the final feature frame, in this order.
features_final = [
    'name',
    'device_type',
    'false_character',
    'hold_for',
    'key_pressed',
    'long_pressed_equivalent',
    'platform',
    'pressed_after',
    'type_combination',
    'effort',
    'speed',
]

# Threshold on the amount of data collected per user; handed to get_valid_dataset.
user_count_threshold = 600

In [8]:
# Build the dataset from the configured paths/patterns, passing along the
# platforms to ignore and the per-user threshold (helper comes from the %run notebooks).
df = get_valid_dataset(path_pattern, known_invalid_platforms, user_count_threshold)
# Emits the "Data at a high level" summary printed below the cell.
print_data_summary(df)
# Rebinds `df`; per the printed log, users with several user_ids are reduced
# to their "most significant instance".
df = get_unique_users_subset(df)
# Last expression: the shape of the resulting frame is shown as cell output.
df.shape

******** Data at a high level ********
Number of users              :  21
Unique user_ids              :  23
Avg rows collected/user_id   :  936.96
Rows with false characters   : 33.15%
Avg long_pressed_equivalent  :  1.12
Rows with long_pressed > 1   : 10.16 %
*********************************************
*** 1 users with multiple user_ids, taking most significant instance for each user


(19871, 13)

## Prepare & Derive Features for Dataset

In [9]:
# Row filter: key_pressed is exactly one character long ...
cond1 = df['key_pressed'].str.len().eq(1)
# ... and long_pressed_equivalent is exactly 1.
cond2 = df['long_pressed_equivalent'].eq(1)
# Both must hold for a row to qualify.
conditions = cond1 & cond2
# Rebinds `df` to the prepared frame (prints the raw vs prepared shapes).
df = prepare_data(df, conditions)

********* DATA PREPARATION FOR FEATURE ENGINEERING *********
	RAW VS PREPARED SHAPES
(19871, 13)		(16280, 14)

----------
	CHARACTER TYPE DISTRIBUTION
          type
lower  49.057
upper  26.415
other  24.528
----------
	FALSE CHARACTER DISTRIBUTION
    false_character
f           65.743
t           34.257
*************************************************************


In [10]:
# Pair every prepared row with the row that follows it, then let
# combine_characters_v1 turn each pair into one two-character feature row.
# NOTE(review): pairs are formed over the whole frame, so the last row of one
# user can be paired with the first row of the next — confirm the helper
# handles or discards such pairs.
zipped_rows_2 = list(zip(df.values[:-1], df.values[1:]))
feature_df_2 = combine_characters_v1(zipped_rows_2, df.columns)

# Three-character variant, currently disabled. It is kept as `#` comments
# rather than a bare triple-quoted string: a string literal as the last
# expression of a cell is evaluated and echoed as the cell's output.
# `df.columns` is passed to match the two-character call above.
# zipped_rows_3 = list(zip(df.values[:-2], df.values[1:-1], df.values[2:]))
# feature_df_3 = combine_characters_v1(zipped_rows_3, df.columns)

100%|██████████| 16279/16279 [01:20<00:00, 203.02it/s]


****** Skipped 104 rows as `speed` evaluates to `inf`


'\nzipped_rows_3 = list(zip(df.values[:-2], df.values[1:-1], df.values[2:]))\nfeature_df_3 = combine_characters_v1(zipped_rows_3)\n'

In [15]:
# Distinct names vs distinct user_ids in the feature frame, as a (names, user_ids) tuple.
tuple(len(feature_df_2[column].unique()) for column in ('name', 'user_id'))

(21, 21)

In [16]:
# Rebind `df` to the feature frame restricted to the columns listed in features_final.
df = feature_df_2.loc[:, features_final]

In [17]:
# Disabled export of `df` to CSV (without the index); uncomment to write the file.
# df.to_csv('../Data/fe_c2_04_12_2020_19_25.csv', index=False)

In [18]:
# Preview the first two feature rows.
# NOTE(review): the rendered rows include the `name` column — check the output before sharing the notebook.
df.head(n=2)

Unnamed: 0,name,device_type,false_character,hold_for,key_pressed,long_pressed_equivalent,platform,pressed_after,type_combination,effort,speed
0,Varun Sapre,Computer/Laptop,ff,88,th,2,Win32,158,ll,168,0.15
0,Varun Sapre,Computer/Laptop,ff,97,hi,2,Win32,103,ll,120,0.0435


## Model Running

In [41]:
# Independent (deep) copy of the feature frame for the modelling section.
final_df = df.copy(deep=True)

Instantiate two models, each with a stratified train/test split: one **unbalanced** and one **balanced**

In [42]:
# Both instances get their own copy of the data and the same 0.2 argument;
# only the last flag differs (False -> unbalanced split, True -> balanced split,
# per the messages printed on construction). Built in that order.
m, m_b = (Models(final_df.copy(), 0.2, balanced) for balanced in (False, True))

** Non-numeric columns encoded
** Unbalanced stratified per class train_test split
(12659, 10) (3165, 10) (12659,) (3165,)
** Non-numeric columns encoded
** Balanced stratified per class train_test split
(5476, 10) (1370, 10) (5476,) (1370,)


In [43]:
# Unbalanced split. NB() returns a pair, unpacked here as (model, predictions);
# presumably a Naive Bayes classifier — defined in models.ipynb, confirm there.
gnb, y_pred = m.NB()
# Last expression: the score for these predictions is shown as the cell output.
m.accuracy_score(y_pred)

0.37

In [44]:
# Disabled: confusion matrix for `y_pred` (at this point, the predictions of the preceding m.NB() cell).
#m.confusion_matrix(y_pred)

In [45]:
# Same NB() run as for `m`, but on the balanced-split instance `m_b`.
gnb_b, y_pred_b = m_b.NB()
# Last expression: the score for the balanced-split predictions is shown as the cell output.
m_b.accuracy_score(y_pred_b)

0.36

In [46]:
# Unbalanced split. LR() returns a pair, unpacked as (model, predictions);
# presumably logistic regression — confirm in models.ipynb.
# Note: this rebinds `y_pred`, replacing the predictions from the m.NB() cell.
logReg, y_pred = m.LR()
# Last expression: the score for these predictions is shown as the cell output.
m.accuracy_score(y_pred)

0.14

In [47]:
# Same LR() run on the balanced-split instance `m_b`.
# Note: this rebinds `y_pred_b`, replacing the predictions from the m_b.NB() cell.
logReg_b, y_pred_b = m_b.LR()
# Last expression: the score for the balanced-split predictions is shown as the cell output.
m_b.accuracy_score(y_pred_b)

0.12

In [48]:
# Unbalanced split. RF() returns a pair, unpacked as (model, predictions);
# presumably a random forest — the log below reports 100 parallel tasks plus
# average node count and depth. Confirm in models.ipynb.
# Note: this rebinds `y_pred`, replacing the predictions from the m.LR() cell.
rf, y_pred = m.RF()
# Last expression: the score for these predictions is shown as the cell output.
m.accuracy_score(y_pred)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.4s


Average number of nodes 8073
Average maximum depth 32


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


0.59

In [49]:
# Same RF() run on the balanced-split instance `m_b`.
# Note: this rebinds `y_pred_b`, replacing the predictions from the m_b.LR() cell.
rf_b, y_pred_b = m_b.RF()
# Last expression: the score for the balanced-split predictions is shown as the cell output.
m_b.accuracy_score(y_pred_b)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


Average number of nodes 3768
Average maximum depth 27


0.5