In [1]:
import os
import numpy as np
from sklearn import mixture
import pandas as pd
import re
import scipy.stats as stats
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
def ReadListOfTextFilesFromDirectoryWalk(path):
    """Read all text files from given directory and its sudirectories and return list of text files path"""
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(path):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))
    return files


In [3]:
def CreateListOfTuplesFromFile(files):
#Reading file and create tuple 
    items = []
    for file in files:
        filename=file.split('\\')[-1].split('.')[0]
        f=open(file, "r")
        for line in f:
            words= line.rstrip('\n').split(sep=" ")
            items.append((words[0],words[1],words[2],filename))
    return items

In [75]:
def CreateListFromFile(file):
#Reading file and create tuple 
    items = []
#    iterator = (files,) if not isinstance(files, (tuple, list)) else files
#    for file in iterator:
    filename=file.split('\\')[-1].split('.')[0]
    f=open(file, "r")
    for line in f:
        words= line.rstrip('\n').split(sep=" ")
        items.append((words[0],words[1],words[2],filename))
    f.close()
    return items

In [4]:
#Creating dataframes
def CreateDataFrame(items,columns):
    """Taking list of tuples and return dataframe"""
    df = pd.DataFrame.from_records(items, columns=['Key', 'EventType','Time','User'])
    return df

In [5]:
def ParseAlphabetsKeys(df):
    """Return only alphabets records"""
    dfAlphabets=df[df["Key"].str.match('^.*[A-Z]$')]
    return dfAlphabets

In [6]:
def GetTimeDifferenceofKeyUpDown(data):
    prev=0
    rows=[]
    for index,row in data.iterrows():
        if prev==0 and row[1]=="KeyDown":
            prevRow=row
            prev=1
        elif row[1]=="KeyDown":
            rows.append([ prevRow[0], row[0], int(row[2])- int(prevRow[2]),row[3]])
            prevRow=row
            
        keyDownsList = []
        
    rows=[]
    for row in data:
        if row[1]=="KeyDown":
            keyDownsList.append(row)
        elif row[1]=="KeyUp":
            i=0
            for keyDown in keyDownsList:
                if row[0]==keyDown[0]:
                    rows.append([ keyDown[0], int(row[2])- int(keyDown[2]),row[3]])
                    del keyDownsList[i]
                    break
                i = i + 1
    return rows
    return rows

In [7]:
def GetTimeDifferenceofKeyDownDown(data):
    prev=0
    rows=[]
    for index,row in data.iterrows():
        if prev==0 and row[1]=="KeyDown":
            prevRow=row
            prev=1
        elif row[1]=="KeyDown":
            rows.append([ prevRow[0], row[0], int(row[2])- int(prevRow[2]),row[3]])
            prevRow=row
    return rows

In [8]:
def GetDictionaryOfFeatureVectors(keyDownDownTimeDifference):
    """
    Take List of arrays and return dictionary of 26*26 vector for each user
    """
    dictFV=dict()
    for xi in keyDownDownTimeDifference:
        if xi[3] not in dictFV:
            dictFV[xi[3]]=np.zeros(26*26,dtype=object)
            
        index=(ord(xi[0])-65)*26+np.absolute(ord(xi[1])-65)
        if dictFV[xi[3]][index]==0:
            dictFV[xi[3]][index]=[]
        if xi[2] < 1000:
            dictFV[xi[3]][index].append(xi[2])
    return dictFV

In [9]:
def GetGMMFormatDataFromDataFrame(usersFV):
    GMMData=[]
    for user in usersFV.columns:
        maxLength=0
        for userF in usersFV[user]:
            if userF==0:
                userF=[]
            featureLenght=len(userF)
            if featureLenght > maxLength:
                maxLength=featureLenght
        userDF=pd.DataFrame(index=range(0,676))
        userData=np.zeros((maxLength,676))
        row=0
        for userF in usersFV[user]:
            column=0
            if userF==0:
                userF=[]
            availableValuesCount=len(userF)
            sum=0
            for value in userF:
                userData[column][row]=value
                column=column+1
                sum=sum+value
            if availableValuesCount==0:
                mean=0
            else:
                mean=sum/availableValuesCount
            gussianValues=np.random.normal(mean,3,maxLength-availableValuesCount)
            for value in gussianValues:
            
                userData[column][row]=value
                column=column+1
        
            row=row+1
        print(userData.shape)
        if len(GMMData)==0:
            GMMData=userData
        else:
            GMMData=np.append(GMMData,userData,axis=0)
    return GMMData


In [10]:
def GetAverageFeatureVectors(FeatureVectors):
    
    averageFV= np.zeros(26*26,dtype=object)
    index=0
    for xi in FeatureVectors:
        tempSum=np.array(xi).sum()
        if type(xi)== list:
            averageFV[index]=tempSum/len(xi)
        index=index+1
    return averageFV

In [11]:
def TrainModel(path):
    """Get path of data files and train a GMM model"""
    files=ReadListOfTextFilesFromDirectoryWalk(path)
    # I am training on first 10 users
    dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
    #print(dataFileFormat[0])
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
    usersFV=pd.DataFrame.from_dict(FeatureVectors)
    data= GetGMMFormatDataFromDataFrame(usersFV)
    g = mixture.GMM(n_components=3)
    print(len(data))
    g.fit(data)
    return g
    

In [12]:
def TestModel(path,g):
    """Get path of data files and test a GMM model"""
    GMMData=[]
    files=ReadListOfTextFilesFromDirectoryWalk(path)
    dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
    usersFV=pd.DataFrame.from_dict(FeatureVectors)
    for user in FeatureVectors:
        print(user)
        data=GetAverageFeatureVectors(FeatureVectors[user])
        dataNP=np.array(data,dtype=np.float64)
        dataNP[np.isnan(dataNP)]=0
        dataNP=dataNP.reshape((1,676))
        
        
        if len(GMMData)==0:
            GMMData=dataNP
        else:
            GMMData=np.append(GMMData,dataNP,axis=0)
    print(GMMData.shape)
    print(len(GMMData))
    
    print(g.predict_proba(GMMData))
    return g.predict(GMMData)

In [13]:
GMM= TrainModel("..\\UB_keystroke_dataset\\S0")
GMM.weights_

(65, 676)
(65, 676)
(65, 676)
195




array([0.33333333, 0.33333333, 0.33333333])

In [14]:
predictions=TestModel("..\\UB_keystroke_dataset\\S1",GMM)

001100
002100
003100
(3, 676)
3
[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


  


In [15]:
predictions

array([1, 1, 0], dtype=int64)

In [16]:
def GetKDEFormatDataFromDataFrame(usersFV):
    GMMData=[]
    TargetUser=[]
    for user in usersFV.columns:
        print(user)
        maxLength=0
        for userF in usersFV[user]:
            if userF==0:
                userF=[]
            featureLenght=len(userF)
            if featureLenght > maxLength:
                maxLength=featureLenght
        userDF=pd.DataFrame(index=range(0,676))
        userData=np.zeros((maxLength,676))
        a=np.empty(maxLength)
        a.fill(user)
        if len(TargetUser) < 1:
            TargetUser=a
        else:
            TargetUser=np.concatenate((TargetUser,a))
        row=0
        for userF in usersFV[user]:
            column=0
            if userF==0:
                userF=[]
            availableValuesCount=len(userF)
            sum=0
            for value in userF:
                userData[column][row]=value
                column=column+1
                sum=sum+value
            if availableValuesCount==0:
                mean=0
            else:
                mean=sum/availableValuesCount
            gussianValues=np.random.normal(mean,3,maxLength-availableValuesCount)
            for value in gussianValues:
            
                userData[column][row]=value
                column=column+1
        
            row=row+1
        print(userData.shape)
        if len(GMMData)==0:
            GMMData=userData
        else:
            GMMData=np.append(GMMData,userData,axis=0)
            
        
    return GMMData,TargetUser

In [17]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity
from statsmodels.nonparametric.kernel_density import KDEMultivariate

class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE
    
    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel
        
    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self
        
    def predict_proba(self, X):
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(1, keepdims=True)
        
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), 1)]

In [18]:
files=ReadListOfTextFilesFromDirectoryWalk("..\\UB_keystroke_dataset\\S1")
# I am training on first 10 users
dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
#print(dataFileFormat[0])
df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
alphabetsDF=ParseAlphabetsKeys(df)
keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
usersFV=pd.DataFrame.from_dict(FeatureVectors)
X,y=GetKDEFormatDataFromDataFrame(usersFV)
KDE=KDEClassifier()
KDE.fit(X,y)
KDE.predict_proba(X)


001100
(62, 676)
002100
(57, 676)
003100
(57, 676)


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [19]:
files=ReadListOfTextFilesFromDirectoryWalk("..\\UB_keystroke_dataset\\S1")
# I am training on first 10 users
dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
#print(dataFileFormat[0])
df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
alphabetsDF=ParseAlphabetsKeys(df)
keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
usersFV=pd.DataFrame.from_dict(FeatureVectors)
X_test,y=GetKDEFormatDataFromDataFrame(usersFV)
KDE.predict(X_test)


001100
(62, 676)
002100
(57, 676)
003100
(57, 676)




array([1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100.,

In [97]:
def GetDictOfDiagraphs(keyDownDownTimeDifference):
    feature_vec = dict()
    for key_DD_time in keyDownDownTimeDifference:
        key_DD = key_DD_time[0] + key_DD_time[1]
        if key_DD not in feature_vec:
            feature_vec[key_DD] = []
        feature_vec[key_DD].append(key_DD_time[2])
    return feature_vec

In [98]:
def GetDictOfGMMs(feature_vec):
    dict_gmms = dict()
    for key, value in feature_vec.items():
        #print (key + " - " + str(len(value)))
        if len(value) > 9:
            #train GMM
            #FeatureGMMdic[key] = GMM Mode
            gmm = mixture.GaussianMixture(n_components=1, covariance_type='spherical')

            gmm.fit(np.array(value).reshape(-1,1))
            dict_gmms[key] = gmm
    return dict_gmms


In [99]:
def TrainModelforEachDiagraph(user):
    dataFileFormat=CreateListFromFile(user)
    #print(dataFileFormat[0])
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    
    return GetDictOfGMMs(GetDictOfDiagraphs(keyDownDownTimeDifference))

In [126]:
def TestModelforEachDiagraph(user, dict_gmms):
    dataFileFormat=CreateListFromFile(user)
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    #print (GetDictOfDiagraphs(keyDownDownTimeDifference))
    for key, value in GetDictOfDiagraphs(keyDownDownTimeDifference).items():
        if key in dict_gmms and len(value) > 9:
            print(dict_gmms[key].score(np.array(value).reshape(-1,1)))

In [111]:
path = "..\\UB_keystroke_dataset\\S0"
users=ReadListOfTextFilesFromDirectoryWalk(path)

diagraps_gmms = TrainModelforEachDiagraph(users[0])
#diagraps_gmms

In [128]:
TestModelforEachDiagraph(users[6],diagraps_gmms)

-104.23862759247639
-6.760980977428194
-6.8174054099162245
-7.193345086861418
-6.903618620738496
-7.576416676409476
-6.926767302998269
-7.214633397714556
-6.3478029449399305
-7.247702955179273
-6.380801912206913
-5.8300183883013315
-7.622594942359484
-5.87642620020031
-5.364325150006197
-6.149095329464601
-5.4964308365100525
-6.1898654734225165
-6.856193049900927
-6.178947282329126
-6.960858808095388
-6.632949026851759
-6.720055891465157
-6.910542356964444
-7.644731207881364
-5.490434900343557
-10.093168743625723
-8.96632051388883
-11.017851765571965
-5.796038927700559
-7.17292003303358
-31.257288246598392
-7.605045439708615
-5.870773174970111
-13.181258784678333
-5.646672643012161
-7.18025078139989
-21.169112182298033
-7.233822232462447
-5.479294232246047
-7.3956916322255655
-6.3190571566138765
-7.306617315213329
-90.57369880944525
-7.840823151823784
-12.00979880811829
-6.144023583122098
-28.262586564537305
-7.350894547226071
-6.936640270194664
-6.513974942067512
-7.626217551705693
-7