In [1]:
import os
import numpy as np
from sklearn import mixture
import pandas as pd
import re
import scipy.stats as stats
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
def ReadListOfTextFilesFromDirectoryWalk(path):
    """Read all text files from given directory and its sudirectories and return list of text files path"""
    files = []
    # r=root, d=directories, f = files
    for r, d, f in os.walk(path):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))
    return files


In [3]:
def CreateListOfTuplesFromFile(files):
#Reading file and create tuple 
    items = []
    for file in files:
        filename=file.split('\\')[-1].split('.')[0]
        f=open(file, "r")
        for line in f:
            words= line.rstrip('\n').split(sep=" ")
            items.append((words[0],words[1],words[2],filename))
    return items

In [4]:
#Creating dataframes
def CreateDataFrame(items,columns):
    """Taking list of tuples and return dataframe"""
    df = pd.DataFrame.from_records(items, columns=['Key', 'EventType','Time','User'])
    return df

In [5]:
def ParseAlphabetsKeys(df):
    """Return only alphabets records"""
    dfAlphabets=df[df["Key"].str.match('^.*[A-Z]$')]
    return dfAlphabets

In [6]:
def GetTimeDifferenceofKeyUpDown(data):
    prev=0
    rows=[]
    for index,row in data.iterrows():
        if prev==0 and row[1]=="KeyDown":
            prevRow=row
            prev=1
        elif row[1]=="KeyDown":
            rows.append([ prevRow[0], row[0], int(row[2])- int(prevRow[2]),row[3]])
            prevRow=row
            
        keyDownsList = []
        
    rows=[]
    for row in data:
        if row[1]=="KeyDown":
            keyDownsList.append(row)
        elif row[1]=="KeyUp":
            i=0
            for keyDown in keyDownsList:
                if row[0]==keyDown[0]:
                    rows.append([ keyDown[0], int(row[2])- int(keyDown[2]),row[3]])
                    del keyDownsList[i]
                    break
                i = i + 1
    return rows
    return rows

In [7]:
def GetTimeDifferenceofKeyDownDown(data):
    prev=0
    rows=[]
    for index,row in data.iterrows():
        if prev==0 and row[1]=="KeyDown":
            prevRow=row
            prev=1
        elif row[1]=="KeyDown":
            rows.append([ prevRow[0], row[0], int(row[2])- int(prevRow[2]),row[3]])
            prevRow=row
    return rows

In [8]:
def GetDictionaryOfFeatureVectors(keyDownDownTimeDifference):
    """
    Take List of arrays and return dictionary of 26*26 vector for each user
    """
    dictFV=dict()
    for xi in keyDownDownTimeDifference:
        if xi[3] not in dictFV:
            dictFV[xi[3]]=np.zeros(26*26,dtype=object)
            
        index=(ord(xi[0])-65)*26+np.absolute(ord(xi[1])-65)
        if dictFV[xi[3]][index]==0:
            dictFV[xi[3]][index]=[]
        if xi[2] < 1000:
            dictFV[xi[3]][index].append(xi[2])
    return dictFV

In [9]:
def GetGMMFormatDataFromDataFrame(usersFV):
    GMMData=[]
    for user in usersFV.columns:
        maxLength=0
        for userF in usersFV[user]:
            if userF==0:
                userF=[]
            featureLenght=len(userF)
            if featureLenght > maxLength:
                maxLength=featureLenght
        userDF=pd.DataFrame(index=range(0,676))
        userData=np.zeros((maxLength,676))
        row=0
        for userF in usersFV[user]:
            column=0
            if userF==0:
                userF=[]
            availableValuesCount=len(userF)
            sum=0
            for value in userF:
                userData[column][row]=value
                column=column+1
                sum=sum+value
            if availableValuesCount==0:
                mean=0
            else:
                mean=sum/availableValuesCount
            gussianValues=np.random.normal(mean,3,maxLength-availableValuesCount)
            for value in gussianValues:
            
                userData[column][row]=value
                column=column+1
        
            row=row+1
        print(userData.shape)
        if len(GMMData)==0:
            GMMData=userData
        else:
            GMMData=np.append(GMMData,userData,axis=0)
    return GMMData


In [10]:
def GetAverageFeatureVectors(FeatureVectors):
    
    averageFV= np.zeros(26*26,dtype=object)
    index=0
    for xi in FeatureVectors:
        tempSum=np.array(xi).sum()
        if type(xi)== list:
            averageFV[index]=tempSum/len(xi)
        index=index+1
    return averageFV

In [11]:
def TrainModel(path):
    """Get path of data files and train a GMM model"""
    files=ReadListOfTextFilesFromDirectoryWalk(path)
    # I am training on first 10 users
    dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
    #print(dataFileFormat[0])
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
    usersFV=pd.DataFrame.from_dict(FeatureVectors)
    data= GetGMMFormatDataFromDataFrame(usersFV)
    g = mixture.GMM(n_components=3)
    print(len(data))
    g.fit(data)
    return g
    

In [12]:
def TestModel(path,g):
    """Get path of data files and test a GMM model"""
    GMMData=[]
    files=ReadListOfTextFilesFromDirectoryWalk(path)
    dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
    df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
    
    alphabetsDF=ParseAlphabetsKeys(df)
    keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
    FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
    usersFV=pd.DataFrame.from_dict(FeatureVectors)
    for user in FeatureVectors:
        print(user)
        data=GetAverageFeatureVectors(FeatureVectors[user])
        dataNP=np.array(data,dtype=np.float64)
        dataNP[np.isnan(dataNP)]=0
        dataNP=dataNP.reshape((1,676))
        
        
        if len(GMMData)==0:
            GMMData=dataNP
        else:
            GMMData=np.append(GMMData,dataNP,axis=0)
    print(GMMData.shape)
    print(len(GMMData))
    
    print(g.predict_proba(GMMData))
    return g.predict(GMMData)

In [13]:
GMM= TrainModel("..\\UB_keystroke_dataset\\S0")
GMM.weights_

(65, 676)
(65, 676)
(65, 676)
195




array([0.33333333, 0.33333333, 0.33333333])

In [14]:
predictions=TestModel("..\\UB_keystroke_dataset\\S1",GMM)

001100
002100
003100
(3, 676)
3
[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


  


In [15]:
predictions

array([1, 1, 0], dtype=int64)

In [16]:
def GetKDEFormatDataFromDataFrame(usersFV):
    GMMData=[]
    TargetUser=[]
    for user in usersFV.columns:
        print(user)
        maxLength=0
        for userF in usersFV[user]:
            if userF==0:
                userF=[]
            featureLenght=len(userF)
            if featureLenght > maxLength:
                maxLength=featureLenght
        userDF=pd.DataFrame(index=range(0,676))
        userData=np.zeros((maxLength,676))
        a=np.empty(maxLength)
        a.fill(user)
        if len(TargetUser) < 1:
            TargetUser=a
        else:
            TargetUser=np.concatenate((TargetUser,a))
        row=0
        for userF in usersFV[user]:
            column=0
            if userF==0:
                userF=[]
            availableValuesCount=len(userF)
            sum=0
            for value in userF:
                userData[column][row]=value
                column=column+1
                sum=sum+value
            if availableValuesCount==0:
                mean=0
            else:
                mean=sum/availableValuesCount
            gussianValues=np.random.normal(mean,3,maxLength-availableValuesCount)
            for value in gussianValues:
            
                userData[column][row]=value
                column=column+1
        
            row=row+1
        print(userData.shape)
        if len(GMMData)==0:
            GMMData=userData
        else:
            GMMData=np.append(GMMData,userData,axis=0)
            
        
    return GMMData,TargetUser

In [17]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity
from statsmodels.nonparametric.kernel_density import KDEMultivariate

class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE
    
    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel
        
    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self
        
    def predict_proba(self, X):
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(1, keepdims=True)
        
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), 1)]

In [18]:
files=ReadListOfTextFilesFromDirectoryWalk("..\\UB_keystroke_dataset\\S1")
# I am training on first 10 users
dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
#print(dataFileFormat[0])
df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
alphabetsDF=ParseAlphabetsKeys(df)
keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
usersFV=pd.DataFrame.from_dict(FeatureVectors)
X,y=GetKDEFormatDataFromDataFrame(usersFV)
KDE=KDEClassifier()
KDE.fit(X,y)
KDE.predict_proba(X)


001100
(62, 676)
002100
(57, 676)
003100
(57, 676)


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [19]:
files=ReadListOfTextFilesFromDirectoryWalk("..\\UB_keystroke_dataset\\S1")
# I am training on first 10 users
dataFileFormat=CreateListOfTuplesFromFile(files[0:6:2])
#print(dataFileFormat[0])
df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
alphabetsDF=ParseAlphabetsKeys(df)
keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)
FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
usersFV=pd.DataFrame.from_dict(FeatureVectors)
X_test,y=GetKDEFormatDataFromDataFrame(usersFV)
KDE.predict(X_test)


001100
(62, 676)
002100
(57, 676)
003100
(57, 676)




array([1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100., 1100.,
       1100., 1100.,

In [22]:
path = "..\\UB_keystroke_dataset\\S0"
files=ReadListOfTextFilesFromDirectoryWalk(path)
# I am training on first 10 users
dataFileFormat=CreateListOfTuplesFromFile(files[0:1])
#print(dataFileFormat[0])
df=CreateDataFrame(dataFileFormat,columns=['Key', 'EventType','Time','User'])
alphabetsDF=ParseAlphabetsKeys(df)
keyDownDownTimeDifference=GetTimeDifferenceofKeyDownDown(data=alphabetsDF)



In [25]:
FeatureVectors=GetDictionaryOfFeatureVectors(keyDownDownTimeDifference) 
usersFV=pd.DataFrame.from_dict(FeatureVectors)
data= GetGMMFormatDataFromDataFrame(usersFV)
#g = mixture.GMM(n_components=3)
print(len(data))
#g.fit(data)

(65, 676)
65


In [42]:
FeatureVec = dict()
for keyDDTime in keyDownDownTimeDifference:
    keyDD = keyDDTime[0] + keyDDTime[1]
    if keyDD not in FeatureVec:
        FeatureVec[keyDD] = []
    FeatureVec[keyDD].append(keyDDTime[2])
        


In [60]:
count = 0
for key, value in FeatureVec.items():
    #print (key + " - " + str(len(value)))
    print(value)
    if len(value) > 9:
        count = count +1
    
print(count)


[359]
[296, 202, 141, 62, 140]
[359, 1186]
[140, 109, 78, 140, 78, 62, 281, 63]
[141, 62, 63, 187, 93, 63, 47, 78, 32, 47, 63, 78, 47, 109, 109, 47, 62, 62, 63, 109, 94, 31, 78, 62, 94, 46, 47, 31, 46, 32]
[296, 702, 218, 94, 141, 156, 874, 93, 187, 187, 125, 156, 62, 843, 109, 62, 140, 171]
[2980, 889, 421, 874, 249, 343, 375]
[62, 78, 109, 141, 140, 266, 94, 63, 63, 140, 265, 62, 93, 78, 156, 140, 265, 141, 140, 110, 717, 124, 141]
[78, 110, 171, 234, 78, 47, 62, 110, 31, 234, 93, 936, 78, 63, 234, 78, 47, 125, 140, 390, 94, 47, 47, 172, 546, 125, 62, 94, 234, 62, 187, 140, 109, 1575, 920, 62, 47, 1794, 63]
[515, 234, 218, 281, 203, 1061, 203, 265, 203, 281, 203, 234, 203, 218, 561, 203, 250, 219, 203, 218, 187, 203, 203, 172, 234, 218, 234, 203, 187, 187, 203, 2246, 234, 234, 203, 187, 219, 203, 203, 265, 187, 187, 141]
[281, 1404, 219, 249, 561, 390, 1326, 203, 281, 203, 218, 2122, 281]
[187, 187, 140, 141, 172, 140, 125, 125, 141, 998, 140, 156, 110, 124, 1825, 109, 156, 156, 328,

In [56]:
len("CoC_Bulk_Notification_Student_Population_Student_Admission_Status_Criteria") *2

148

In [70]:
FeatureGMMdic = dict()
for key, value in FeatureVec.items():
    #print (key + " - " + str(len(value)))
    if len(value) > 9:
        #train GMM
        #FeatureGMMdic[key] = GMM Mode
        gmm = mixture.GMM(n_components=1, covariance_type='spherical')
        
        gmm.fit(np.array(value).reshape(-1,1))
        FeatureGMMdic[key] = gmm
        












In [71]:
FeatureGMMdic

{'ON': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
   tol=0.001, verbose=0),
 'NE': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
   tol=0.001, verbose=0),
 'OR': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
   tol=0.001, verbose=0),
 'RE': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
   tol=0.001, verbose=0),
 'ED': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, params='wmc', random_state=None,
   tol=0.001, verbose=0),
 'DT': GMM(covariance_type='spherical', init_params='wmc', min_covar=0.001,
   n_components=1, n_init=1, n_iter=100, par