In [1]:
import numpy as np
import pandas as pd

import pickle

In [2]:
A_matrices = dict()
B_matrices = dict()
pi_matrices = dict()

In [3]:
data = pd.read_csv('malware_data.csv', index_col = 0)

In [4]:
counts_family = data.groupby('Malware Family')['Malware Family'].transform(len)
mask = (counts_family > 50)
data = data[mask]
malware_families = data["Malware Family"].unique()

In [5]:
for i in range(len(malware_families)):
    with open('HMMMatrices/' + malware_families[i] + 'A.pkl', 'rb') as f:
        A_matrices[malware_families[i]] = pickle.load(f)
    with open('HMMMatrices/' + malware_families[i] + 'B.pkl', 'rb') as f:
        B_matrices[malware_families[i]] = pickle.load(f)
    with open('HMMMatrices/' + malware_families[i] + 'Pi.pkl', 'rb') as f:
        pi_matrices[malware_families[i]] = pickle.load(f)

In [6]:
data.head()

Unnamed: 0,OpCode Sequence,Malware Family
18,push mov sub mov mov and mov and mov inc mov c...,zeroaccess
19,push mov sub push push push mov mov mov mov mo...,zeroaccess
20,push mov sub jmp push mov jmp inc cli mov mov ...,zeroaccess
21,push mov push dec push push mov push call push...,zeroaccess
22,push mov sub push and jmp mov inc mov cmp jnb ...,zeroaccess


In [7]:
count = 0
for index, row in data.iterrows():
    if len(row['OpCode Sequence'].split(' ')) < 200:
        data = data.drop(index)
        count += 1
print(count)

26


In [8]:
import pickle
with open('word2vec_embeddings.pkl', 'rb') as fp:
    embed = pickle.load(fp)

word_to_idx = dict()
index = 0
for word in embed:
    word_to_idx[word] = index
    index = index + 1

In [9]:
class OptimalHiddenStateGetter:
    def __init__(self, obs_seq: str, word_to_idx: dict(), malware_family: str, vocab_size: int, num_hidden_states: int, A_matrices: dict(),
                B_matrices: dict(), pi_matrices: dict()):
        self.obs = []
        for word in obs_seq:
            self.obs.append(word_to_idx[word])
        self.T = len(self.obs)
        self.N = num_hidden_states
        self.M = vocab_size
        self.alpha = np.zeros((self.T, self.N))
        self.beta = np.zeros((self.T, self.N))
        self.gammas = np.zeros((self.T, self.N))
        self.digammas = np.zeros((self.T, self.N, self.N))
        self.A = A_matrices[malware_family]
        self.B = B_matrices[malware_family]
        self.pi = pi_matrices[malware_family]
        self.c = np.zeros((self.T))
        
    def forward_algorithm(self):
        for i in range(self.N):
            self.alpha[0][i] = self.pi[i]*self.B[i][self.obs[0]]
            self.c[0] = self.c[0] + self.alpha[0][i]
        
        self.c[0] = (1/self.c[0]*1.0)
        
        for i in range(self.N):
            self.alpha[0][i] = self.c[0]*self.alpha[0][i]
        
        for t in range(1, self.T):
            for i in range(self.N):
                for j in range(self.N):
                    self.alpha[t][i] += self.alpha[t-1][j]*self.A[j][i]
                self.alpha[t][i] = self.alpha[t][i]*self.B[i][self.obs[t]]
                self.c[t] += self.alpha[t][i]
            
            self.c[t] = (1/self.c[t]*1.0)
            
            for i in range(self.N):
                self.alpha[t][i] = self.c[t]*self.alpha[t][i]
        
        return self.c
    
    def backward_algorithm(self):
        for i in range(self.N):
            self.beta[self.T - 1][i] = self.c[self.T-1]
        
        for t in reversed(range(self.T - 1)):
            for i in range(self.N):
                for j in range(self.N):
                    self.beta[t][i] = self.beta[t][i] + self.A[i][j]*self.B[j][self.obs[t+1]]*self.beta[t+1][j]
                self.beta[t][i] = self.c[t]*self.beta[t][i]
                
    def find_optimal_hidden_state(self):
        print("running forward algorithm")
        self.forward_algorithm()
        print("running backward algorithm")
        self.backward_algorithm()
        print("finding optimal hidden state")
        hidden_state_seq = np.zeros((self.T))
        for t in range(self.T):
            max_prob = 0
            for i in range(self.N):
                if self.alpha[t][i]*self.beta[t][i] > max_prob:
                    max_prob = self.alpha[t][i]*self.beta[t][i]
                    hidden_state_seq[t] = i
        return hidden_state_seq
            

In [None]:
num_hidden_states = 20
hidden_state_vec = dict()
for index, row in data.iterrows():
    hidden_state_vec[index] = []
    for family in malware_families:
        print('Current index running is: {}, and malware family is {}'.format(index, family))
        obs_seq = row['OpCode Sequence'].split(' ')[0:200]
        hmm = OptimalHiddenStateGetter(obs_seq, word_to_idx, family, len(word_to_idx), num_hidden_states,
                                      A_matrices, B_matrices, pi_matrices)
        hidden_state_vec[index].append(hmm.find_optimal_hidden_state())

Current index running is: 18, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 18, and malware family is harebot
running forward algorithm
running backward algorithm


  self.c[t] = (1/self.c[t]*1.0)
  self.alpha[t][i] = self.c[t]*self.alpha[t][i]


finding optimal hidden state
Current index running is: 19, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 19, and malware family is harebot
running forward algorithm
running backward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 27, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 27, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 27, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 27, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 27, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 28, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 28, and malware family is winwebsec
running forward algorit

finding optimal hidden state
Current index running is: 35, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 35, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 35, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 35, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 36, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 36, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 36, and malware family is securityshield
running forward algorithm
running backward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 43, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 44, and malware family is smarthdd
running forward algorit

finding optimal hidden state
Current index running is: 52, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 52, and malware family is harebot
running forward algorithm
running backward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 60, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 60, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 60, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 60, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 61, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 61, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 61, and malware family is securityshield
running forward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 68, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 68, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 69, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 69, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 69, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 69, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 69, and malware family is cridex
running forward algorit

finding optimal hidden state
Current index running is: 76, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 77, and malware family is smarthdd
running forward algorithm
running backward algorit

finding optimal hidden state
Current index running is: 85, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 85, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 85, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 85, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 85, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 85, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 86, and malware family is zeroaccess
running forward algorithm
running backward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 93, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 93, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 93, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 93, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 94, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 94, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 94, and malware family is securityshield
running forward algorit

running backward algorithm
finding optimal hidden state
Current index running is: 101, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 101, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 102, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 102, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 102, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 102, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 102, and malware family is cridex
running forward 

finding optimal hidden state
Current index running is: 109, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 110, and malware family is smarthdd
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 118, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 119, and malware family is zeroaccess
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 126, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 126, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 126, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 126, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 126, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 127, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 127, and malware family is winwebsec
running forward 

finding optimal hidden state
Current index running is: 134, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 134, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 134, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 134, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 135, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 135, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 135, and malware family is securityshield
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 142, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 142, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 142, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 143, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 143, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 143, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 143, and malware family is zbot
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 150, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 151, and malware family is smarthdd
running forward 

finding optimal hidden state
Current index running is: 159, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 159, and malware family is harebot
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 167, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 168, and malware family is zeroaccess
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 175, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 175, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 175, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 175, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 176, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 176, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 176, and malware family is securityshield
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 183, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 183, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 183, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 184, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 184, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 184, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 184, and malware family is zbot
running forward 

finding optimal hidden state
Current index running is: 191, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 191, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 192, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 192, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 192, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 192, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 192, and malware family is cridex
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 199, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 200, and malware family is smarthdd
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 208, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 209, and malware family is zeroaccess
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 216, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 216, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 216, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 216, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 217, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 217, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 217, and malware family is securityshield
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 224, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 224, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 224, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 225, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 225, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 225, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 225, and malware family is zbot
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 232, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 232, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 233, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 233, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 233, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 233, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 233, and malware family is cridex
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 240, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 241, and malware family is smarthdd
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 249, and malware family is harebot
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 257, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 258, and malware family is zeroaccess
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 265, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 265, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 265, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 265, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 265, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 266, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 266, and malware family is winwebsec
running forward 

finding optimal hidden state
Current index running is: 273, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 273, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 273, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 273, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 274, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 274, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 274, and malware family is securityshield
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 281, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 281, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 281, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 282, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 282, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 282, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 282, and malware family is zbot
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 289, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 289, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 290, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 290, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 290, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 290, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 290, and malware family is cridex
running forward 

running backward algorithm
finding optimal hidden state
Current index running is: 297, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 298, and malware family is smarthdd
running forward 

finding optimal hidden state
Current index running is: 306, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 306, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 306, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 306, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 306, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 306, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 307, and malware family is zeroaccess
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 314, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 314, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 314, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 314, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 314, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 315, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 315, and malware family is winwebsec
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 322, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 322, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 322, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 323, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 323, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 323, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 323, and malware family is zbot
running forward 

finding optimal hidden state
Current index running is: 330, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 330, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 331, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 331, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 331, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 331, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 331, and malware family is cridex
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 338, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 339, and malware family is smarthdd
running forward algorithm
running backward 

  self.beta[t][i] = self.beta[t][i] + self.A[i][j]*self.B[j][self.obs[t+1]]*self.beta[t+1][j]


finding optimal hidden state
Current index running is: 347, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 347, and malware family is harebot
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 355, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 355, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 355, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 355, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 355, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 355, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 356, and malware family is zeroaccess
running forward algorithm
running backward 

running backward algorithm
finding optimal hidden state
Current index running is: 363, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 363, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 363, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 363, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 364, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 364, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 364, and malware family is securityshield
running forward 

finding optimal hidden state
Current index running is: 371, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 371, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 371, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 372, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 372, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 372, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 372, and malware family is zbot
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 379, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 379, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 380, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 380, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 380, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 380, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 380, and malware family is cridex
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 387, and malware family is harebot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 388, and malware family is smarthdd
running forward algorithm
running backward 

finding optimal hidden state
Current index running is: 396, and malware family is zeroaccess
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is winwebsec
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is securityshield
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is zbot
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is cridex
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is smarthdd
running forward algorithm
running backward algorithm
finding optimal hidden state
Current index running is: 396, and malware family is harebot
running forward algorithm
running backward 

In [11]:
X = []
for key in hidden_state_vec:
    hidden_state_seq = []
    for seq in hidden_state_vec[key]:
        hidden_state_seq = hidden_state_seq + seq.tolist()
    X.append(hidden_state_seq)

In [12]:
X = np.array(X)

In [13]:
X.shape

(8028, 1400)

In [14]:
factor = pd.factorize(data['Malware Family'])
y = factor[0]
definitions = factor[1]

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)

In [17]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [19]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [21]:
clf.score(X_test, y_test)

0.9682440846824408

In [22]:
clf.score(X_train, y_train)

0.9982871379632513