In [1]:
import numpy as np
import pandas as pd
import math

class HMMForLSTM:
    """Discrete hidden Markov model re-estimated with scaled Baum-Welch on one sequence.

    One instance wraps a single observation sequence together with the model
    parameters ``A`` (state transitions), ``B`` (per-state emission
    probabilities) and ``pi`` (initial state distribution).  ``train_iter``
    performs one forward/backward pass followed by one re-estimation step.

    The forward variables are rescaled at every time step by ``c[t]`` (the
    reciprocal of the un-normalised column sum), and the log-likelihood of the
    sequence is recovered as ``-sum(log(c[t]))``.

    Note:
        ``A``, ``B`` and ``pi`` are stored by reference and are overwritten in
        place by ``reestimate_model``; pass copies if the caller needs to keep
        the previous parameters.
    """

    def __init__(self, obs_seq: str, word_to_idx: dict, vocab_size: int, num_hidden_states: int, A, B, pi):
        """Tokenise the sequence and allocate the work buffers.

        Args:
            obs_seq: observation symbols separated by single spaces.
            word_to_idx: maps each symbol to its column index in ``B``.
            vocab_size: number of distinct symbols (columns of ``B``).
            num_hidden_states: number of hidden states.
            A: indexable ``[from_state][to_state]`` transition probabilities.
            B: indexable ``[state][symbol]`` emission probabilities.
            pi: indexable ``[state]`` initial probabilities.

        Raises:
            KeyError: if a token of ``obs_seq`` is missing from ``word_to_idx``.
        """
        self.obs = [word_to_idx[word] for word in obs_seq.split(' ')]
        self.T = len(self.obs)
        self.N = num_hidden_states
        self.M = vocab_size
        self.alpha = np.zeros((self.T, self.N))
        self.beta = np.zeros((self.T, self.N))
        self.gammas = np.zeros((self.T, self.N))
        self.digammas = np.zeros((self.T, self.N, self.N))
        self.A = A
        self.B = B
        self.pi = pi
        self.c = np.zeros((self.T))

    def forward_algorithm(self):
        """Run the scaled forward pass, filling ``alpha`` and the scale factors ``c``.

        Returns:
            The array ``self.c`` (not a copy); ``c[t]`` is the reciprocal of the
            un-normalised sum of ``alpha[t]``.
        """
        # The recursion accumulates with ``+=``; clear the buffers so the method
        # can be called more than once on the same instance.
        self.alpha.fill(0.0)
        self.c.fill(0.0)

        for i in range(self.N):
            self.alpha[0][i] = self.pi[i]*self.B[i][self.obs[0]]
            self.c[0] = self.c[0] + self.alpha[0][i]

        self.c[0] = (1/self.c[0]*1.0)

        for i in range(self.N):
            self.alpha[0][i] = self.c[0]*self.alpha[0][i]

        for t in range(1, self.T):
            for i in range(self.N):
                for j in range(self.N):
                    self.alpha[t][i] += self.alpha[t-1][j]*self.A[j][i]
                self.alpha[t][i] = self.alpha[t][i]*self.B[i][self.obs[t]]
                self.c[t] += self.alpha[t][i]

            self.c[t] = (1/self.c[t]*1.0)

            for i in range(self.N):
                self.alpha[t][i] = self.c[t]*self.alpha[t][i]

        return self.c

    def backward_algorithm(self):
        """Run the backward pass, scaling ``beta[t]`` by the forward factor ``c[t]``.

        Must be called after ``forward_algorithm`` because it reads ``self.c``.
        """
        # Cleared for the same reason as in the forward pass: the recursion accumulates.
        self.beta.fill(0.0)

        for i in range(self.N):
            self.beta[self.T - 1][i] = self.c[self.T-1]

        for t in reversed(range(self.T - 1)):
            for i in range(self.N):
                for j in range(self.N):
                    self.beta[t][i] = self.beta[t][i] + self.A[i][j]*self.B[j][self.obs[t+1]]*self.beta[t+1][j]
                self.beta[t][i] = self.c[t]*self.beta[t][i]

    def compute_gammas_and_digammas(self):
        """Compute state posteriors ``gammas[t][i]`` and pair posteriors ``digammas[t][i][j]``.

        For ``t < T-1`` each ``digammas[t]`` is normalised to sum to one and
        ``gammas[t][i]`` is its row sum.  The last time step has no successor,
        so ``gammas[T-1]`` is the normalised ``alpha[T-1]``.
        """
        # ``gammas`` is accumulated below; ``digammas`` is assigned outright.
        self.gammas.fill(0.0)

        for t in range(self.T - 1):
            denom = 0.0
            for i in range(self.N):
                for j in range(self.N):
                    denom = denom + self.alpha[t][i]*self.A[i][j]*self.B[j][self.obs[t+1]]*self.beta[t+1][j]
            for i in range(self.N):
                for j in range(self.N):
                    self.digammas[t][i][j] = (self.alpha[t][i]*self.A[i][j]*self.B[j][self.obs[t+1]]*self.beta[t+1][j])/denom
                    self.gammas[t][i] = self.gammas[t][i] + self.digammas[t][i][j]

        denom = 0.0
        for i in range(self.N):
            denom += self.alpha[self.T - 1][i]

        for i in range(self.N):
            self.gammas[self.T-1][i] = self.alpha[self.T - 1][i]/denom

    def reestimate_model(self):
        """Overwrite ``pi``, ``A`` and ``B`` in place from the current posteriors.

        A row whose normaliser is exactly zero (for example every row of ``A``
        when the sequence has a single observation, so no transition was seen)
        is left unchanged instead of dividing by zero.
        """
        for i in range(self.N):
            self.pi[i] = self.gammas[0][i]

        # Transition matrix: expected i->j transitions over expected visits to i.
        for i in range(self.N):
            # The normaliser depends only on i, so compute it once per row.
            denom = 0.0
            for t in range(self.T - 1):
                denom = denom + self.gammas[t][i]
            if denom == 0:
                continue

            for j in range(self.N):
                numer = 0.0
                for t in range(self.T - 1):
                    numer = numer + self.digammas[t][i][j]
                # Write to the instance's matrix, not to a module-level global.
                self.A[i][j] = numer/denom

        # Emission matrix: expected time in state i emitting symbol j over
        # expected time in state i.  A single pass over the sequence buckets
        # the posteriors by observed symbol instead of rescanning it per symbol.
        for i in range(self.N):
            denom = 0.0
            numer = np.zeros(self.M)
            for t in range(self.T):
                symbol = self.obs[t]
                if 0 <= symbol < self.M:
                    numer[symbol] += self.gammas[t][i]
                denom += self.gammas[t][i]
            if denom == 0:
                continue

            for j in range(self.M):
                self.B[i][j] = numer[j]/denom

    def compute_loss(self):
        """Return ``-sum(log(c[t]))`` for the most recent forward pass.

        Despite the method name this is the log-likelihood of the sequence
        under the parameters used by that forward pass, so larger is better.
        """
        logProb = 0.0
        for i in range(self.T):
            logProb += math.log(self.c[i])
        logProb = -1*logProb
        return logProb

    def train_iter(self, iter_num):
        """Run one forward/backward/re-estimation step.

        Args:
            iter_num: zero-based iteration counter, used only for the progress message.

        Returns:
            Tuple ``(logProb, A, B, pi)``: the log-likelihood under the
            parameters held *before* this step's re-estimation, followed by the
            updated parameter objects (the same objects given to ``__init__``).
        """
        print('{} iteration is running'.format(iter_num + 1))

        self.forward_algorithm()

        self.backward_algorithm()

        self.compute_gammas_and_digammas()

        self.reestimate_model()

        logProb = self.compute_loss()

        return logProb, self.A, self.B, self.pi
        
                    
    
        
        

In [2]:
# Load the dataset from 'malware_data.csv'; the file's first column becomes the DataFrame index.
data = pd.read_csv('malware_data.csv', index_col = 0)

In [3]:
# Preview the first rows to check the loaded columns.
data.head()

Unnamed: 0,OpCode Sequence,Malware Family
0,push mov push mov mov mov mov cmp jz push sti ...,CLUSTERclarkclark
1,push add mov push call test jz movzx mov add p...,ufasoftbitcoin
2,push push push call add push call mov push pus...,ufasoftbitcoin
3,push add mov push call test jz movzx mov add p...,ufasoftbitcoin
4,mov push mov push push mov and test push push ...,CLUSTERgdata


In [4]:
import pickle
# Load the pre-computed embeddings object from disk.
# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load 'word2vec_embeddings.pkl' if it comes from a trusted source.
# presumably `embed` maps opcode tokens to vectors; only iteration over it is used below — TODO confirm
with open('word2vec_embeddings.pkl', 'rb') as fp:
    embed = pickle.load(fp)

In [5]:
# Assign every token yielded by iterating `embed` a consecutive integer id (0, 1, 2, ...).
# NOTE(review): `index` and `word` stay bound at notebook scope after this loop;
# `index` ends up equal to the number of tokens iterated.
word_to_idx = dict()
index = 0
for word in embed:
    word_to_idx[word] = index
    index = index + 1

In [6]:
# Reverse lookup: integer id -> token.
idx_to_word = {idx: token for token, idx in word_to_idx.items()}

In [7]:
import random
def make_matrix_non_uniform(mat):
    """Perturb every row of ``mat`` in place while keeping each row sum at 1.

    For each row, a random fraction (below 1%) of every even-indexed entry is
    moved to its right-hand neighbour, the row is shuffled, and any remaining
    difference between the row sum and 1 is added to the last entry.

    Args:
        mat: indexable collection of mutable numeric rows; modified in place.
    """
    for row_idx in range(len(mat)):
        row = mat[row_idx]
        last = len(row) - 1

        # Walk the row in (even, odd) pairs and shift a little mass rightwards.
        for left in range(0, last, 2):
            shift = random.uniform(0, 0.01) * row[left]
            row[left] = row[left] - shift
            row[left + 1] = row[left + 1] + shift

        np.random.shuffle(row)

        # Absorb accumulated rounding error into the final entry.
        total = np.sum(row)
        if total != 1:
            row[last] = row[last] + 1 - total
        

def make_vector_non_uniform(vec):
    """Perturb ``vec`` in place while keeping its sum at 1.

    A random fraction (below 1%) of every even-indexed entry is moved to its
    right-hand neighbour, the vector is shuffled, and any remaining difference
    between the sum and 1 is added to the last entry.

    Args:
        vec: mutable numeric sequence; modified in place.
    """
    last = len(vec) - 1

    # Walk the vector in (even, odd) pairs and shift a little mass rightwards.
    for left in range(0, last, 2):
        shift = random.uniform(0, 0.01) * vec[left]
        vec[left] = vec[left] - shift
        vec[left + 1] = vec[left + 1] + shift

    np.random.shuffle(vec)

    # Absorb accumulated rounding error into the final entry.
    total = np.sum(vec)
    if total != 1:
        vec[last] = vec[last] + 1 - total

In [8]:
# Model dimensions: 20 hidden states; M is the vocabulary size taken from word_to_idx.
num_hidden_states = 20
M = len(word_to_idx)

# Start every distribution uniform:
#   A  - (num_hidden_states x num_hidden_states) state-transition matrix
#   B  - (num_hidden_states x M) emission matrix
#   pi - length-num_hidden_states initial-state vector
A = np.full((num_hidden_states, num_hidden_states), 1.0/num_hidden_states)
B = np.full((num_hidden_states, M), 1.0/M)
pi = np.full((num_hidden_states), 1.0/num_hidden_states)

In [9]:
# Perturb the uniform parameters in place so the rows are no longer exactly uniform.
# NOTE(review): neither `random` nor `np.random` is seeded in the visible cells,
# so the resulting initial parameters differ on every run.
make_matrix_non_uniform(A)
make_matrix_non_uniform(B)
make_vector_non_uniform(pi)

In [10]:
# For every row, the number of rows that share its 'Malware Family' value.
counts_family = data.groupby('Malware Family')['Malware Family'].transform(len)
# Boolean row mask: True where the row's family has more than 50 rows.
mask = (counts_family > 50)

In [11]:
# Keep only the rows selected by `mask`.
# NOTE(review): this rebinds `data`, so the cell is not idempotent — re-running it
# without re-running the cells above applies a mask built from the unfiltered frame.
data = data[mask]

In [12]:
# Number of distinct malware families remaining after the filter.
nr_categories = len(data["Malware Family"].unique())
print(nr_categories)

7


In [13]:
# List the distinct family names that remain.
data["Malware Family"].unique()

array(['zeroaccess', 'winwebsec', 'securityshield', 'zbot', 'cridex',
       'smarthdd', 'harebot'], dtype=object)

In [15]:
# Select only the rows whose family is 'zbot'; these are the sequences used for training below.
data1 = data.loc[data['Malware Family'] == 'zbot']

In [16]:
# Stopping criteria for the re-estimation loop, hoisted out of the per-sequence
# loop because they never change.
min_iters = 10    # always run at least this many re-estimation passes per sequence
max_iters = 100   # hard cap: without it the loop can run indefinitely (tune as needed)
epsilon = 0.001   # stop once the log-likelihood moves by no more than this between passes

# Train on each selected sequence in turn; A, B and pi carry over from one
# sequence to the next.
for index, row in data1.iterrows():
    print('current observation number is {}'.format(index))
    obs_seq = row['OpCode Sequence']
    oldLogProb = float('-inf')
    delta = 0.0
    iters = 0
    # Continue while under the cap AND (the minimum is not yet reached OR the
    # log-likelihood is still changing by more than epsilon).
    while iters < max_iters and (iters < min_iters or delta > epsilon):
        # A fresh instance per pass gives zeroed work buffers for the recursions.
        hmm = HMMForLSTM(obs_seq, word_to_idx, M, num_hidden_states, A, B, pi)
        logProb, A_new, B_new, pi_new = hmm.train_iter(iters)
        print('loss is {:.3f}'.format(logProb))
        A = A_new
        B = B_new
        pi = pi_new
        iters = iters + 1
        delta = abs(logProb - oldLogProb)
        oldLogProb = logProb

current observation number is 5827
1 iteration is running
loss is -5477.931
2 iteration is running
loss is -2762.164
3 iteration is running
loss is -2762.164
4 iteration is running
loss is -2762.164
5 iteration is running
loss is -2762.164
6 iteration is running
loss is -2762.163
7 iteration is running
loss is -2762.163
8 iteration is running
loss is -2762.163
9 iteration is running
loss is -2762.162
10 iteration is running
loss is -2762.160
11 iteration is running
loss is -2762.158
12 iteration is running
loss is -2762.154
13 iteration is running
loss is -2762.147
14 iteration is running
loss is -2762.135
15 iteration is running
loss is -2762.113
16 iteration is running


KeyboardInterrupt: 