In [2]:
import pickle
import sys


In [3]:
# Locations of the pickled normal / ransom registry datasets.
normal_file_path = './dataset/normal_registry_total.pkl'
ransom_file_path = './dataset/ransom_registry_total.pkl'

# Deserialize both datasets. encoding="bytes" only affects 8-bit strings that
# were pickled by Python 2 (they come back as bytes objects).
# NOTE(review): pickle.load can run arbitrary code while deserializing, so
# these files must come from a trusted source.
with open(normal_file_path, 'rb') as fr:
    normal_data = pickle.load(fr, encoding="bytes")
with open(ransom_file_path, 'rb') as fr:
    ransom_data = pickle.load(fr, encoding="bytes")

In [4]:
# Preprocessing for lowercase
import numpy as np

# Lowercase every entry element-wise so that keys differing only in case
# are counted as the same string later on.
normal_data, ransom_data = (
    np.char.lower(registry_keys) for registry_keys in (normal_data, ransom_data)
)

In [5]:
# calculate the weights
from collections import Counter

def string_frequency(strings):
    """Return the relative frequency of every distinct item in ``strings``.

    As a side effect, prints the number of distinct items.

    Args:
        strings: A sized iterable of hashable items.

    Returns:
        dict mapping each distinct item (in first-seen order) to
        ``count / len(strings)``, rounded to 6 decimal places.
    """
    sample_size = len(strings)
    occurrences = Counter(strings)
    print(len(occurrences))
    return {
        item: round(count / sample_size, 6)
        for item, count in occurrences.items()
    }


In [6]:
# Relative frequency of each distinct entry, computed for the normal dataset
# first and the ransom dataset second (string_frequency also prints the
# number of distinct entries for each).
normal_weights_dict, ransom_weights_dict = (
    string_frequency(dataset) for dataset in (normal_data, ransom_data)
)

4685
6452


In [7]:
# Order the (key, weight) pairs from the highest weight to the lowest.
# list.sort is stable, so keys with equal weights keep their dict order.
normal_query_weights = list(normal_weights_dict.items())
normal_query_weights.sort(key=lambda pair: pair[1], reverse=True)

ransom_query_weights = list(ransom_weights_dict.items())
ransom_query_weights.sort(key=lambda pair: pair[1], reverse=True)

In [8]:
# Display the first 50 (key, weight) pairs of the normal list.
normal_query_weights[:50]

[('hkey_local_machine\\software\\classes\\allfilesystemobjects\\docobject',
  0.000953),
 ('hkey_local_machine\\software\\classes\\directory\\browseinplace', 0.000953),
 ('hkey_local_machine\\software\\classes\\directory\\docobject', 0.000953),
 ('hkey_local_machine\\software\\classes\\allfilesystemobjects\\browseinplace',
  0.000953),
 ('hkey_local_machine\\software\\classes\\folder\\docobject', 0.000953),
 ('hkey_local_machine\\software\\classes\\folder\\browseinplace', 0.000953),
 ('hkey_local_machine\\software\\classes\\unknown\\browseinplace', 0.000901),
 ('hkey_local_machine\\software\\classes\\unknown\\docobject', 0.000901),
 ('hkey_current_user\\local settings\\software\\microsoft\\windows\\shell\\muicache\\c:\\program files (x86)\\adobe\\reader 11.0\\reader\\acrord32.exe',
  0.000901),
 ('hkey_local_machine\\software\\classes\\.bin\\docobject', 0.000857),
 ('hkey_local_machine\\software\\classes\\.bin\\browseinplace', 0.000857),
 ('hkey_current_user\\software\\microsoft\\windo

In [9]:
# Display the first 50 (key, weight) pairs of the ransom list.
ransom_query_weights[:50]

[('hkey_local_machine\\software\\microsoft\\windows nt\\currentversion\\gre_initialize\\disablemetafiles',
  0.001185),
 ('hkey_local_machine\\software\\classes\\allfilesystemobjects\\browseinplace',
  0.000853),
 ('hkey_local_machine\\software\\classes\\allfilesystemobjects\\docobject',
  0.000853),
 ('hkey_local_machine\\software\\classes\\directory\\browseinplace', 0.000853),
 ('hkey_local_machine\\software\\classes\\folder\\docobject', 0.000853),
 ('hkey_local_machine\\software\\classes\\directory\\docobject', 0.000853),
 ('hkey_local_machine\\software\\classes\\folder\\browseinplace', 0.000853),
 ('hkey_local_machine\\software\\microsoft\\windows nt\\currentversion\\profilelist\\s-1-5-21-1608274813-2371932595-3444937299-1001',
  0.000735),
 ('hkey_local_machine\\software\\microsoft\\windows\\currentversion\\explorer\\folderdescriptions',
  0.000711),
 ('hkey_local_machine\\software\\wow6432node\\microsoft\\windows\\currentversion\\explorer\\folderdescriptions\\{5e6c858f-0e22-4760-

In [11]:
import pandas as pd
# Tabulate the (key, weight) pairs and number the rows 1..N in list order.
# Keys with equal weights still receive distinct, consecutive ranks.
normal_rank = range(1, len(normal_weights_dict) + 1)
normal_df = pd.DataFrame(
    normal_query_weights, columns=['features', 'weights']
).assign(rank=normal_rank)

In [12]:
# Tabulate the (key, weight) pairs and number the rows 1..N in list order.
# Keys with equal weights still receive distinct, consecutive ranks.
ransom_rank = range(1, len(ransom_weights_dict) + 1)
ransom_df = pd.DataFrame(
    ransom_query_weights, columns=['features', 'weights']
).assign(rank=ransom_rank)

In [48]:
# Display the ransom table (columns: features, weights, rank).
ransom_df

Unnamed: 0,features,weights,rank
0,hkey_local_machine\software\microsoft\windows ...,0.001185,1
1,hkey_local_machine\software\classes\allfilesys...,0.000853,2
2,hkey_local_machine\software\classes\allfilesys...,0.000853,3
3,hkey_local_machine\software\classes\directory\...,0.000853,4
4,hkey_local_machine\software\classes\folder\doc...,0.000853,5
...,...,...,...
6447,hkey_current_user\software\sistemanet\lk4,0.000024,6448
6448,hkey_current_user\software\microsoft\windows\c...,0.000024,6449
6449,hkey_current_user\software\microsoft\windows\c...,0.000024,6450
6450,hkey_current_user\software\microsoft\windows\c...,0.000024,6451


In [24]:
# Display only: rows of ransom_df whose weight is strictly above 0.0006.
ransom_df.loc[ransom_df['weights'].gt(0.0006)]

Unnamed: 0,features,weights,rank
0,hkey_local_machine\software\microsoft\windows ...,0.001185,1
1,hkey_local_machine\software\classes\allfilesys...,0.000853,2
2,hkey_local_machine\software\classes\allfilesys...,0.000853,3
3,hkey_local_machine\software\classes\directory\...,0.000853,4
4,hkey_local_machine\software\classes\folder\doc...,0.000853,5
...,...,...,...
65,hkey_local_machine\software\wow6432node\micros...,0.000616,66
66,hkey_local_machine\software\wow6432node\micros...,0.000616,67
67,hkey_current_user\software\microsoft\windows\c...,0.000616,68
68,hkey_local_machine\software\wow6432node\micros...,0.000616,69


In [38]:
# Keep the rows of ransom_df whose weight is strictly above 0.0007.
high_weight_mask = ransom_df['weights'].gt(0.0007)
df1 = ransom_df.loc[high_weight_mask]
df1

Unnamed: 0,features,weights,rank
0,hkey_local_machine\software\microsoft\windows ...,0.001185,1
1,hkey_local_machine\software\classes\allfilesys...,0.000853,2
2,hkey_local_machine\software\classes\allfilesys...,0.000853,3
3,hkey_local_machine\software\classes\directory\...,0.000853,4
4,hkey_local_machine\software\classes\folder\doc...,0.000853,5
5,hkey_local_machine\software\classes\directory\...,0.000853,6
6,hkey_local_machine\software\classes\folder\bro...,0.000853,7
7,hkey_local_machine\software\microsoft\windows ...,0.000735,8
8,hkey_local_machine\software\microsoft\windows\...,0.000711,9
9,hkey_local_machine\software\wow6432node\micros...,0.000711,10


In [43]:
# Pair each selected ransom key with the same key in the normal table.
# (A previous pd.concat(...) call here was removed: concat returns a new
# frame and its result was never assigned, so it had no effect.)
# merge defaults to an inner join on 'features'; the overlapping 'weights'
# and 'rank' columns get the default suffixes _x (df1, left side) and
# _y (normal_df, right side).
# NOTE(review): because the join is inner, keys that never occur in
# normal_df are dropped from the comparison -- confirm this is intended.
feature_ranks = df1.merge(normal_df, on=['features'])
feature_ranks_com = feature_ranks[['features','rank_x','rank_y']]
feature_ranks_com

Unnamed: 0,features,rank_x,rank_y
0,hkey_local_machine\software\microsoft\windows ...,1,606
1,hkey_local_machine\software\classes\allfilesys...,2,4
2,hkey_local_machine\software\classes\allfilesys...,3,1
3,hkey_local_machine\software\classes\directory\...,4,2
4,hkey_local_machine\software\classes\folder\doc...,5,5
5,hkey_local_machine\software\classes\directory\...,6,3
6,hkey_local_machine\software\classes\folder\bro...,7,6
7,hkey_local_machine\software\microsoft\windows ...,8,26
8,hkey_local_machine\software\microsoft\windows\...,9,61
9,hkey_local_machine\software\wow6432node\micros...,10,67


In [44]:
# Keep only the rows whose rank_y (the rank taken from the right-hand table
# of the merge) is strictly greater than 35.
beyond_top_35 = feature_ranks_com['rank_y'].gt(35)
feature_final = feature_ranks_com.loc[beyond_top_35]
feature_final

Unnamed: 0,features,rank_x,rank_y
0,hkey_local_machine\software\microsoft\windows ...,1,606
8,hkey_local_machine\software\microsoft\windows\...,9,61
9,hkey_local_machine\software\wow6432node\micros...,10,67
10,hkey_current_user\software\microsoft\windows\c...,11,68
11,hkey_current_user\software\microsoft\windows\c...,12,62
13,hkey_local_machine\software\wow6432node\micros...,14,63
14,hkey_local_machine\software\wow6432node\micros...,15,75
15,hkey_local_machine\software\wow6432node\micros...,16,79
16,hkey_local_machine\software\wow6432node\micros...,17,86
17,hkey_local_machine\software\wow6432node\micros...,18,91


In [47]:
# Display feature_final renumbered from 0; the previous index is kept as an
# 'index' column. The result is not assigned, so feature_final is unchanged.
feature_final.reset_index()

Unnamed: 0,index,features,rank_x,rank_y
0,0,hkey_local_machine\software\microsoft\windows ...,1,606
1,8,hkey_local_machine\software\microsoft\windows\...,9,61
2,9,hkey_local_machine\software\wow6432node\micros...,10,67
3,10,hkey_current_user\software\microsoft\windows\c...,11,68
4,11,hkey_current_user\software\microsoft\windows\c...,12,62
5,13,hkey_local_machine\software\wow6432node\micros...,14,63
6,14,hkey_local_machine\software\wow6432node\micros...,15,75
7,15,hkey_local_machine\software\wow6432node\micros...,16,79
8,16,hkey_local_machine\software\wow6432node\micros...,17,86
9,17,hkey_local_machine\software\wow6432node\micros...,18,91
