# An implementation of TF-IDF for sentences

**Each document is composed of API calls and registry operations.**

In [1]:
from nltk.corpus import stopwords
import pickle

In [2]:
# combine the title with the values
def splice_key_value(file):
    """Flatten a pickled ``{key: values}`` mapping into ``"<key>_<value>"`` strings.

    Args:
        file: Path of a pickle file holding a mapping whose values are
            iterables.

    Returns:
        A list with one ``key + '_' + str(value)`` string per (key, value)
        pair, in the mapping's iteration order.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file, 'rb') as handle:
        grouped_values = pickle.load(handle, encoding="bytes")
    return [
        key + '_' + str(value)
        for key, values in grouped_values.items()
        for value in values
    ]

In [3]:
registry_values_ransom_file = '../dataset/registry_values/registry_values_ransom.pkl'
registry_values_normal_file = '../dataset/registry_values/registry_values_normal.pkl'

# Flatten both pickles (ransom first, then normal) into "<key>_<value>" lists.
registry_values_ransom_list, registry_values_normal_list = map(
    splice_key_value,
    (registry_values_ransom_file, registry_values_normal_file),
)

In [4]:
# Total number of "<key>_<value>" strings, duplicates included.
len(registry_values_ransom_list)

50515

In [5]:
# Number of distinct "<key>_<value>" strings.
len(set(registry_values_ransom_list))

6657

In [6]:
# Peek at the first five entries.
registry_values_ransom_list[:5]

['regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\DnsCache\\Parameters',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\Windows NT\\DnsClient',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\System\\DNSClient',
 'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\Tcpip\\Parameters']

# Term Frequency

In [7]:
import numpy as np

# function to convert string to lower case and delete the stop words in document
def preprocess_string(document, stop_words=None):
    """Lowercase every sentence, split it on backslashes and drop stop words.

    The previous version returned from inside the stop-word loop, so at most
    one occurrence of one arbitrary stop word was ever removed. Every
    occurrence of every stop word is now filtered out.

    Args:
        document: Iterable of strings (e.g. registry paths).
        stop_words: Optional iterable of lowercase words to discard. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A flat list of the lowercase, backslash-separated words of all
        sentences, in order of appearance, without stop words.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests while filtering.
    stop_words = set(stop_words)

    registry_words = []
    for registry_value in document:
        for word in registry_value.lower().split("\\"):
            if word not in stop_words:
                registry_words.append(word)
    return registry_words


In [8]:
# Here we count how many times each word appears in the words array.
# We do not use n-grams, because the tokens are whole path components (single strings), not just natural-language words.
from collections import Counter

def countWords(words):
    """Count how many times each word occurs in ``words``.

    Args:
        words: Iterable of hashable items (here: word strings).

    Returns:
        A ``collections.Counter`` mapping ``{WORD: COUNT}``.
    """
    return Counter(words)


In [9]:
# calculate the frequency of term in a document
def termFrequency(document):
    """Compute the relative frequency of every term in a document.

    Args:
        document: Collection of sentences, passed to ``preprocess_string``.

    Returns:
        A dict mapping each term to its occurrence count divided by the
        total number of words returned by ``preprocess_string``.
    """
    # Words of the whole document after preprocessing.
    registry_words = preprocess_string(document)
    # Raw occurrence count per term.
    occurrences = countWords(registry_words)
    return {
        term: occurrences[term] / len(registry_words)
        for term in occurrences
    }

In [10]:
# Term frequencies computed over the whole ransom registry list.
word_freq_ransom = termFrequency(registry_values_ransom_list)

In [11]:
# calculate the sentence importance
def senFrequency(sentence, word_freq):
    """Return the mean term frequency of the distinct known words of a sentence.

    The sentence is lowercased and split on backslashes. Each distinct word
    that has an entry in ``word_freq`` contributes its frequency once.

    Args:
        sentence: String whose words are separated by backslashes.
        word_freq: Mapping ``{word: term frequency}`` with lowercase keys.

    Returns:
        The average frequency of the matched words, rounded to 3 decimals,
        or 0.0 when no word of the sentence is in ``word_freq`` (this case
        used to raise ZeroDivisionError).
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; looking the
    # sentence's words up in the mapping avoids scanning the whole vocabulary.
    distinct_words = dict.fromkeys(sentence.lower().split('\\'))
    known_freqs = [word_freq[word] for word in distinct_words if word in word_freq]
    if not known_freqs:
        return np.round(0.0, 3)
    return np.round(sum(known_freqs) / len(known_freqs), 3)

In [12]:
# # generate the sen_freq dict with indexes as the keys
# sen_freq_ran_dict = {}
# for i, sentence in enumerate(registry_values_ransom_list): 
#     # input the lowercased sentence
    
#     sen_freq = senFrequency(sentence, word_freq_ransom)
#     sen_freq_ran_dict[str(i)] = sen_freq

In [13]:
# Generate the sen_freq dict with the sentence strings as keys.
# The list contains many duplicates, so each distinct sentence is scored
# only once; dict.fromkeys keeps the first-seen order of the sentences.
sen_freq_ran_dict = {
    sentence: senFrequency(sentence, word_freq_ransom)
    for sentence in dict.fromkeys(registry_values_ransom_list)
}

In [14]:
# Display the per-sentence frequency scores.
sen_freq_ran_dict

{'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\DnsCache\\Parameters': 0.004,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\Windows NT\\DnsClient': 0.037,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing': 0.055,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\System\\DNSClient': 0.038,
 'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\Tcpip\\Parameters': 0.004,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing\\RASMANCS': 0.044,
 'regkey_opened_HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\explorer\\FolderDescriptions\\{FDD39AD0-238F-46AF-ADB4-6C85480369C7}\\PropertyBag': 0.061,
 'regkey_opened_HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\explorer\\FolderDescriptions\\{E555AB60-153B-4D17-9F04-A5FE99FC15EC}': 0.067,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Windows\\CurrentVersion\\Explor

# Inverse document frequency

In [20]:
# IDF is normally computed over many documents; here, each sentence is its own document.
# IDF = log10(number of docs / number of docs the term appears in)
import math

def InverseDocumentFreq(document):
    """Compute the inverse document frequency of every word.

    Each sentence of ``document`` is treated as its own document, so
    ``IDF(word) = log10(num_docs / number of sentences containing word)``.
    The previous version divided by the total number of occurrences of the
    word, so a word repeated inside one sentence was over-counted and could
    even receive a negative IDF.

    Args:
        document: Sequence of sentences whose words are separated by
            backslashes.

    Returns:
        A dict ``{word: IDF rounded to 3 decimals}`` for every word that
        ``preprocess_string`` keeps, in first-seen order.
    """
    # Words that survive preprocessing (lowercased, stop words handled there).
    kept_words = set(preprocess_string(document))
    # The number of documents.
    num_docs = len(document)
    # Document frequency: every sentence contributes each of its distinct
    # words once (dict.fromkeys de-duplicates and keeps the order stable).
    doc_freq = countWords(
        word
        for sentence in document
        for word in dict.fromkeys(sentence.lower().split("\\"))
    )
    # One IDF value per distinct word instead of one per token.
    word_IDF = {}
    for word, containing_docs in doc_freq.items():
        if word in kept_words:
            word_IDF[word] = np.round(math.log10(num_docs / containing_docs), 3)

    return word_IDF

In [21]:
# Per-word IDF values computed over the ransom registry list.
word_IDF_ran = InverseDocumentFreq(registry_values_ransom_list)

In [22]:
# Display the per-word IDF values.
word_IDF_ran

{'regkey_opened_hkey_local_machine': 0.803,
 'system': 1.692,
 'currentcontrolset': 2.47,
 'services': 2.019,
 'dnscache': 3.473,
 'parameters': 2.363,
 'software': 0.022,
 'policies': 1.563,
 'microsoft': 0.061,
 'windows nt': 2.055,
 'dnsclient': 3.172,
 'tracing': 2.171,
 'tcpip': 2.891,
 'rasmancs': 2.439,
 'wow6432node': 0.153,
 'windows': 0.129,
 'currentversion': 0.131,
 'explorer': 0.173,
 'folderdescriptions': 0.215,
 '{fdd39ad0-238f-46af-adb4-6c85480369c7}': 2.019,
 'propertybag': 1.555,
 '{e555ab60-153b-4d17-9f04-a5fe99fc15ec}': 2.407,
 'usersfiles': 2.548,
 'namespace': 2.524,
 '{6f0cd92b-2e97-45d1-88ff-b0d186b8dedd}': 2.407,
 '{c4900540-2379-4c75-844b-64e6faf8716b}': 2.407,
 '{a302545d-deff-464b-abe8-61c8648d939b}': 2.407,
 '{b88f4daa-e7bd-49a9-b74d-02885a5dc765}': 2.407,
 'knownfoldersettings': 3.361,
 'regkey_opened_hkey_classes_root': 1.788,
 'directory': 2.215,
 '{724ef170-a42d-4fef-9f26-b60e846fba4f}': 2.407,
 '{a4115719-d62e-491d-aa7c-e74b8be3b067}': 2.407,
 '{7c5a40

In [23]:
# calculate the IDF for single sentence
def SentenceIDF(word_IDF, sentence):
    """Return the mean IDF of the distinct known words of a sentence.

    The sentence is lowercased and split on backslashes. Each distinct word
    that has an entry in ``word_IDF`` contributes its IDF once.

    Args:
        word_IDF: Mapping ``{word: IDF}`` with lowercase keys.
        sentence: String whose words are separated by backslashes.

    Returns:
        The average IDF of the matched words, rounded to 3 decimals, or 0.0
        when no word of the sentence is in ``word_IDF`` (this case used to
        raise ZeroDivisionError).
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; looking the
    # sentence's words up in the mapping avoids scanning the whole vocabulary.
    distinct_words = dict.fromkeys(sentence.lower().split('\\'))
    known_idfs = [word_IDF[word] for word in distinct_words if word in word_IDF]
    if not known_idfs:
        return np.round(0.0, 3)
    return np.round(sum(known_idfs) / len(known_idfs), 3)

In [24]:
# Calculate the IDF for all sentences in the ransom dataset.
# The list contains many duplicates, so each distinct sentence is scored
# only once; dict.fromkeys keeps the first-seen order of the sentences.
sen_IDF_ran_dict = {
    sentence: SentenceIDF(word_IDF_ran, sentence)
    for sentence in dict.fromkeys(registry_values_ransom_list)
}



In [26]:
# calculate the TF_IDF for sentences
def TF_IDF(sen_TF_dict, sen_IDF_dict):
    """Multiply each sentence's TF score by its IDF score.

    Args:
        sen_TF_dict: Mapping ``{sentence: TF score}``.
        sen_IDF_dict: Mapping ``{sentence: IDF score}``; it must contain
            every key of ``sen_TF_dict``.

    Returns:
        A dict ``{sentence: TF * IDF}`` with the keys of ``sen_TF_dict``.

    Raises:
        KeyError: If a key of ``sen_TF_dict`` is missing from
            ``sen_IDF_dict``.
    """
    return {
        sentence: tf_score * sen_IDF_dict[sentence]
        for sentence, tf_score in sen_TF_dict.items()
    }

In [27]:
# Combine the per-sentence TF and IDF scores into TF-IDF scores.
sen_TF_IDF = TF_IDF(sen_freq_ran_dict, sen_IDF_ran_dict)

In [28]:
# Display the per-sentence TF-IDF scores.
sen_TF_IDF 

{'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\DnsCache\\Parameters': 0.008548,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\Windows NT\\DnsClient': 0.047323,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing': 0.04202,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\System\\DNSClient': 0.046322,
 'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\Tcpip\\Parameters': 0.00816,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing\\RASMANCS': 0.048355999999999996,
 'regkey_opened_HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\explorer\\FolderDescriptions\\{FDD39AD0-238F-46AF-ADB4-6C85480369C7}\\PropertyBag': 0.032086,
 'regkey_opened_HKEY_LOCAL_MACHINE\\SOFTWARE\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\explorer\\FolderDescriptions\\{E555AB60-153B-4D17-9F04-A5FE99FC15EC}': 0.030485,
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsof

# Reference:
**Automated Behavioral Analysis of Malware: A Case Study of WannaCry Ransomware**