# The realization of TF-IDF for sentences

**Each document is composed of API calls and registry operations.**

In [1]:
import pickle

from nltk.corpus import stopwords
import pandas as pd

In [2]:
# combine the title with the values
def splice_key_value(file):
    """Load a pickled mapping and flatten it into "<key>_<value>" strings.

    Args:
        file: path to a pickle file holding a dict whose values are
            iterables.

    Returns:
        A list with one "<key>_<value>" string per (key, value) pair, in the
        mapping's iteration order.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file, 'rb') as handle:
        values_by_key = pickle.load(handle, encoding="bytes")
    return [
        key + '_' + str(item)
        for key, items in values_by_key.items()
        for item in items
    ]

In [3]:
# ransomware samples: pickle path and the flattened "<key>_<value>" strings
registry_values_ransom_file = '../dataset/registry_values/registry_values_ransom.pkl'
registry_values_ransom_list = splice_key_value(registry_values_ransom_file)

# normal samples: pickle path and the flattened "<key>_<value>" strings
registry_values_normal_file = '../dataset/registry_values/registry_values_normal.pkl'
registry_values_normal_list = splice_key_value(registry_values_normal_file)

In [4]:
len(registry_values_ransom_list)

50515

In [5]:
len(set(registry_values_ransom_list))

6657

In [6]:
registry_values_ransom_list[:5]

['regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\DnsCache\\Parameters',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\Windows NT\\DnsClient',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Microsoft\\Tracing',
 'regkey_opened_HKEY_LOCAL_MACHINE\\Software\\Policies\\Microsoft\\System\\DNSClient',
 'regkey_opened_HKEY_LOCAL_MACHINE\\System\\CurrentControlSet\\Services\\Tcpip\\Parameters']

# Term Frequency

In [7]:
import numpy as np

# function to convert string to lower case and delete the stop words in document
def preprocess_string(document):
    """Lower-case every sentence, split it into words and drop stop words.

    Args:
        document: iterable of sentences (strings) whose words are separated
            by backslashes.

    Returns:
        A flat list of the lower-cased words of all sentences, in order, with
        every occurrence of an English stop word removed. Empty input yields
        an empty list.
    """
    stop_words = set(stopwords.words('english'))
    # Filtering while building the list removes *every* occurrence of every
    # stop word. The previous remove()/finally-return loop returned after
    # trying a single stop word and dropped at most one occurrence.
    return [
        word
        for registry_value in document
        for word in str(registry_value).lower().split("\\")
        if word not in stop_words
    ]


In [8]:
# here, we count how many times each word appears in the words array
# we do not use n-grams, because each token is a single string, not only a word
from collections import Counter

def countWords(words):
    """Count how many times each word appears in *words*.

    Args:
        words: iterable of hashable items (the words).

    Returns:
        A ``collections.Counter`` (dict subclass) of {WORD: COUNT}.
    """
    return Counter(words)


In [9]:
# calculate the frequency of term in a document
def termFrequency(document):
    """Compute the relative frequency of every word in *document*.

    Args:
        document: iterable of sentences, passed to ``preprocess_string``.

    Returns:
        dict of {word: count / total number of words}. Empty when the
        preprocessed document contains no words.
    """
    # get the words without stop words
    registry_words = preprocess_string(document)
    total_words = len(registry_words)
    if total_words == 0:
        # nothing to count; avoids dividing by zero below
        return {}
    # get the count for terms
    word_dict = countWords(registry_words)
    return {word: count / total_words for word, count in word_dict.items()}

calculate the TF for two datasets

In [10]:
# term frequencies of the ransomware and the normal registry sentences
word_freq_ransom = termFrequency(registry_values_ransom_list)
word_freq_normal = termFrequency(registry_values_normal_list)

In [11]:
# calculate the sentence importance
def senFrequency(sentence, word_freq):
    """Average the term frequencies of the distinct known words of a sentence.

    Args:
        sentence: backslash-separated string; it is lower-cased before the
            lookup.
        word_freq: dict of {word: term frequency}.

    Returns:
        The mean frequency of the sentence's distinct words that are present
        in *word_freq*, rounded to 3 decimals; 0.0 when none is present.
    """
    # distinct words of the sentence, in first-occurrence order
    words = dict.fromkeys(sentence.lower().split('\\'))
    # look the sentence's words up in the dict instead of scanning the
    # whole vocabulary for every sentence
    matched = [word_freq[word] for word in words if word in word_freq]
    if not matched:
        # no known word: avoid dividing by zero
        return np.round(0.0, 3)
    return np.round(sum(matched) / len(matched), 3)

In [12]:
# # generate the sen_freq dict with indexes as the keys
# sen_freq_ran_dict = {}
# for i, sentence in enumerate(registry_values_ransom_list): 
#     # input the lowercased sentence
    
#     sen_freq = senFrequency(sentence, word_freq_ransom)
#     sen_freq_ran_dict[str(i)] = sen_freq

In [None]:
# map every sentence string to its sentence-level term frequency
sen_freq_ran_dict = {
    sentence: senFrequency(sentence, word_freq_ransom)
    for sentence in registry_values_ransom_list
}
sen_freq_nor_dict = {
    sentence: senFrequency(sentence, word_freq_normal)
    for sentence in registry_values_normal_list
}

In [None]:
sen_freq_ran_dict

# Inverse document frequency

In [None]:
# IDF is computed over many documents; here, each sentence is its own document
# IDF = log10(number of docs / number of docs the term appears in)
import math

def InverseDocumentFreq(document):
    """Compute the inverse document frequency of every word in *document*.

    Each sentence of *document* is treated as its own document, so
    IDF(word) = log10(number of sentences / number of sentences containing
    the word), rounded to 3 decimals.

    Args:
        document: sequence of backslash-separated sentences (strings).

    Returns:
        dict of {word: IDF} for every word returned by ``preprocess_string``,
        in first-occurrence order. Empty when *document* is empty.
    """
    # the number of documents
    num_docs = len(document)
    if num_docs == 0:
        return {}
    # distinct words kept by the preprocessing, in first-occurrence order
    vocabulary = dict.fromkeys(preprocess_string(document))
    # Count each word at most once per sentence, so the counts are document
    # frequencies. Dividing by the total number of occurrences can exceed
    # num_docs and produce a negative IDF.
    words_per_sentence = []
    for sentence in document:
        words_per_sentence.extend(set(str(sentence).lower().split("\\")))
    doc_freq = countWords(words_per_sentence)
    # calculate the IDF once per distinct word
    word_IDF = {}
    for word in vocabulary:
        word_IDF[word] = np.round(math.log10(num_docs / doc_freq[word]), 3)

    return word_IDF

calculate the IDF for two datasets

In [None]:
# per-word IDF of the ransomware and the normal registry sentences
word_IDF_ran = InverseDocumentFreq(registry_values_ransom_list)
word_IDF_nor = InverseDocumentFreq(registry_values_normal_list)

In [None]:
word_IDF_ran

In [None]:
# calculate the IDF for single sentence
def SentenceIDF(word_IDF, sentence):
    """Average the IDF values of the distinct known words of a sentence.

    Args:
        word_IDF: dict of {word: IDF}.
        sentence: backslash-separated string; it is lower-cased before the
            lookup.

    Returns:
        The mean IDF of the sentence's distinct words that are present in
        *word_IDF*, rounded to 3 decimals; 0.0 when none is present.
    """
    # distinct words of the sentence, in first-occurrence order
    words = dict.fromkeys(sentence.lower().split('\\'))
    # look the sentence's words up in the dict instead of scanning the
    # whole vocabulary for every sentence
    matched = [word_IDF[word] for word in words if word in word_IDF]
    if not matched:
        # no known word: avoid dividing by zero
        return np.round(0.0, 3)
    return np.round(sum(matched) / len(matched), 3)

Calculate the IDF for all sentences in both datasets:

In [None]:
# map every sentence string to its sentence-level IDF
sen_IDF_ran_dict = {
    sentence: SentenceIDF(word_IDF_ran, sentence)
    for sentence in registry_values_ransom_list
}
sen_IDF_nor_dict = {
    sentence: SentenceIDF(word_IDF_nor, sentence)
    for sentence in registry_values_normal_list
}


In [None]:
# calculate the TF_IDF for sentences
def TF_IDF(sen_TF_dict, sen_IDF_dict):
    """Multiply the TF and IDF score of every sentence.

    Args:
        sen_TF_dict: dict of {sentence: TF score}.
        sen_IDF_dict: dict of {sentence: IDF score}; it must contain every
            key of *sen_TF_dict*.

    Returns:
        dict of {sentence: TF * IDF}, with the keys of *sen_TF_dict*.

    Raises:
        KeyError: if a key of *sen_TF_dict* is missing from *sen_IDF_dict*.
    """
    # the expression must stay on the assignment's line: a bare line break
    # after "=" is a syntax error
    return {key: tf * sen_IDF_dict[key] for key, tf in sen_TF_dict.items()}

calculate the sentence importance for two datasets

In [None]:
# per-sentence TF-IDF of the ransomware and the normal dataset
sen_ran_TF_IDF = TF_IDF(sen_freq_ran_dict, sen_IDF_ran_dict)
sen_nor_TF_IDF = TF_IDF(sen_freq_nor_dict, sen_IDF_nor_dict)

In [None]:
sen_ran_TF_IDF 

compare the importance ranking for two datasets

In [None]:
# create the rank dataframe for ransomware dataset
# one row per sentence: 'Index' is the position produced by reset_index,
# 'Features' is the sentence string (the dict key)
sen_ran_df = pd.Series(sen_ran_TF_IDF.keys()).reset_index()
sen_ran_df.columns = ['Index','Features']
# attach the TF-IDF score of each sentence
sen_ran_df['importance'] = pd.Series(sen_ran_TF_IDF.values())
# highest importance first, then number the rows 1..n as the rank
sen_ran_df = sen_ran_df.sort_values(by=['importance'], ascending=False)
sen_ran_df['rank'] = range(1,len(sen_ran_df)+1)

In [None]:
# create the rank dataframe for normal dataset
# one row per sentence: 'Index' is the position produced by reset_index,
# 'Features' is the sentence string (the dict key)
sen_nor_df = pd.Series(sen_nor_TF_IDF.keys()).reset_index()
sen_nor_df.columns = ['Index', 'Features']
# attach the TF-IDF score of each sentence
sen_nor_df['importance'] = pd.Series(sen_nor_TF_IDF.values())
# highest importance first, then number the rows 1..n as the rank
sen_nor_df = sen_nor_df.sort_values(by=['importance'], ascending=False)
sen_nor_df['rank'] = range(1, len(sen_nor_df)+1)

Merge two dataframe and compare the feature ranks

In [None]:
# Join the two rank tables on the sentence column. The column is named
# 'Features' (capital F) in both frames, so merging on 'features' raises
# KeyError. Columns present in both frames get the _x (ransomware) and
# _y (normal) suffixes.
feature_ranks = sen_ran_df.merge(sen_nor_df, on=['Features'])
# keep the sentence and its rank in each dataset
feature_ranks_com = feature_ranks[['Features', 'rank_x', 'rank_y']]
feature_ranks_com

# Reference:
**Automated Behavioral Analysis of Malware A Case Study of WannaCry Ransomware**