In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from statistics import mean

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier

In [6]:
import pickle
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [2]:
import mifs

In [4]:
# Directory the notebook works in; the data files opened below
# (all_analysis_data.txt, labels.csv) are resolved relative to it.
# A raw string keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\D" and "\M". The location can be
# overridden through the MALWARE_API_DATA_DIR environment variable so the
# notebook is not tied to one machine.
DATA_DIR = os.environ.get(
    'MALWARE_API_DATA_DIR',
    r'F:\Data\Malware\MalwareAnalysisAndData\Malware-API',
)
os.chdir(DATA_DIR)

In [6]:
# Load the raw corpus: one list entry per line of the file, with the
# trailing newline characters left in place.
with open('all_analysis_data.txt', mode='r') as corpus_file:
    api_calls = list(corpus_file)

In [7]:
# Collect one label per line of labels.csv, with newline characters removed.
class_labels = []
with open('labels.csv', mode='r') as labels_file:
    class_labels.extend(raw_line.replace('\n', '') for raw_line in labels_file)

In [5]:
# Count 1- to 10-grams, keeping only those whose document frequency lies
# between the min_df and max_df thresholds.
ngram_settings = dict(ngram_range=(1, 10), max_df=.97, min_df=.03)
counts_vectorizer = CountVectorizer(**ngram_settings)

# fit_transform yields a sparse matrix; it is densified because later cells
# index it column by column.
sparse_ngram_counts = counts_vectorizer.fit_transform(api_calls)
malware_counts_ngram = sparse_ngram_counts.todense()

In [6]:
# Feature (n-gram) names in column order of the count matrix.
# get_feature_names() no longer exists in recent scikit-learn releases,
# where get_feature_names_out() replaces it; use whichever this installation
# provides so the cell runs on both old and new versions.
_feature_name_getter = getattr(counts_vectorizer, 'get_feature_names_out', None)
if _feature_name_getter is None:
    _feature_name_getter = counts_vectorizer.get_feature_names
# list(...) keeps `features` a plain list of names, as before.
features = list(_feature_name_getter())

In [7]:
# Quick sanity check: matrix type and shape, vocabulary size, distinct labels.
sanity_summary = (
    type(malware_counts_ngram),
    malware_counts_ngram.shape,
    len(features),
    set(class_labels),
)
for summary_item in sanity_summary:
    print(summary_item)

<class 'numpy.matrix'>
(7107, 14666)
14666
{'Adware', 'Backdoor', 'Virus', 'Spyware', 'Trojan', 'Dropper', 'Worms', 'Downloader'}


In [None]:
# Fixed class order: it determines the order of the per-class counts and
# shares stored for every feature (and therefore the column order used when
# the result is turned into a DataFrame).
MALWARE_CLASSES = ['Trojan', 'Worms', 'Downloader', 'Virus',
                   'Backdoor', 'Dropper', 'Spyware', 'Adware']


def _feature_class_stats(counts, feature_names, labels, class_order=MALWARE_CLASSES):
    """Tabulate, per feature, how its matching samples split across classes.

    A sample "matches" a feature when its count for that feature equals 1.

    Parameters
    ----------
    counts : 2-D array-like, samples x features.
    feature_names : sequence with one name per column of ``counts``.
    labels : sequence with one class label per row of ``counts``.
    class_order : class names, in the order their statistics are emitted.

    Returns
    -------
    (stats, names)
        ``stats[name]`` is the list
        ``[count per class..., share per class..., num, term_frequency]``
        where ``num`` is the number of matching samples and
        ``term_frequency`` is ``num / len(labels)``; ``names`` holds the
        feature names in column order.

    Raises
    ------
    ValueError
        If there are fewer labels than rows in ``counts``.
    """
    counts = np.asarray(counts)
    n_rows = counts.shape[0]
    n_labels = len(labels)
    if n_labels < n_rows:
        raise ValueError(
            f'need one label per sample: got {n_labels} labels for {n_rows} rows')

    # One boolean mask per class, computed once and reused for every column.
    # Each mask is compared against the label of the *same row*; the previous
    # code compared the whole label list to a string, so every class except
    # Trojan was always counted as zero.
    label_array = np.asarray(labels[:n_rows])
    class_masks = [label_array == class_name for class_name in class_order]

    stats = {}
    names = []
    for col_index, col_name in zip(range(counts.shape[1]), feature_names):
        names.append(col_name)
        # NOTE(review): "== 1" ignores samples in which the n-gram occurs
        # more than once; confirm whether "> 0" (presence) was intended.
        hits = counts[:, col_index] == 1
        num = int(np.count_nonzero(hits))
        per_class = [int(np.count_nonzero(hits & mask)) for mask in class_masks]
        # A class with no hits gets share 0, which also covers num == 0.
        shares = [hit_count / num if hit_count else 0.0 for hit_count in per_class]
        stats[col_name] = per_class + shares + [num, num / n_labels]
    return stats, names


benign_neg_dict, name_list = _feature_class_stats(
    malware_counts_ngram, features, class_labels)

In [12]:
# Column labels: the 18 statistics stored per feature, followed by the
# feature name column that is appended below.
meta_columns = [
    'trojan', 'worms', 'downloader', 'virus', 'backdoor', 'dropper', 'spyware', 'adware',
    'percent_trojan', 'percent_worms', 'percent_downloader', 'percent_virus', 'percent_backdoor',
    'percent_dropper', 'percent_spyware', 'percent_adware', 'num', 'term_frequency', 'Api-Call',
]

# The dict maps feature name -> list of statistics, so the frame is built
# with features as columns and then transposed to get one row per feature.
meta_data = pd.DataFrame(benign_neg_dict).T.assign(**{'Api-Call': name_list})
meta_data.columns = meta_columns

In [8]:
# Re-weight the raw n-gram counts with TF-IDF.
# malware_counts_ngram is a numpy.matrix (result of .todense()); newer
# scikit-learn releases reject numpy.matrix input, so hand over a plain
# ndarray view of the same data instead.
tfidf_transformer = TfidfTransformer()
tfidf_api_calls = tfidf_transformer.fit_transform(np.asarray(malware_counts_ngram))

`TfidfVectorizer` exposes the inverse document frequency of every term, and is a quick way to rebuild the matrix for use in classification.

In [8]:
# Same n-gram range and document-frequency thresholds as the count
# vectorizer, so both produce the same vocabulary settings.
tfidf_settings = dict(ngram_range=(1, 10), max_df=.97, min_df=.03)
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)
tfidf_api_calls = tfidf_vectorizer.fit_transform(api_calls)

In [None]:
# Attach the fitted inverse document frequencies as an 'idf' column
# (one value per feature row).
meta_data = meta_data.assign(idf=tfidf_vectorizer.idf_)

In [15]:
# Persist the per-feature metadata table next to the input data.
meta_data_csv_path = 'meta_data_malwareAPI_10gram.csv'
meta_data.to_csv(meta_data_csv_path)

In [None]:
# One seed for every stochastic estimator so repeated runs give the same
# cross-validation accuracies.
SEED = 44
TREE_DEPTH = 10


def _make_tree():
    """Return a new depth-limited, seeded decision tree.

    A fresh instance is created on every call so the stand-alone tree and the
    two ensembles never share one estimator object.
    """
    return DecisionTreeClassifier(max_depth=TREE_DEPTH, random_state=SEED)


# Classifiers compared in the cross-validation loop.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=SEED),
    BernoulliNB(),
    MultinomialNB(),
    GaussianNB(),
    _make_tree(),
    AdaBoostClassifier(_make_tree(), random_state=SEED),
    BaggingClassifier(_make_tree(), random_state=SEED),
    KNeighborsClassifier(n_neighbors=5),
    MLPClassifier(solver='sgd', random_state=SEED),
]

# idfs and meta_name_list are walked in parallel: each idf threshold is paired
# with the tag used in the file names for that run. A threshold of None is
# tagged 'total'.
meta_name_list = ['total']
idfs = [None]
# Data set tag embedded in every output file name.
name = 'Malware_API_calls'

In [None]:
# Build and pickle the TF-IDF matrix (and its feature names) for every idf
# threshold. Each threshold is paired with its file-name tag from
# meta_name_list so that the files written here carry the same tag that the
# evaluation cell uses when it reads them back.
for idf, name_meta in zip(idfs, meta_name_list):

    # idfs uses None (not the string 'total') to mean "no idf filtering";
    # comparing the idf column against None is not meaningful, so that case
    # must be detected explicitly.
    use_full_vocabulary = idf is None
    if use_full_vocabulary:
        meta = meta_data
    else:
        meta = meta_data.loc[meta_data['idf'] > idf]
    # NOTE(review): `meta` is not used further down — the vectorizer below is
    # refit on the full corpus for every threshold. Confirm whether the
    # filtered rows were meant to restrict the vocabulary.

    count_vectorizer = CountVectorizer(ngram_range=(1, 10), max_df=.97, min_df=.03)
    counts_vectors = count_vectorizer.fit_transform(api_calls)
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # get_feature_names_out() when available and fall back otherwise.
    name_getter = getattr(count_vectorizer, 'get_feature_names_out', None)
    if name_getter is None:
        name_getter = count_vectorizer.get_feature_names
    counts_features = list(name_getter())

    with open(f'features_of_{name}_{name_meta}', 'wb') as out_features:
        pickle.dump(counts_features, out_features)

    tfidf_transformer = TfidfTransformer()
    tfidf_vectors = tfidf_transformer.fit_transform(counts_vectors).todense()
    if use_full_vocabulary:
        # The unfiltered run defines the idf value recorded for each feature.
        meta_data['idf'] = tfidf_transformer.idf_

    with open(f'tfidf_vectors_{name}_{name_meta}.pickle', 'wb') as out_matrix:
        pickle.dump(tfidf_vectors, out_matrix)

In [None]:
# Fresh, empty accumulators: a list of model names, five independent index
# buckets, and an empty results frame.
model_names = list()
idxs = [[] for _ in range(5)]
dataframe = pd.DataFrame()

In [None]:
# Run 5-fold cross-validation for every model on every pickled TF-IDF matrix
# and log the per-fold accuracies to Accuracies_<name>.txt.
for run_index, (idf, name_meta) in enumerate(zip(idfs, meta_name_list)):
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this notebook.
    with open(f'tfidf_vectors_{name}_{str(name_meta)}.pickle', 'rb') as in_matrix:
        # The matrix was pickled as a numpy.matrix; newer scikit-learn
        # releases reject that type, so convert to a plain ndarray.
        tfidf_vectors = np.asarray(pickle.load(in_matrix))

    # Start a fresh log only on the first pass. Re-opening with a truncating
    # mode on every pass would wipe the results of the earlier thresholds.
    header_mode = 'w' if run_index == 0 else 'a'
    with open(f'Accuracies_{name}.txt', header_mode) as out_file:
        if idf is None:
            out_file.write('5 Fold Cross Validation Acc for Malware API Calls')
            out_file.write('\nThese next models have all vocabulary\n')
        else:
            out_file.write(f'\nThese next models have idf > {idf}\n')

    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, tfidf_vectors, y=class_labels,
                                     scoring='accuracy', cv=5, n_jobs=-1)
        # Append after each model so partial results survive an interrupted run.
        with open(f'Accuracies_{name}.txt', 'a') as out_file:
            out_file.write(f'Using data from {name}\n')
            out_file.write(model_name)
            out_file.write(f'\n{accuracies}\n')

We now have a way to run the classifiers, but the features we are using are poor.
The feature selection and extraction process begins below.

In [38]:
# TODO:
# - Decision tree: fit it and look at the result
# - Random forest: fit it and look at the result
# - mRMR and recursive feature selection using a decision tree

DT (decision tree)

In [11]:
# Load a previously pickled decision tree and draw it.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# 'sklearn.tree._classes' — presumably the pickle was written by a different
# scikit-learn version than the one loading it; confirm and re-pickle.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
tree_pickle_path = 'DecisionTreeClassifier_total.pickle'
with open(tree_pickle_path, mode='rb') as tree_file:
    clf = pickle.load(tree_file)

tree_fig, tree_ax = plt.subplots()
plot_tree(clf, filled=True, ax=tree_ax)
plt.show()

ModuleNotFoundError: No module named 'sklearn.tree._classes'