In [0]:
import os
import pandas as pd
import re
import pickle
from joblib import dump, load
import nltk
# nltk.download('punkt') # Uncomment this line to download the nltk punkt resource.
from statistics import mean

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import auc
import pprint

# Work from the project folder on Google Drive.
# NOTE(review): this relative path only resolves after Drive has been mounted and
# while the working directory is still the mount's parent, so re-running this cell
# in the same session will fail -- confirm the intended cell order.
os.chdir('drive/My Drive/malware')
#print(os.getcwd())

# labels.csv holds one class label per line; strip the newline from each.
with open('labels.csv', 'r') as data_labels:
    class_labels = [line.replace('\n', '') for line in data_labels]

# joblib.load unpickles arbitrary objects: only load files from a trusted source.
# Assumed to be a sparse feature matrix with one row per label (x_test is densified
# further down) -- TODO confirm.
tfidf_api_calls = load('tfidf_vectors.joblib')

# Fixed 80/20 split; random_state keeps the held-out set identical across runs.
x_train, x_test, y_train, y_test = train_test_split(tfidf_api_calls,
                                                    class_labels,
                                                    test_size=.20,
                                                    random_state = 44)

# Estimator prototypes. In the plotting cells that follow, only each entry's class
# name is used, to build the filename of an already fitted model on disk.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=44),
    BernoulliNB(),
    MultinomialNB(),
    GaussianNB(),
    DecisionTreeClassifier(max_depth=10),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=10)),
    BaggingClassifier(DecisionTreeClassifier(max_depth=10)),
    KNeighborsClassifier(n_neighbors=5),
    MLPClassifier(solver='sgd', random_state=44)
]

# Unique labels in a deterministic order. A bare list(set(...)) changes order
# between kernel sessions (string hashing is randomized), which made class_num and
# the printed progress indices refer to different classes on every run.
set_class_labels = sorted(set(class_labels))
meta_name_list = ['total']
idf = None
name = 'Malware_API_calls'

from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
# Presumably (title prefix, normalize option) pairs for confusion-matrix plots, as in
# the example linked above; not used by the plotting cells visible here.
titles = [("CM_Normalized ", 'true'), ("CM", None)]
import csv

# Densify the held-out features once, up front, as a plain ndarray.
# .toarray() is used instead of .todense(): the latter returns numpy.matrix, which
# NumPy discourages and newer scikit-learn releases refuse as estimator input.
# Presumably needed because some of the estimators do not accept sparse input.
x_test = x_test.toarray()

def not_class_label(class_label, class_label_list):
  """Binarize a label sequence against a single class (one-vs-rest).

  Args:
    class_label: the label to keep.
    class_label_list: iterable of labels to relabel.

  Returns:
    A new list of the same length in which every entry equal to
    ``class_label`` is kept as-is and every other entry is replaced
    by the integer 0.
  """
  relabeled = []
  for candidate in class_label_list:
    if candidate == class_label:
      relabeled.append(candidate)
    else:
      relabeled.append(0)
  return relabeled

In [0]:
# Mount Google Drive at /content/drive using the Colab helper.
# NOTE(review): the first cell does os.chdir('drive/My Drive/malware') and reads its
# inputs from there, so this mount presumably has to run before that cell on a fresh
# kernel -- confirm, and consider moving this cell to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# One ROC figure per class label, overlaying the curve of every model type.
# NOTE(review): y_test is passed here as-is, whereas the precision-recall cells
# binarize it with not_class_label first -- confirm this difference is intended.
for class_num, class_label in enumerate(set_class_labels):

  # None lets the first plot_roc_curve call create the figure; every later call
  # for this class draws onto the same axes.
  ax = None

  for model in models:
    model_name = model.__class__.__name__
    curve_tag = model_name + '_' + class_label

    # Fitted estimator stored as "<ModelClass>_<label>.joblib" in the working directory.
    model_t = load(curve_tag + '.joblib')

    disp = plot_roc_curve(model_t, x_test, y_test, name = curve_tag, ax = ax)
    ax = plt.gca()

  plt.legend(fontsize = 'x-small')
  plt.savefig('ROC/labels_models_t/' + class_label + '_roc.png')
  plt.close()  # release the figure before starting the next class
  print(class_num)

In [0]:
# One precision-recall figure per class label, overlaying the curve of every model type.
for class_num, class_label in enumerate(set_class_labels):

  # One-vs-rest ground truth for this class (every other label becomes 0).
  # It depends only on the class, so it is built once per figure rather than once
  # per model. The previous per-model relabeling of y_train was never used and has
  # been removed.
  temp_y_test = not_class_label(class_label, y_test)

  # None lets the first plot call create the figure; later calls reuse its axes.
  ax = None

  for model in models:
    model_name = model.__class__.__name__
    curve_tag = model_name + '_' + class_label

    # Fitted estimator stored as "<ModelClass>_<label>.joblib" in the working directory.
    model_t = load(curve_tag + '.joblib')

    plot_precision_recall_curve(model_t, x_test, temp_y_test,
                                name = curve_tag,
                                ax = ax)
    ax = plt.gca()

  plt.legend(fontsize = 'xx-small')
  plt.savefig('PRCurve/labels_models/' + class_label + '_pr.png')
  plt.close()  # release the figure before starting the next class
  print(class_num)

0
1
2
3
4
5
6
7


In [0]:
# One precision-recall figure per model type, overlaying the curve of every class label.

# One-vs-rest ground truth per class (every other label becomes 0). It does not
# depend on the model, so build it once here instead of once per (model, class) pair.
# The previous relabeling of y_train inside the loop was never used and has been removed.
binarized_y_test = {
    class_label: not_class_label(class_label, y_test)
    for class_label in set_class_labels
}

for model in models:
  model_name = model.__class__.__name__

  # None lets the first plot call create the figure; later calls reuse its axes.
  ax = None

  for class_label in set_class_labels:
    curve_tag = model_name + '_' + class_label

    # Fitted estimator stored as "<ModelClass>_<label>.joblib" in the working directory.
    model_t = load(curve_tag + '.joblib')
    temp_y_test = binarized_y_test[class_label]

    plot_precision_recall_curve(model_t, x_test, temp_y_test,
                                name = curve_tag,
                                ax = ax)
    ax = plt.gca()

  plt.legend(fontsize = 'xx-small')
  plt.savefig('PRCurve/models_labels/' + model_name + '_pr.png')
  plt.close()  # release the figure before starting the next model
  print(model)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=44, verbose=0,
                       warm_start=False)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
GaussianNB(priors=None, var_smoothing=1e-09)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_s