In [0]:
import os
import pandas as pd
import re
import pickle
from joblib import dump, load

In [0]:
import nltk
# nltk.download('punkt') # Uncomment this line to download the nltk punkt resource.

In [0]:
from statistics import mean

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pprint

In [0]:
# Install the `mifs` package with pip (no version is pinned).
# NOTE(review): no cell visible in this notebook imports `mifs` — confirm it is still needed.
!pip install mifs

Import the data that is being used for the project from within google drive.

In [0]:
# Mount Google Drive at /content/drive so the project files can be read from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Work from the project folder on the mounted Drive. The absolute path keeps
# this cell idempotent: a relative path only resolves on the first run, before
# the working directory has been changed.
os.chdir('/content/drive/My Drive/malware')
print(os.getcwd())

/content/drive/My Drive/malware


In [0]:
# Load the raw API-call corpus: one list entry per line of the file (newlines kept).
with open('all_analysis_data.txt', 'r') as corpus_file:
    api_calls = list(corpus_file)

In [0]:
# Read one class label per line, stripping the newline characters.
with open('labels.csv', 'r') as labels_file:
    class_labels = [raw_label.replace('\n', '') for raw_label in labels_file]

In [0]:
# Count n-grams of length 1-10 over the API-call lines, keeping only terms whose
# document frequency lies between 3% and 97% (min_df / max_df given as proportions).
counts_vectorizer = CountVectorizer(ngram_range=(1,10), max_df=.97, min_df=.03)
# .todense() materialises the sparse count matrix as a dense matrix in memory.
malware_counts_ngram = counts_vectorizer.fit_transform(api_calls).todense()

In [0]:
# Vocabulary of the fitted vectorizer, in column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back otherwise.
if hasattr(counts_vectorizer, 'get_feature_names_out'):
    features = list(counts_vectorizer.get_feature_names_out())
else:
    features = counts_vectorizer.get_feature_names()

In [0]:
# Persist the n-gram vocabulary to disk with joblib.
dump(features, 'features.joblib')

['features.joblib']

In [0]:
# Persist the dense n-gram count matrix to disk with joblib.
dump(malware_counts_ngram, 'counts_matrix.joblib')

['counts_matrix.joblib']

In [0]:
# Sanity check of the vectorised data: matrix type and shape, vocabulary size, distinct labels.
for summary_value in (
    type(malware_counts_ngram),
    malware_counts_ngram.shape,
    len(features),
    set(class_labels),
):
    print(summary_value)

In [0]:
# Re-weight the raw n-gram counts with TF-IDF (default TfidfTransformer settings).
tfidf_transformer = TfidfTransformer()
tfidf_api_calls = tfidf_transformer.fit_transform(malware_counts_ngram)

In [0]:
# Persist the TF-IDF matrix to disk with joblib.
dump(tfidf_api_calls, 'tfidf_vectors.joblib')

['tfidf_vectors.joblib']

In [0]:
# Reload the TF-IDF matrix from disk — presumably so later cells can run without
# recomputing the vectorisation cells above.
tfidf_api_calls = load('tfidf_vectors.joblib')

In [0]:
# Per-feature class statistics.
# For every n-gram column, count the samples in which the n-gram occurs exactly
# once, split those samples by malware class, and store
#   [8 class counts, 8 class shares, num, term_frequency]
# under the n-gram's name. The class order below fixes the column order.
# NOTE(review): the `== 1` test ignores samples where the n-gram occurs more
# than once — confirm that "exactly once" rather than "at least once" is intended.
malware_classes = ['Trojan', 'Worms', 'Downloader', 'Virus',
                   'Backdoor', 'Dropper', 'Spyware', 'Adware']

benign_neg_dict = dict()
name_list = list()
for col_name, col_index in zip(features, range(malware_counts_ngram.shape[1])):
    name_list.append(col_name)
    col_data = malware_counts_ngram[:, col_index]

    class_counts = dict.fromkeys(malware_classes, 0)
    num = 0
    for i in range(col_data.shape[0]):
        if col_data[i] == 1:
            num += 1
            # Compare the label of *this sample*; comparing the whole
            # class_labels list to a string is always False.
            sample_label = class_labels[i]
            if sample_label in class_counts:
                class_counts[sample_label] += 1

    counts = [class_counts[class_name] for class_name in malware_classes]
    # Share of the matching samples per class; 0 when the class never matches
    # (which also covers num == 0).
    percents = [float(count / num) if count != 0 else 0 for count in counts]

    term_frequency = num / len(class_labels)
    benign_neg_dict[col_name] = counts + percents + [num, term_frequency]

In [0]:
# Turn the per-feature statistics into a table: one row per n-gram, one column per statistic.
stat_columns = [
    'trojan', 'worms', 'downloader', 'virus', 'backdoor', 'dropper', 'spyware', 'adware',
    'percent_trojan', 'percent_worms', 'percent_downloader', 'percent_virus', 'percent_backdoor',
    'percent_dropper', 'percent_spyware', 'percent_adware', 'num', 'term_frequency',
]
meta_data = pd.DataFrame(benign_neg_dict).T
meta_data.columns = stat_columns
# The n-gram text is also kept as a regular column, appended last.
meta_data['Api-Call'] = name_list

In [0]:
def vocab_prune(meta, min_dist_away=.05):
    """Collect the names of terms whose positive percentage is close to 50%.

    Args:
        meta: table with a 'positive_percent' column and a 'name' column.
        min_dist_away: a term is selected when |positive_percent - 0.5| is
            strictly below this value.

    Returns:
        set of the selected values from the 'name' column.
    """
    shares = meta['positive_percent'].tolist()
    terms = meta['name'].tolist()
    return {
        term
        for share, term in zip(shares, terms)
        if abs(share - .5) < min_dist_away
    }

Create some aggregate fields for each "not <classification label>" case.  This is to find features that are specific to each of the malware classifications.

In [0]:
# For every class, count the samples containing the term that do NOT belong to
# that class: not_<class> = num - <class>. Each column is derived from its own
# class count (not_spyware from spyware, not_dropper from dropper).
# not_spyware is created before not_dropper to keep the existing column order.
for class_column in ('trojan', 'worms', 'downloader', 'virus', 'backdoor',
                     'spyware', 'dropper', 'adware'):
    meta_data['not_' + class_column] = meta_data['num'] - meta_data[class_column]

In [0]:
# Replace the in-memory table with the copy stored in the CSV file.
# NOTE(review): this cell sits before the to_csv cell below, so on a fresh kernel it
# needs a CSV from an earlier session and discards the meta_data computed above.
meta_data = pd.read_csv('meta_data_malwareAPI_10gram.csv')

In [0]:
# Preview the first rows of the metadata table.
print(meta_data.head())

In [0]:
# Save the metadata table. The index is not written: the n-gram text is already
# stored in a column, and writing the index would add one more 'Unnamed: 0'
# column every time the file is read back and saved again.
meta_data.to_csv('meta_data_malwareAPI_10gram.csv', index=False)

We got to the new stuff huzzah.

In [0]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(tfidf_api_calls,
                                                    class_labels,
                                                    test_size=.20,
                                                    random_state = 44)

In [0]:
# Candidate classifiers. Every estimator with a random component is seeded with
# the same random_state so repeated runs produce the same fits.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=5, random_state=44),
    BernoulliNB(),
    MultinomialNB(),
    GaussianNB(),
    DecisionTreeClassifier(max_depth=10, random_state=44),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=10, random_state=44), random_state=44),
    BaggingClassifier(DecisionTreeClassifier(max_depth=10, random_state=44), random_state=44),
    KNeighborsClassifier(n_neighbors=5),
    MLPClassifier(solver='sgd', random_state=44)
]
# Sorted so the label order (loop order, confusion-matrix axes) is the same in
# every session; the iteration order of a set of strings is not.
set_class_labels = sorted(set(class_labels))
meta_name_list = ['total']
idf = None
# Run name interpolated into the output file names.
name = 'Malware_API_calls'

In [0]:
# Second feature pipeline: 1-6 n-grams -> raw counts -> TF-IDF, with the
# vocabulary and the dense TF-IDF matrix pickled to disk.
meta = meta_data

count_vectorizer = CountVectorizer(ngram_range=(1,6), max_df=.97, min_df=.03)
counts_vectors = count_vectorizer.fit_transform(api_calls)
counts_features = count_vectorizer.get_feature_names()

tfidf_transformer = TfidfTransformer()
tfidf_vectors = tfidf_transformer.fit_transform(counts_vectors).todense()

# File names embed the run name and the idf setting.
with open(f'features_of_{name}_{str(idf)}', 'wb') as features_file:
    pickle.dump(counts_features, features_file)

with open(f'tfidf_vectors_{name}_{str(idf)}.pickle', 'wb') as matrix_file:
    pickle.dump(tfidf_vectors, matrix_file)

In [0]:
def not_class_label(class_label, class_label_list):
    """Binarise labels one-vs-rest.

    Entries equal to class_label are kept as they are; every other entry is
    replaced by the integer 0. Returns a new list of the same length.
    """
    binarised = []
    for label in class_label_list:
        binarised.append(label if label == class_label else 0)
    return binarised

In [0]:
# Show the distinct class labels.
print(set(class_labels))

binary

In [0]:
# Start a fresh results file with a header line.
# NOTE(review): the header goes to the *_total.txt file, while the per-label
# reports below are appended to *_binary.txt, which is never truncated and so
# accumulates across re-runs — confirm this split is intended.
with open(f'Accuracies_{name}_total.txt', 'w+') as out_file:
    # Trailing newline keeps the first appended report off the header line.
    out_file.write(f'Scores of tested sklearn classifiers.\n')

run_number = 0
save_dict = {}

# Densify once: the feature matrices do not change between fits. toarray()
# returns a plain ndarray, whereas todense() returns np.matrix, which recent
# scikit-learn versions reject.
x_train_dense = x_train.toarray()
x_test_dense = x_test.toarray()

# One-vs-rest evaluation: every model is refitted once per class label.
for label in set(class_labels):
    temp_labels = not_class_label(label, y_train)
    temp_y_test = not_class_label(label, y_test)

    for model in models:
        model_name = model.__class__.__name__

        model.fit(x_train_dense, temp_labels)
        pred = model.predict(x_test_dense)

        summary = classification_report(temp_y_test, pred, output_dict=True)
        save_dict[str(run_number)] = summary
        run_number += 1

        # The with-block closes the file; no explicit close() is needed.
        with open(f'Accuracies_{name}_binary.txt', 'a+') as out_file:
            out_file.write(f'Binary class {label}\n')
            out_file.write(f'{model_name}\n')
            pprint.pprint(summary, stream=out_file)
        # Saved as <ModelClass>_<label>.joblib; the plotting cells reload by this name.
        dump(model, f'{model_name}_{label}.joblib')

norms

In [0]:
# Multi-class evaluation: fit every model on the original labels, append its
# classification report to the *_total.txt file and save the fitted model.

# Densify once (loop-invariant). toarray() returns a plain ndarray, whereas
# todense() returns np.matrix, which recent scikit-learn versions reject.
x_train_dense = x_train.toarray()
x_test_dense = x_test.toarray()

for model in models:
    model_name = model.__class__.__name__

    model.fit(x_train_dense, y_train)
    pred = model.predict(x_test_dense)

    summary = classification_report(y_test, pred, output_dict=True)
    # save_dict / run_number continue from the one-vs-rest cell above.
    save_dict[str(run_number)] = summary
    run_number += 1

    # The with-block closes the file; no explicit close() is needed.
    with open(f'Accuracies_{name}_total.txt', 'a+') as out_file:
        out_file.write(f'Classification on all labels.\n')
        out_file.write(f'{model_name}\n')
        pprint.pprint(summary, stream=out_file)
    # Saved as <ModelClass>.joblib; the heatmap cell reloads by this name.
    dump(model, f'{model_name}.joblib')

Don't use

In [0]:
# Older variant of the multi-class evaluation: writes every model's report to
# Accuracies_<name>.txt and does not save the fitted models.
with open(f'Accuracies_{name}.txt', 'w+') as out_file:
    out_file.write(f'Adaboost classification report')
    out_file.write(f'\nThese next models have all vocabulary\n')

# Densify once (loop-invariant). toarray() returns a plain ndarray, whereas
# todense() returns np.matrix, which recent scikit-learn versions reject.
x_train_dense = x_train.toarray()
x_test_dense = x_test.toarray()

for model in models:
    model_name = model.__class__.__name__

    model.fit(x_train_dense, y_train)
    pred = model.predict(x_test_dense)

    summary = classification_report(y_test, pred, output_dict=True)

    # The with-block closes the file; no explicit close() is needed.
    with open(f'Accuracies_{name}.txt', 'a+') as out_file:
        out_file.write(f'Using data from {name}\n')
        out_file.write(f'{model_name}\n')
        pprint.pprint(summary, stream=out_file)

This was just for reformatting the files, since I made a mistake: I did not place these results within a CSV file.

In [0]:
# Append Accuracies.txt to the per-run file, starting a new line before every 'U'.
with open('Accuracies.txt', 'r') as in_file, open(f'Accuracies_{name}.txt', 'a') as out_file:
    for line in in_file:
        out_file.write(line.replace('U', '\nU'))

In [0]:
# Append the per-run file to Accuracies.txt, leaving out every line that starts with 'U'.
with open(f'Accuracies_{name}.txt', 'r') as in_file, open('Accuracies.txt', 'a') as out_file:
    for line in in_file:
        if not re.match(r'U', line):
            out_file.write(line)

In [0]:
# Split every non-'Mean' line of Accuracies.txt on four spaces into a leading
# entry name and the text that follows it.
# NOTE(review): out_file is opened but nothing is written to it — this cell
# looks unfinished.
with open(f'Accuracies.txt', 'r') as in_file:
  with open(f'Accuracies_{name}.txt', 'a+') as out_file:
    for line in in_file.readlines():
      if re.match(r'Mean', line):
        continue
      temp = line.split('    ')
      # Deliberately not called `name`: assigning to `name` here would overwrite
      # the run name that other cells interpolate into their output paths.
      entry_name = temp[0]
      acc_list = temp[1]

We are going to use mRMR for feature selection of the n-grams.  Oh boy, this is rough: let's use mifs on your own computer and run the decision trees here instead.  Let's not talk about it.

fuck

In [0]:
# Larger forest (1000 trees, depth up to 20) using all CPU cores (n_jobs=-1).
# It is only constructed here, not fitted, and no random_state is set.
rand_forest = RandomForestClassifier(n_estimators=1000, max_depth=20, n_jobs=-1)

Let's make some confusion matrix and save them in a pics folder.  Also try to save them in csv and display as a heat map in tab.

In [0]:
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
import matplotlib.pyplot as plt
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
# (file-name prefix, `normalize` argument) pairs consumed by the confusion-matrix cells.
titles = [("CM_Normalized ", 'true'), ("CM", None)]
import csv
from sklearn.metrics import confusion_matrix

In [0]:
# Confusion-matrix heatmaps (normalised and raw) for every saved multi-class model.
import seaborn as sns  # imported here because the notebook otherwise first imports seaborn in a later cell

# savefig does not create missing directories.
os.makedirs('pics', exist_ok=True)

# Some estimators (e.g. GaussianNB) reject sparse input; densify if x_test is still sparse.
x_test_eval = x_test.toarray() if hasattr(x_test, 'toarray') else x_test

for model in models:
    model_name = model.__class__.__name__

    # Evaluate the fitted model stored on disk, not the in-memory instance.
    fitted_model = load(model_name + '.joblib')
    # Predictions do not depend on the normalisation mode: compute them once per model.
    y_pred = fitted_model.predict(x_test_eval)

    for title, normalize in titles:
        disp = confusion_matrix(y_test, y_pred,
                                labels = set_class_labels,
                                normalize = normalize)

        temp_df = pd.DataFrame(disp, index = set_class_labels, columns = set_class_labels)

        ax = sns.heatmap(temp_df, annot=True, linewidths=.05)

        plt.savefig('pics/' + model_name + '_' + title + '_sns.png')
        plt.close()


In [0]:
# Confusion-matrix plots (normalised and raw) for every saved one-vs-rest model.

# savefig does not create missing directories.
os.makedirs('te_pics', exist_ok=True)

# Some estimators (e.g. GaussianNB) reject sparse input; densify if x_test is still sparse.
x_test_eval = x_test.toarray() if hasattr(x_test, 'toarray') else x_test

for model in models:
    model_name = model.__class__.__name__

    for class_name in set_class_labels:
        # Load each binary model once and reuse it for both title variants.
        binary_model = load(model_name + '_' + class_name + '.joblib')
        temp_y_test = not_class_label(class_name, y_test)

        for title, normalize in titles:
            # Tick labels come from the model's own class order; a set has no
            # defined order and could label the axes the wrong way round.
            disp = plot_confusion_matrix(binary_model, x_test_eval, temp_y_test,
                                         display_labels = binary_model.classes_,
                                         cmap = plt.cm.Blues,
                                         normalize = normalize)

            disp.ax_.set_title(title + '_' + class_name)

            plt.savefig('te_pics/' + model_name + '_' + title + '_' + class_name +'.png')
            plt.close()

ROC curves

curve for each model for each binary

In [0]:
# Show the class-label list that the per-class loops iterate over.
print(set_class_labels)

In [0]:
# One ROC figure per model type, overlaying the curves of its one-vs-rest classifiers.

# savefig does not create missing directories.
os.makedirs('ROC/models_labels_t', exist_ok=True)

# Some estimators (e.g. GaussianNB) reject sparse input; densify if x_test is still sparse.
x_test_eval = x_test.toarray() if hasattr(x_test, 'toarray') else x_test

for model in models:

    model_name = model.__class__.__name__
    ax = None  # the first curve creates the axes, later curves reuse them

    for class_num in range(len(set_class_labels)):
        model_t = load(model_name + '_' + set_class_labels[class_num] + '.joblib')

        disp = plot_roc_curve(model_t, x_test_eval, y_test,
                              name = model_name + '_' + set_class_labels[class_num],
                              ax = ax)
        ax = disp.ax_

    plt.legend(fontsize = 'x-small')
    plt.savefig('ROC/models_labels_t/' + model_name + '_roc.png')
    plt.close()

In [0]:
# One ROC figure per class label, overlaying the curves of every model type.

# savefig does not create missing directories.
os.makedirs('ROC/labels_models_t', exist_ok=True)

# Some estimators (e.g. GaussianNB) reject sparse input; densify if x_test is still sparse.
x_test_eval = x_test.toarray() if hasattr(x_test, 'toarray') else x_test

for class_num in range(len(set_class_labels)):

    ax = None  # the first curve creates the axes, later curves reuse them

    for model in models:
        model_name = model.__class__.__name__

        model_t = load(model_name + '_' + set_class_labels[class_num] + '.joblib')

        disp = plot_roc_curve(model_t, x_test_eval, y_test,
                              name = model_name + '_' + set_class_labels[class_num],
                              ax = ax)
        ax = disp.ax_

    plt.legend(fontsize = 'x-small')
    plt.savefig('ROC/labels_models_t/' + set_class_labels[class_num] + '_roc.png')
    plt.close()


Let's make some precision-recall curves now.

In [0]:
import os
import pandas as pd
import re
import pickle
from joblib import dump, load
import nltk
# nltk.download('punkt') # Uncomment this line to download the nltk punkt resource.
from statistics import mean

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pprint

# Session bootstrap: rebuilds the state the plotting/summary cells below need
# (labels, TF-IDF matrix, split, model list) from files on disk, without
# re-running the vectorisation and training cells.
# NOTE(review): the relative path only resolves while the working directory is
# still the folder that contains 'drive'.
os.chdir('drive/My Drive/malware')
#print(os.getcwd())

# One class label per line, newline characters removed.
class_labels = []
with open('labels.csv', 'r') as data_labels:
    for line in data_labels.readlines():
        class_labels.append(line.replace('\n', ''))

tfidf_api_calls = load('tfidf_vectors.joblib')

# Same definition as the not_class_label cell earlier in the notebook:
# keeps entries equal to class_label and replaces all others with 0.
def not_class_label(class_label, class_label_list):
  return [label if label == class_label else 0 for label in class_label_list]

# Same test_size and random_state as the split cell earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(tfidf_api_calls,
                                                    class_labels,
                                                    test_size=.20,
                                                    random_state = 44)

# Only the uncommented estimators are processed by the cells below; the entries
# are used for their class names when loading the saved .joblib files.
# NOTE(review): CategoricalNB (commented out) is not imported anywhere in this notebook.
models = [
    #RandomForestClassifier(n_estimators=200, max_depth=5, random_state=44),
    #BernoulliNB(),
    #MultinomialNB(),
    #GaussianNB(),
    #CategoricalNB(),
    #DecisionTreeClassifier(max_depth=10),
    AdaBoostClassifier(DecisionTreeClassifier(max_depth=10)),
    #BaggingClassifier(DecisionTreeClassifier(max_depth=10)),
    #KNeighborsClassifier(n_neighbors=5),
    #MLPClassifier(solver='sgd', random_state=44)
]
set_class_labels = list(set(class_labels))
meta_name_list = ['total']
idf = None
name = 'Malware_API_calls'

from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
# (file-name prefix, `normalize` argument) pairs for confusion-matrix plots.
titles = [("CM_Normalized ", 'true'), ("CM", None)]
import csv

# From here on x_test is dense; the cells below pass it to the models unchanged.
x_test = x_test.todense()

import seaborn as sns

  import pandas.util.testing as tm


In [0]:
# One precision-recall figure per class label, overlaying the curve of every
# model's one-vs-rest classifier for that label (same layout as the ROC
# per-label cell).

# savefig does not create missing directories.
os.makedirs('PRCurve/labels_models', exist_ok=True)

for class_num in range(len(set_class_labels)):
    class_name = set_class_labels[class_num]
    # The saved models are binary (class vs. 0), so score them against the
    # binarised test labels.
    binary_y_test = not_class_label(class_name, y_test)

    ax = None  # the first curve creates the axes, later curves reuse them

    for model in models:
        model_name = model.__class__.__name__

        model_t = load(model_name + '_' + class_name + '.joblib')

        lin = plot_precision_recall_curve(model_t, x_test, binary_y_test,
                                          name = model_name + '_' + class_name,
                                          ax = ax)
        ax = lin.ax_

    plt.legend(fontsize = 'x-small')
    plt.savefig('PRCurve/labels_models/' + class_name + '_pr.png')
    plt.close()
    print(class_num)

In [0]:
# One precision-recall figure per model type, overlaying the curves of its
# one-vs-rest classifiers.

# savefig does not create missing directories.
os.makedirs('PRCurve/models_labels', exist_ok=True)

for model in models:
    model_name = model.__class__.__name__

    # Share one axes object across the classes; without it every call opens a
    # new figure and only the last curve ends up in the saved image.
    ax = None

    for class_num in range(len(set_class_labels)):
        class_name = set_class_labels[class_num]
        model_t = load(model_name + '_' + class_name + '.joblib')

        # The saved models are binary (class vs. 0), so score them against the
        # binarised test labels rather than the multi-class y_test.
        binary_y_test = not_class_label(class_name, y_test)

        lin = plot_precision_recall_curve(model_t, x_test, binary_y_test,
                                          name = model_name + '_' + class_name,
                                          ax = ax)
        ax = lin.ax_

    plt.legend(fontsize = 'x-small')
    plt.savefig('PRCurve/models_labels/' + model_name + '_pr.png')
    plt.close()
    print(model)

summary of each model

col : model_name_id, class_used_for, model_name, summary information

In [0]:
# Column layout of the per-(model, class) summary table.
summary_cols = ['id', 'class_label', 'model_name', 'acc',
                'weighted avg precision',
                'weighted avg recall',
                'weighted avg f1-score',
                'neg precision', 'neg recall', 'neg f1-score',
                'pos precision', 'pos recall', 'pos f1-score']

report_metrics = ('precision', 'recall', 'f1-score')
summary_list = []

for model in models:
    model_name = model.__class__.__name__

    for class_num, class_name in enumerate(set_class_labels):
        model_t = load(model_name + '_' + class_name + '.joblib')

        temp_labels = not_class_label(class_name, y_train)
        temp_y_test = not_class_label(class_name, y_test)

        pred = model_t.predict(x_test)
        summary = classification_report(temp_y_test, pred, output_dict=True)

        # Row layout: id, class, model, accuracy, then precision/recall/f1 for the
        # weighted average, the negative ('0') class and the positive class.
        row = [model_name + '_' + class_name, class_name, model_name, summary['accuracy']]
        for section in ('weighted avg', '0', class_name):
            row.extend(summary[section][metric] for metric in report_metrics)
        summary_list.append(row)

summary_df = pd.DataFrame(summary_list, columns = summary_cols)


In [0]:
# NOTE(review): this cell repeats the previous cell verbatim; running it simply
# rebuilds the same summary table.
# Column layout of the per-(model, class) summary table.
summary_cols = ['id', 'class_label', 'model_name', 'acc',
                'weighted avg precision',
                'weighted avg recall',
                'weighted avg f1-score',
               'neg precision', 'neg recall', 'neg f1-score',
               'pos precision', 'pos recall', 'pos f1-score']

summary_list = []

for model in models:
  model_name = model.__class__.__name__

  for class_num in range(len(set_class_labels)):
    # Load the one-vs-rest model saved for this model type and class.
    model_t = load(model_name + '_' + set_class_labels[class_num] + '.joblib')

    # Binarised labels: the class itself vs. 0 (temp_labels is not used below).
    temp_labels = not_class_label(set_class_labels[class_num], y_train)
    temp_y_test = not_class_label(set_class_labels[class_num], y_test)

    pred = model_t.predict(x_test)
    summary = classification_report(temp_y_test, pred, output_dict=True)

    # One row per (model, class); the '0' entry of the report is the negative class.
    summary_list.append([model_name + '_' + set_class_labels[class_num],
                        set_class_labels[class_num],
                        model_name,
                        summary['accuracy'],
                        summary['weighted avg']['precision'],
                        summary['weighted avg']['recall'],
                        summary['weighted avg']['f1-score'],
                        summary['0']['precision'],
                        summary['0']['recall'],
                        summary['0']['f1-score'],
                        summary[set_class_labels[class_num]]['precision'],
                        summary[set_class_labels[class_num]]['recall'],
                        summary[set_class_labels[class_num]]['f1-score']
                        ])

summary_df = pd.DataFrame(summary_list, columns = summary_cols)

In [0]:
# Save the summary table without its positional index, so reading the file back
# does not introduce an extra 'Unnamed: 0' column.
summary_df.to_csv('summary_meta.csv', index=False)

In [0]:
# Reload the summary table from summary_meta.csv.
# NOTE(review): if the CSV was written with its index, read_csv returns that
# index as an extra unnamed column.
summary_df = pd.read_csv('summary_meta.csv')

In [0]:
# Heatmap of the numeric metrics only: the id / class_label / model_name columns
# hold strings, which sns.heatmap cannot draw. An index column picked up from
# the CSV round trip is dropped as well.
# NOTE(review): iloc[:-1, :] leaves out the last *row* of the table; kept as it
# was — confirm that is intended.
summary_metrics = (
    summary_df.iloc[:-1, :]
    .select_dtypes(include='number')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
sns.heatmap(summary_metrics.T, annot = True)