In [1]:
import os
import pickle

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.manifold import isomap, mds, t_sne
from sklearn.model_selection import train_test_split

In [2]:
# Work from the dataset root so the relative 'Malware-API/...' paths used by the
# cells below resolve.
# Raw string: '\D' and '\M' are not valid escape sequences, so a plain literal only
# works by accident and triggers an invalid-escape warning on current Python versions.
# NOTE(review): this is a machine-specific absolute path; consider a configurable
# data directory instead.
os.chdir(r'F:\Data\Malware')
print(os.getcwd())

F:\Data\Malware


In [3]:
# Read the raw corpus: one list entry per line of the file, trailing newline kept.
with open('Malware-API/all_analysis_data.txt', 'r') as corpus_file:
    api_calls = list(corpus_file)

In [None]:
# Printing the whole corpus floods the notebook output; show how many entries
# were loaded and the start of the first few instead.
print(len(api_calls))
for sample in api_calls[:3]:
    print(sample[:200])

In [4]:
# Read one label per line of labels.csv, dropping the newline characters.
labels = []
with open('Malware-API/labels.csv', 'r') as label_file:
    labels.extend(row.replace('\n', '') for row in label_file.readlines())

In [5]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
train_api_calls, test_api_calls, train_labels, test_labels = train_test_split(
    api_calls,
    labels,
    test_size=.3,
    random_state=44,
)

In [8]:
# Fit TF-IDF over the whole corpus using n-grams of length 1 through 3.
tfidf_vect_total = TfidfVectorizer(ngram_range=(1, 3))
# toarray() returns a plain ndarray; todense() returns the legacy np.matrix type.
# NOTE(review): this materialises the full dense document/term matrix, which can
# need a lot of memory for a (1, 3) n-gram vocabulary.
tfidf_apicalls = tfidf_vect_total.fit_transform(api_calls).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf_vect_total, 'get_feature_names_out'):
    tfidf_total_feature_names = list(tfidf_vect_total.get_feature_names_out())
else:
    tfidf_total_feature_names = tfidf_vect_total.get_feature_names()

# One  n-grams created as file  'Malware-API/tfidf-no-features-ngrams.csv'
# Bi   n-grams created as file  'Malware-API/tfidf-bi-features-ngrams.csv'
# Tri  n-grams created as file  'Malware-API/tfidf-tri-features-ngrams.csv'
# NOTE(review): only the tri-gram file is written by this cell as it stands; the other
# two were presumably produced by re-running it with a different ngram_range and
# file name — TODO confirm which ngram_range each file used.
tfidf_total_dataframe = pd.DataFrame(tfidf_apicalls, columns=tfidf_total_feature_names)
tfidf_total_dataframe.to_csv('Malware-API/tfidf-tri-features-ngrams.csv')

In [9]:
# Reload the saved TF-IDF tables.
# index_col=0: the CSVs are written with DataFrame.to_csv() and its default row index,
# so the first column is that index; without this it would come back as a spurious
# 'Unnamed: 0' feature column.
tfidf_total_no_dataframe = pd.read_csv('Malware-API/tfidf-no-features-ngrams.csv', index_col=0)
tfidf_total_bi_dataframe = pd.read_csv('Malware-API/tfidf-bi-features-ngrams.csv', index_col=0)
tfidf_total_tri_dataframe = pd.read_csv('Malware-API/tfidf-tri-features-ngrams.csv', index_col=0)

KeyboardInterrupt: 

In [None]:
# Fit TF-IDF on the training split only (vocabulary capped at 300 terms) and apply the
# fitted vocabulary to the test split, so nothing from the test set leaks into the fit.
tfidf_vect_split = TfidfVectorizer(max_features=300)
# toarray() returns plain ndarrays; todense() returns np.matrix, which current
# scikit-learn estimators reject as input.
tfidf_train_apicalls = tfidf_vect_split.fit_transform(train_api_calls).toarray()
tfidf_test_apicalls = tfidf_vect_split.transform(test_api_calls).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf_vect_split, 'get_feature_names_out'):
    tfidf_split_feature_names = list(tfidf_vect_split.get_feature_names_out())
else:
    tfidf_split_feature_names = tfidf_vect_split.get_feature_names()

print(tfidf_split_feature_names)
print(len(tfidf_split_feature_names))
print(tfidf_train_apicalls)

In [None]:
# Compute one 2-D t-SNE embedding of the training TF-IDF matrix per perplexity value.
# TSNE is the public estimator class from sklearn.manifold (imported in the imports cell).
perplexity_array = [5, 10, 15, 20, 25, 30, 35, 40, 45]
max_iter = 10000
random_state = 44
# embeddings[i] is the embedding computed with perplexity_array[i].
embeddings = []
for perp in perplexity_array:
    # Same seed for every run, so the runs differ only in perplexity.
    tsne = TSNE(perplexity=perp, n_iter=max_iter, random_state=random_state)
    # y is accepted for API consistency only; t-SNE is unsupervised.
    xy_coordinates = tsne.fit_transform(tfidf_train_apicalls, y=train_labels)
    embeddings.append(xy_coordinates)

In [None]:
def tsne_plot(ax, x, y, perplexity, c=None):
    """Scatter one t-SNE embedding on ``ax`` and title it with its perplexity.

    Parameters
    ----------
    ax : object with ``scatter`` and ``set_title`` methods (e.g. a matplotlib Axes)
        Target axes.
    x, y : sequences
        Point coordinates, passed straight to ``ax.scatter``.
    perplexity : int or str
        Perplexity the embedding was computed with; shown in the title.
    c : optional
        Colours forwarded to ``ax.scatter``; ``None`` keeps the default colour.
    """
    # Forward the caller's colours (previously a literal None was passed, so the
    # argument had no effect).
    ax.scatter(x, y, c=c)
    # format() accepts numbers as well as strings; '' + <int> raises TypeError.
    ax.set_title('{} perplexity'.format(perplexity), fontsize=8)

In [None]:
# used this url as a template
# https://stackoverflow.com/questions/33246065/convert-categorical-variable-to-color-with-matplotlib
def category_to_color(y_classification):
    """Map each label to a colour so that equal labels share the same colour.

    Parameters
    ----------
    y_classification : sequence of hashable labels
        One class label per sample.

    Returns
    -------
    list
        One colour per sample, in input order, as returned by the 'hsv' colormap.
        An empty input gives an empty list.

    NOTE(review): ``cm`` is not imported in the visible cells — presumably
    ``matplotlib.cm``; confirm it is in scope before this is called.
    """
    cmap = cm.get_cmap('hsv')

    # Distinct labels in first-seen order, so the colour assignment is deterministic.
    categories = []
    seen = set()
    for label in y_classification:
        if label not in seen:
            seen.add(label)
            categories.append(label)

    # Spread the categories over [0, 1): 'hsv' is cyclic, so 1.0 would repeat the
    # colour at 0.0.
    n = len(categories)
    color_by_category = {
        category: cmap(float(i) / n) for i, category in enumerate(categories)
    }
    return [color_by_category[label] for label in y_classification]

In [None]:
# Colours for the training samples, derived from their labels; passed to the
# scatter plots as the point colours.
classifier_color = category_to_color(train_labels)

In [None]:
# One subplot per embedding, three per row; a fixed 9x9 grid would leave most of
# its 81 axes empty for nine embeddings.
# NOTE(review): plt is not imported in the visible cells — presumably
# matplotlib.pyplot; confirm it is in scope before this cell runs.
n_cols = 3
n_rows = max(1, -(-len(embeddings) // n_cols))  # ceiling division
# squeeze=False always returns a 2-D array of axes, so .flat works for any grid size.
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, squeeze=False)
for embedding, ax, perp in zip(embeddings, axs.flat, perplexity_array):
    # Each embedding has one row per sample: column 0 is x, column 1 is y
    # (embedding[0] / embedding[1] would pick the first two samples instead).
    x = embedding[:, 0]
    y = embedding[:, 1]
    # Pass the perplexity as text so it can go straight into the subplot title.
    # c=None is tsne_plot's default, so no separate branch is needed for it.
    tsne_plot(ax, x, y, perplexity=str(perp), c=classifier_color)

In [None]:
def _features_with_labels(features, labels):
    """Return a DataFrame with one row per sample: its TF-IDF vector and its label.

    Passing ``[features, labels]`` as a list makes pandas treat the two objects as
    two rows, not two columns; a dict of columns gives the intended layout.

    Parameters
    ----------
    features : array-like with a ``tolist()`` method (ndarray or matrix)
        One feature vector per sample.
    labels : sequence
        One label per sample, same length as ``features``.
    """
    return pd.DataFrame({'api_calls': features.tolist(), 'labels': list(labels)})


tfidf_training_dataframe = _features_with_labels(tfidf_train_apicalls, train_labels)
tfidf_test_dataframe = _features_with_labels(tfidf_test_apicalls, test_labels)

In [None]:
# Persist both splits so later sessions can reload them without recomputing TF-IDF.
tfidf_training_dataframe.to_pickle('Malware-API/training-dataframe.pickle')
tfidf_test_dataframe.to_pickle('Malware-API/test-dataframe.pickle')

In [None]:
# Reload the pickled training/test frames written by the cell above.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# produced by this notebook or another trusted source.
tfidf_training_dataframe = pd.read_pickle('Malware-API/training-dataframe.pickle')
tfidf_test_dataframe = pd.read_pickle('Malware-API/test-dataframe.pickle')