# Random Forest

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [3]:
data = pd.read_csv('malware_data.csv')

In [4]:
counts_family = data.groupby('Malware Family')['Malware Family'].transform(len)
mask = (counts_family > 50)
data = data[mask]

In [5]:
for index, row in data.iterrows():
    if len(row['OpCode Sequence'].split(' ')) < 200:
        data = data.drop(index)

In [6]:
len(data)

8028

In [7]:
import pickle
with open('word2vec_embeddings.pkl', 'rb') as fp:
    embed = pickle.load(fp)

word_to_idx = dict()
index = 0
for word in embed:
    word_to_idx[word] = index
    index = index + 1

In [8]:
X = []
for index, row in data.iterrows():
    seq = []
    for opcode in row['OpCode Sequence'].split(' '):
        try:
            seq.append(word_to_idx[opcode])
        except:
            print(opcode)
    seq = np.array(seq)
    X.append(seq)

In [9]:
X = np.array(X)

  X = np.array(X)


In [10]:
factor = pd.factorize(data['Malware Family'])
y = factor[0]
definitions = factor[1]

# Random Forest 100

In [16]:
X100 = []
for x in X:
    X100.append(x[0:100])

In [18]:
X100 = np.array(X100)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X100, y, test_size = 0.2, stratify = y, random_state = 21)

In [24]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [35]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [36]:
clf.score(X_test, y_test)

0.9695273631840796

# Random Forests 50

In [11]:
X50 = []
for x in X:
    X50.append(x[0:50])

In [12]:
X50 = np.array(X50)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X50, y, test_size = 0.2, stratify = y, random_state = 21)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [16]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [17]:
clf.score(X_test, y_test)

0.9701678060907396

# Random Forests 150

In [11]:
X150 = []
for x in X:
    X150.append(x[0:150])

In [12]:
X150 = np.array(X150)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X150, y, test_size = 0.2, stratify = y, random_state = 21)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [16]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [17]:
clf.score(X_test, y_test)

0.9695273631840796

# Random Forest 200

In [29]:
X200 = []
for x in X:
    X200.append(x[0:200])

In [30]:
X200 = np.array(X200)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X200, y, test_size = 0.2, stratify = y, random_state = 21)

In [32]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [33]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [34]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [35]:
clf.score(X_test, y_test)

0.9613947696139477

# SVM

In [14]:
from sklearn import svm

In [15]:
parameters = {'C': [0.5, 1, 5, 10], 'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 'degree': [2, 3, 4, 5]}

## SVM 100

In [11]:
X100 = []
for x in X:
    X100.append(x[0:100])

In [12]:
X100 = np.array(X100)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X100, y, test_size = 0.2, stratify = y, random_state = 21)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(), parameters)

In [19]:
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10], 'degree': [2, 3, 4, 5],
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [20]:
clf.score(X_test, y_test)

0.9633084577114428

In [21]:
clf.best_params_

{'C': 10, 'degree': 2, 'kernel': 'rbf'}

## SVM 50

In [11]:
X50 = []
for x in X:
    X50.append(x[0:50])

In [12]:
X50 = np.array(X50)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X50, y, test_size = 0.2, stratify = y, random_state = 21)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(), parameters)

In [19]:
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10], 'degree': [2, 3, 4, 5],
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [20]:
clf.score(X_test, y_test)

0.9651957737725295

## SVM 150

In [11]:
X150 = []
for x in X:
    X150.append(x[0:150])

In [12]:
X150 = np.array(X150)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X150, y, test_size = 0.2, stratify = y, random_state = 21)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(), parameters)

In [19]:
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10], 'degree': [2, 3, 4, 5],
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [20]:
clf.score(X_test, y_test)

0.9589552238805971

## SVM 200

In [11]:
X200 = []
for x in X:
    X200.append(x[0:200])

In [12]:
X200 = np.array(X200)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X200, y, test_size = 0.2, stratify = y, random_state = 21)

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(), parameters)

In [18]:
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.5, 1, 5, 10], 'degree': [2, 3, 4, 5],
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid')})

In [19]:
clf.score(X_test, y_test)

0.9551681195516812