In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('malware_data.csv')

In [3]:
counts_family = data.groupby('Malware Family')['Malware Family'].transform(len)
mask = (counts_family > 50)
data = data[mask]

In [4]:
for index, row in data.iterrows():
    if len(row['OpCode Sequence'].split(' ')) < 100:
        data = data.drop(index)

In [5]:
len(data)

8040

In [7]:
import pickle
with open('cell_state_dict.pkl', 'rb') as fp:
    cell_states = pickle.load(fp)

In [8]:
cell_states_list = []
for key in cell_states:
    cell_states_list.append(cell_states[key])

In [11]:
X = np.array(cell_states_list)

In [12]:
X.shape

(8040, 100)

In [14]:
factor = pd.factorize(data['Malware Family'])
y = factor[0]
definitions = factor[1]

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 21)

In [23]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)

In [27]:
hist = classifier.fit(X_train, y_train)

In [42]:
classifier.score(X_test, y_test)

0.9956467661691543

In [48]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [1, 10, 100], 'criterion': ('gini', 'entropy', 'log_loss'), 'max_features': ('sqrt', 'log2', None)}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters)

In [49]:
clf.fit(X_train, y_train)

45 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/opt/anaconda3/lib/p

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy', 'log_loss'),
                         'max_features': ('sqrt', 'log2', None),
                         'n_estimators': [1, 10, 100]})

In [50]:
clf.score(X_test, y_test)

0.9975124378109452

In [52]:
clf.best_params_

{'criterion': 'gini', 'max_features': None, 'n_estimators': 100}

## Test with Cross Validation

In [53]:
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(criterion = 'gini', max_features = None, n_estimators = 100)

scores = cross_val_score(clf, X, y, cv=5)

In [55]:
scores.mean()

0.9975124378109452