In [1]:
import os
import sklearn.model_selection
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MinMaxScaler
from itertools import islice
from sagemaker.predictor import csv_serializer
%matplotlib inline
# Execution role returned by SageMaker; it is handed to the estimator further down.
role = get_execution_role()
# SageMaker session; used later for S3 uploads, region lookup and the default bucket.
session = sagemaker.Session()

In [2]:
# Load the prepared feature table and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the data -- confirm).
df = pd.read_csv('final.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,BaseOfCode,BaseOfData,Characteristics,DllCharacteristics,Entropy,FileAlignment,ImageBase,Label,Machine,NumberOfRvaAndSizes,...,xz,yapaxi,yaxpax,yaxpbd,yaxxz,zombie_gettypeinfo,zombie_gettypeinfocount,zwclose,zwopenkey,zwqueryvaluekey
0,4096.0,1851392.0,783.0,0.0,6.081747,512.0,4194304.0,0.0,332.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4096.0,40960.0,783.0,0.0,5.586422,512.0,4194304.0,0.0,332.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1359872.0,2138112.0,783.0,0.0,7.969464,512.0,4194304.0,0.0,332.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4096.0,40960.0,783.0,32768.0,7.9999,512.0,4194304.0,0.0,332.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,192512.0,245760.0,783.0,0.0,7.328245,512.0,4194304.0,0.0,332.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Target vector: the 'Label' column.
y = df['Label']
# Feature matrix: every other column, kept in its original order.
b = [column for column in df.columns if column != 'Label']
X = df.loc[:, b]

In [5]:
from sklearn.model_selection import train_test_split

# First hold out 30% of all samples as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# ... then reserve 20% of the remaining samples for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Report the split sizes in the order: test, train, validation.
print("Testing set has {} samples.".format(len(X_test)))
print("Training set has {} samples.".format(len(X_train)))
print("Validation set has {} samples.".format(len(X_val)))

Testing set has 15032 samples.
Training set has 28058 samples.
Validation set has 7015 samples.


In [6]:
# Persist the three splits as CSV files without header row or index column.
csv_options = dict(header=False, index=False)

# The test file holds the features only.
X_test.to_csv('test.csv', **csv_options)

# The validation and training files carry the label as their first column.
for file_name, labels, features in (
    ('validation.csv', y_val, X_val),
    ('train.csv', y_train, X_train),
):
    pd.concat([labels, features], axis=1).to_csv(file_name, **csv_options)

In [8]:
# S3 key prefix shared by every artefact of this experiment.
prefix = 'malware-detection'

# Upload the three CSV files through the SageMaker session (test, validation,
# train -- in that order) and keep the location returned for each one.
test_location, val_location, train_location = [
    session.upload_data(csv_name, key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
]

Since we are not tied to a specific XGBoost classifier class, let us build a generic SageMaker estimator from the built-in XGBoost container.

In [27]:
# Look up the XGBoost training image for the region this session runs in.
container = get_image_uri(session.boto_region_name, 'xgboost')

# Training output goes under '<default bucket>/<prefix>/output'.
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

# Generic estimator around that image, trained on one ml.m4.xlarge instance.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=output_location,
    sagemaker_session=session,
)

	get_image_uri(region, 'xgboost', '0.90-1').


In [28]:
# Static hyperparameters for the XGBoost training jobs.
#
# Fix: the key used to be misspelled `min_chile_weight`, which is not an
# XGBoost parameter, so the intended value was never applied under the real
# name. It is now `min_child_weight`, the same spelling the tuner's
# hyperparameter ranges use.
#
# 'reg:linear' is a regression objective, so the model emits continuous
# scores rather than class labels; the predictions are rounded to labels
# after the batch transform.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='reg:linear',
    early_stopping_rounds=10,
    num_round=200,
)

In [29]:
# Describe the uploaded training and validation files as CSV input channels.
s3_input_train, s3_input_validation = [
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
]

In [30]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# A single direct `xgb.fit(...)` run on the train/validation channels is
# intentionally not executed here; the tuner defined below is used instead.

# Ranges the tuner is allowed to explore for each hyperparameter.
xgb_search_space = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

# Tuner that minimises validation RMSE over at most 30 jobs, 3 at a time.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',
    max_jobs=30,
    max_parallel_jobs=3,
    hyperparameter_ranges=xgb_search_space,
)

In [31]:
# Start the hyperparameter tuning job on the train and validation channels.
xgb_hyperparameter_tuner.fit({'train':s3_input_train,'validation':s3_input_validation})

In [32]:
# Block until the tuning job has finished (progress is printed as dots).
xgb_hyperparameter_tuner.wait()

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [33]:
# Show the name of the best training job found by the tuner.
xgb_hyperparameter_tuner.best_training_job()

'xgboost-200315-1213-019-3ee3ccc4'

In [34]:
# Build an estimator object from the best training job found by the tuner.
best_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_job_name)

2020-03-15 13:13:43 Starting - Preparing the instances for training
2020-03-15 13:13:43 Downloading - Downloading input data
2020-03-15 13:13:43 Training - Training image download completed. Training in progress.
2020-03-15 13:13:43 Uploading - Uploading generated training model
2020-03-15 13:13:43 Completed - Training job completed[34mArguments: train[0m
[34m[2020-03-15:13:05:39:INFO] Running standalone xgboost training.[0m
[34m[2020-03-15:13:05:39:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2020-03-15:13:05:39:INFO] File size need to be processed in the node: 294.29mb. Available memory size in the node: 8517.64mb[0m
[34m[2020-03-15:13:05:39:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:05:39] S3DistributionType set as FullyReplicated[0m
[34m[13:05:40] 28058x2181 matrix with 61194498 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-03-15:13:05:40:INFO] Determined delimiter of CSV input is ','[0m


In [39]:
# Create a batch transformer from the attached model, on one ml.m4.xlarge instance.
xgb_transform = xgb_attached.transformer(instance_count = 1,instance_type = 'ml.m4.xlarge')
# Run it over the uploaded test file, sent as text/csv and split by line.
xgb_transform.transform(test_location,content_type = 'text/csv',split_type = 'Line')



In [40]:
# Block until the batch transform job has finished.
xgb_transform.wait()

....................[34mArguments: serve[0m
[34m[2020-03-15 14:00:53 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2020-03-15 14:00:53 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2020-03-15 14:00:53 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2020-03-15 14:00:53 +0000] [38] [INFO] Booting worker with pid: 38[0m
[34m[2020-03-15 14:00:53 +0000] [39] [INFO] Booting worker with pid: 39[0m
[34m[2020-03-15 14:00:53 +0000] [40] [INFO] Booting worker with pid: 40[0m
[34m[2020-03-15:14:00:53:INFO] Model loaded successfully for worker : 38[0m
[34m[2020-03-15 14:00:53 +0000] [41] [INFO] Booting worker with pid: 41[0m
[34m[2020-03-15:14:00:53:INFO] Model loaded successfully for worker : 39[0m
[34m[2020-03-15:14:00:53:INFO] Model loaded successfully for worker : 40[0m
[34m[2020-03-15:14:00:53:INFO] Model loaded successfully for worker : 41[0m

[32m2020-03-15T14:01:05.606:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy

In [47]:
# Copy everything under the transform job's S3 output path into the local ../output directory.
!aws s3 cp --recursive $xgb_transform.output_path $'../output'

Completed 227.1 KiB/227.1 KiB (2.6 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-ap-south-1-812709844112/xgboost-200315-1213-019-3ee3ccc4-2020-03-15-13-57-45-224/test.csv.out to ../output/test.csv.out


In [53]:
# Load the downloaded batch-transform output; the file is read without a header row.
y_pred = pd.read_csv('../output/test.csv.out',header = None)

In [61]:
# Turn the continuous regression scores into class labels: round to the
# nearest integer, then take the absolute value so that -0.0 becomes 0.0.
#
# This replaces an element-by-element loop that wrote through chained
# indexing (`y_pred[0][i] = ...`). That form is slow, triggers pandas'
# SettingWithCopy warning, and silently leaves `y_pred` unchanged when
# Copy-on-Write is enabled. A single column assignment gives the same
# values and always updates the frame.
y_pred[0] = y_pred[0].round().abs()

In [64]:
from sklearn.metrics import accuracy_score

# Compare the rounded predictions against the true test labels.
model_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the model: ", model_accuracy)

Accuracy of the model:  0.9860298030867483
