In [1]:
import os
import sklearn.model_selection
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MinMaxScaler
from itertools import islice
from sagemaker.predictor import csv_serializer
%matplotlib inline
role = get_execution_role()
session = sagemaker.Session()

In [2]:
role = get_execution_role()
bucket='sagemaker-ap-south-1-812709844112'
data_key = 'brazilian-malware.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

df_Csv=pd.read_csv(data_location)

In [3]:
df_Csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50181 entries, 0 to 50180
Data columns (total 28 columns):
BaseOfCode                 50181 non-null int64
BaseOfData                 50181 non-null int64
Characteristics            50181 non-null int64
DllCharacteristics         50181 non-null int64
Entropy                    50181 non-null float64
FileAlignment              50181 non-null int64
FirstSeenDate              50181 non-null object
Identify                   35958 non-null object
ImageBase                  50181 non-null int64
ImportedDlls               50181 non-null object
ImportedSymbols            50181 non-null object
Label                      50181 non-null int64
Machine                    50181 non-null int64
Magic                      50181 non-null int64
NumberOfRvaAndSizes        50181 non-null int64
NumberOfSections           50181 non-null int64
NumberOfSymbols            50181 non-null int64
PE_TYPE                    50181 non-null int64
PointerToSymbolTable 

In [4]:
df_Csv = df_Csv.drop(columns=['SHA1', 'FirstSeenDate', 'Identify', 'PE_TYPE', 'SizeOfOptionalHeader', 'Magic'])

In [5]:
df_Csv['ImportedDlls'] = df_Csv['ImportedDlls'].apply(lambda x: ', '.join(sorted(set(x.split(' ')))))
df_Csv['ImportedDlls'] = df_Csv['ImportedDlls'].str.lower()
df_Csv['ImportedSymbols'] = df_Csv['ImportedSymbols'].str.lower()
df_Csv['ImportedSymbols'] = df_Csv['ImportedSymbols'].str.replace(' ', ', ')

for i in range(len(df_Csv)):
    if '.' not in df_Csv['ImportedDlls'][i]:
        df_Csv.drop(i, axis=0, inplace=True)

In [6]:
cvec = CountVectorizer(stop_words='english', min_df=0.001, max_df=0.8, ngram_range=(1,1), token_pattern = '[a-zA-Z0-9$&+:;=?@#|<>.^*()%!-]+')
cvec.fit(df_Csv.ImportedDlls)
list(islice(cvec.vocabulary_.items(), 20))
len(cvec.vocabulary_)

235

In [7]:
cvec_counts = cvec.transform(df_Csv.ImportedDlls)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (50143, 235)
nonzero count: 407391
sparsity: 3.46%


In [8]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=True).head(10)

Unnamed: 0,term,occurrences
169,msvcp110,51
103,api-ms-win-security-credentials-l1-1-0.dll,52
99,api-ms-win-power-base-l1-1-0.dll,52
74,api-ms-win-core-sysinfo-l1-2-0.dll,52
114,api-ms-win-shcore-obsolete-l1-1-0.dll,52
228,wkscli.dll,52
153,libeay32.dll,53
186,odbc32.dll,53
11,api-ms-win-core-console-l2-1-0.dll,53
13,api-ms-win-core-crt-l2-1-0.dll,53


In [9]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(cvec_counts)
transformed_weights

weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df1 = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df1.sort_values(by='weight', ascending=False).head(50)

Unnamed: 0,term,weight
151,kernel32.dll,0.201134
211,user32.dll,0.168716
2,advapi32.dll,0.165414
189,oleaut32.dll,0.153362
124,comctl32.dll,0.150143
142,gdi32.dll,0.133192
187,ole32.dll,0.121815
216,version.dll,0.11705
162,mscoree.dll,0.114623
204,shell32.dll,0.111482


In [10]:
cvec = CountVectorizer(stop_words='english', min_df=0.005, max_df=0.9, ngram_range=(1,1))
cvec.fit(df_Csv.ImportedSymbols)
list(islice(cvec.vocabulary_.items(), 20))
len(cvec.vocabulary_)

1935

In [11]:
cvec_counts = cvec.transform(df_Csv.ImportedSymbols)
print('sparse matrix shape:', cvec_counts.shape)
print('nonzero count:', cvec_counts.nnz)
print('sparsity: %.2f%%' % (100.0 * cvec_counts.nnz / (cvec_counts.shape[0] * cvec_counts.shape[1])))

sparse matrix shape: (50143, 1935)
nonzero count: 8174793
sparsity: 8.43%


In [12]:
occ = np.asarray(cvec_counts.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': cvec.get_feature_names(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=True).head(60)

Unnamed: 0,term,occurrences
746,getcomboboxinfo,251
11,__crtunhandledexception,251
10,__crtterminateprocess,251
1662,setupdienumdeviceinfo,252
1533,rtlntstatustodoserrornoteb,252
1372,pathremoveextensionw,253
1731,startservicectrldispatchera,255
87,__vbaraiseevent,255
702,gdipcreatestringformat,255
234,_wtoi64,255


In [13]:
otransformer = TfidfTransformer()
transformed_weights = otransformer.fit_transform(cvec_counts)
transformed_weights

weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})
weights_df.sort_values(by='weight', ascending=False).head(50)

Unnamed: 0,term,weight
166,_cordllmain,0.082428
893,getprocaddress,0.048622
1845,virtualalloc,0.043744
1208,loadlibrarya,0.041942
1847,virtualfree,0.040924
866,getmodulehandlea,0.036933
1718,sleep,0.034806
1426,regclosekey,0.033832
1909,writefile,0.032667
621,exitprocess,0.031943


In [14]:
l1 = list(weights_df1.term)
l2 = list(weights_df.term)

count1 = 0
df2 = pd.DataFrame(0, index=[i for i in range (1, 50144)], columns=l1)
df3 = pd.DataFrame(0, index=[i for i in range (1, 50144)], columns=l2)

for i in df_Csv['ImportedDlls']:
    count1 +=1
    temp1 = i.split()

    for j in temp1:
        for col in df2.columns:
            if j[:-1] == col:
                df2[col][count1] = 1


In [15]:
df2 = df2.loc[:, (df2 != 0).any(axis=0)]
df2.describe()

Unnamed: 0,activeds.dll,advapi32.dll,advpack.dll,api-ms-win-appmodel-runtime-l1-1-1.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-atoms-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-com-midlproxystub-l1-1-0.dll,api-ms-win-core-console-l1-1-0.dll,...,wininet.dll,winmm.dll,winspool.drv,winsta.dll,wintrust.dll,wkscli.dll,wldap32.dll,wsock32.dll,wtsapi32.dll,xmllite.dll
count,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,...,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0,50143.0
mean,0.002573,0.666095,0.001436,0.001496,0.009892,0.001815,0.002393,0.023273,0.004507,0.003769,...,0.114612,0.096325,0.029815,0.001117,0.001336,0.000299,0.001077,0.001835,0.000399,6e-05
std,0.050656,0.471611,0.037866,0.038646,0.098965,0.042562,0.048862,0.150772,0.066984,0.061279,...,0.318557,0.295039,0.170078,0.0334,0.03653,0.017293,0.032799,0.042795,0.019968,0.007735
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The following code might take a little longer to run, as it has to work on 50000+ rows and 2000+ columns

In [None]:
count1 = -1
for i in df_Csv['ImportedSymbols']:
    count1 +=1
    temp1 = i.split()
    for j in temp1:
        if j[:-1] in list(df3.columns):
            df3[j[:-1]][count1 = 1]

In [24]:
frame_List = [df_Csv, df2, df3]
final_Csv2 = pd.concat(frame_List, axis=1)

MemoryError: 

In [21]:
final_Csv2 = final_Csv2.dropna()
final_Csv2 = final_Csv2.drop(['ImportedDlls', 'ImportedSymbols'], axis=1)

MemoryError: 

In [28]:
final_Csv2

Unnamed: 0,BaseOfCode,BaseOfData,Characteristics,DllCharacteristics,Entropy,FileAlignment,ImageBase,ImportedDlls,ImportedSymbols,Label,...,xz,yapaxi,yaxpax,yaxpbd,yaxxz,zombie_gettypeinfo,zombie_gettypeinfocount,zwclose,zwopenkey,zwqueryvaluekey
0,4096.0,69632.0,783.0,0.0,5.981249,512.0,4194304.0,"comctl32.dll, comdlg32.dll, gdi32.dll, kernel3...","printdlga, getopenfilenamea, getsavefilenamea,...",0.0,...,,,,,,,,,,
1,4096.0,1851392.0,783.0,0.0,6.081747,512.0,4194304.0,"comctl32.dll, comdlg32.dll, gdi32.dll, kernel3...","imagelist_add, imagelist_addmasked, imagelist_...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4096.0,40960.0,783.0,0.0,5.586422,512.0,4194304.0,"comdlg32.dll, kernel32.dll, msvcrt.dll, user32...","getopenfilenamea, closehandle, createfilea, cr...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1359872.0,2138112.0,783.0,0.0,7.969464,512.0,4194304.0,"advapi32.dll, comdlg32.dll, gdi32.dll, kernel3...","loadlibrarya, getprocaddress, virtualprotect, ...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4096.0,40960.0,783.0,32768.0,7.999900,512.0,4194304.0,"advapi32.dll, comctl32.dll, gdi32.dll, kernel3...","regclosekey, regcreatekeyexa, regdeletekeya, r...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,192512.0,245760.0,783.0,0.0,7.328245,512.0,4194304.0,"advapi32.dll, comctl32.dll, comdlg32.dll, gdi3...","loadlibrarya, getprocaddress, virtualprotect, ...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,8192.0,61440.0,33166.0,0.0,6.257786,512.0,4194304.0,"kernel32.dll, user32.dll","getstringtypew, lcmapstringw, virtualalloc, vi...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,4096.0,40960.0,775.0,0.0,5.308237,512.0,4194304.0,"kernel32.dll, lua53.dll, msvcrt.dll","lual_callmeta, lual_checkstack, lual_checkvers...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4096.0,40960.0,775.0,0.0,5.256822,512.0,4194304.0,"kernel32.dll, lua53.dll, msvcrt.dll","lual_callmeta, lual_checkstack, lual_checkvers...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,4096.0,131072.0,775.0,0.0,5.909817,512.0,4194304.0,"kernel32.dll, msvcrt.dll","deletecriticalsection, entercriticalsection, e...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
