In [66]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import accuracy_score
from sklearn.metrics import silhouette_score

In [67]:
df = pd.read_csv("./file_pe_headers.csv", sep=",")

In [68]:
df.head()

Unnamed: 0,Name,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,...,SectionMaxChar,SectionMainChar,DirectoryEntryImport,DirectoryEntryImportSize,DirectoryEntryExport,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity
0,VirusShare_a878ba26000edaac5c98eff4432723b3,23117,144,3,0,4,0,65535,0,184,...,3758096608,0,7,152,0,0,54440,77824,73728,0
1,VirusShare_ef9130570fddc174b312b2047f5f4cf0,23117,144,3,0,4,0,65535,0,184,...,3791650880,0,16,311,0,0,262276,294912,0,346112
2,VirusShare_ef84cdeba22be72a69b198213dada81a,23117,144,3,0,4,0,65535,0,184,...,3221225536,0,6,176,0,0,36864,40960,0,0
3,VirusShare_6bf3608e60ebc16cbcff6ed5467d469e,23117,144,3,0,4,0,65535,0,184,...,3224371328,0,8,155,0,0,356352,1003520,0,14109472
4,VirusShare_2cc94d952b2efb13c7d6bbe0dd59d3fb,23117,144,3,0,4,0,65535,0,184,...,3227516992,0,2,43,0,0,61440,73728,0,90624


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19611 entries, 0 to 19610
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          19611 non-null  object 
 1   e_magic                       19611 non-null  int64  
 2   e_cblp                        19611 non-null  int64  
 3   e_cp                          19611 non-null  int64  
 4   e_crlc                        19611 non-null  int64  
 5   e_cparhdr                     19611 non-null  int64  
 6   e_minalloc                    19611 non-null  int64  
 7   e_maxalloc                    19611 non-null  int64  
 8   e_ss                          19611 non-null  int64  
 9   e_sp                          19611 non-null  int64  
 10  e_csum                        19611 non-null  int64  
 11  e_ip                          19611 non-null  int64  
 12  e_cs                          19611 non-null  int64  
 13  e

In [70]:
df["Malware"] = df["Malware"].astype("category")

In [71]:
y = df["Malware"]
X = df.drop(["Name", "Malware"], axis=1)

In [72]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Malware, dtype: category
Categories (2, int64): [0, 1]

In [73]:
X.head()

Unnamed: 0,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,e_sp,e_csum,...,SectionMaxChar,SectionMainChar,DirectoryEntryImport,DirectoryEntryImportSize,DirectoryEntryExport,ImageDirectoryEntryExport,ImageDirectoryEntryImport,ImageDirectoryEntryResource,ImageDirectoryEntryException,ImageDirectoryEntrySecurity
0,23117,144,3,0,4,0,65535,0,184,0,...,3758096608,0,7,152,0,0,54440,77824,73728,0
1,23117,144,3,0,4,0,65535,0,184,0,...,3791650880,0,16,311,0,0,262276,294912,0,346112
2,23117,144,3,0,4,0,65535,0,184,0,...,3221225536,0,6,176,0,0,36864,40960,0,0
3,23117,144,3,0,4,0,65535,0,184,0,...,3224371328,0,8,155,0,0,356352,1003520,0,14109472
4,23117,144,3,0,4,0,65535,0,184,0,...,3227516992,0,2,43,0,0,61440,73728,0,90624


In [74]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### K Means Centroid base

In [75]:
km_estimator = KMeans(n_clusters=len(set(y)))
km_estimator.fit(X)





In [76]:
labels = km_estimator.labels_

# check how many of the samples were correctly labeled
correct_labels = sum(y == labels)

print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))

Result: 4933 out of 19611 samples were correctly labeled.


In [77]:
print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))

Accuracy score: 0.25


### DBSCAN density based

In [78]:
db_estimator = DBSCAN(eps=10, min_samples=50).fit(X)

In [79]:
db_labels = db_estimator.labels_

db_correct_labels = sum(y == db_labels)

print("Result: %d out of %d samples were correctly labeled." % (db_correct_labels, y.size))

Result: 108 out of 19611 samples were correctly labeled.


In [80]:
print('Accuracy score: {0:0.2f}'. format(db_correct_labels/float(y.size)))

Accuracy score: 0.01


### AGGLOMERATIVE hierarchical

In [81]:
ac_estimator = AgglomerativeClustering(n_clusters=len(set(y)))
ac_estimator.fit(X)

In [82]:
ac_labels = ac_estimator.labels_

ac_correct_labels = sum(y == ac_labels)

print("Result: %d out of %d samples were correctly labeled." % (ac_correct_labels, y.size))

Result: 14678 out of 19611 samples were correctly labeled.


In [83]:
print('Accuracy score: {0:0.2f}'. format(ac_correct_labels/float(y.size)))

Accuracy score: 0.75


### distribution base  

In [84]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=len(set(y)))
gmm.fit(X)

In [None]:
gmm_labels = gmm_estimator.labels_

gmm_correct_labels = sum(y == ac_labels)

print("Result: %d out of %d samples were correctly labeled." % (ac_correct_labels, y.size))