## A Machine Learning approach for Malware Detection

Importing all the required libraries

In [83]:
# Import necessary libraries for data processing, machine learning, and feature selection, 
# including scikit-learn modules for classification models and tools.

import os
import pickle

import joblib
import numpy as np
import pandas as pd
import pefile
from sklearn import linear_model, preprocessing, svm, tree
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier


Loading the initial dataset delimited by | 

In [84]:
# Load the pipe-delimited malware dataset into a DataFrame. low_memory=False asks
# pandas to parse the file in one pass rather than in chunks.

import pandas as pd 
dataset = pd.read_csv(
    'data.csv',
    sep='|',
    low_memory=False,
)

In [85]:
# Peek at the first five rows to sanity-check the parsed columns.

dataset.head(n=5)

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.0,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.0,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.40072,1457.0,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.3061,3.421598,5.190603,1074.5,849,1300,72,18,1


In [86]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# of 'dataset'.

dataset.describe()

Unnamed: 0,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
count,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,...,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0,138047.0
mean,4259.069274,225.845632,4444.145994,8.619774,3.819286,242595.6,450486.7,100952.5,171956.1,57798.45,...,22.0507,4.000127,2.434541,5.52161,55450.93,18180.82,246590.3,465675.0,12.363115,0.29934
std,10880.347245,5.121399,8186.782524,4.088757,11.862675,5754485.0,21015990.0,16352880.0,3430553.0,5527658.0,...,136.494244,1.112981,0.815577,1.597403,7799163.0,6502369.0,21248600.0,26089870.0,6.798878,0.457971
min,332.0,224.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,332.0,224.0,258.0,8.0,0.0,30208.0,24576.0,0.0,12721.0,4096.0,...,5.0,3.458505,2.178748,4.828706,956.0,48.0,2216.0,0.0,13.0,0.0
50%,332.0,224.0,258.0,9.0,0.0,113664.0,263168.0,0.0,52883.0,4096.0,...,6.0,3.729824,2.458492,5.317552,2708.154,48.0,9640.0,72.0,15.0,0.0
75%,332.0,224.0,8226.0,10.0,0.0,120320.0,385024.0,0.0,61578.0,4096.0,...,13.0,4.233051,2.696833,6.502239,6558.429,132.0,23780.0,72.0,16.0,1.0
max,34404.0,352.0,49551.0,255.0,255.0,1818587000.0,4294966000.0,4294941000.0,1074484000.0,2028711000.0,...,7694.0,7.999723,7.999723,8.0,2415919000.0,2415919000.0,4294903000.0,4294967000.0,26.0,1.0


Number of malicious files vs Legitimate files in the training set

In [87]:
# Class balance: number of rows for each value of the 'legitimate' label column.

dataset.groupby('legitimate').size()

legitimate
0    96724
1    41323
dtype: int64

Dropping columns like Name of the file, MD5 (message digest) and label

In [88]:
# Feature matrix X: every column except the file identifiers ('Name', 'md5') and the
# label. Target vector y: the 'legitimate' label. Both are plain NumPy arrays.

X = dataset.drop(columns=['Name', 'md5', 'legitimate']).values
y = dataset['legitimate'].values

##### ExtraTreesClassifier
ExtraTreesClassifier fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

In [89]:
# Fit an Extra Trees ensemble on the full feature matrix; only its feature importances
# are used in this step. The fixed random_state makes the forest, and therefore the set
# of selected features, reproducible across runs.
extratrees = ExtraTreesClassifier(random_state=42).fit(X, y)

# prefit=True tells SelectFromModel to reuse the already-fitted estimator instead of
# refitting it. With no explicit threshold, scikit-learn's default is used.
model = SelectFromModel(extratrees, prefit=True)

# Keep only the selected columns of 'X'.
X_new = model.transform(X)

# Number of selected features (columns of 'X_new').
nbfeatures = X_new.shape[1]


ExtraTreesClassifier helps in selecting the required features useful for classifying a file as either Malicious or Legitimate

14 features were selected by ExtraTreesClassifier in this run; the exact number can change when the forest is refit with a different random seed.

In [90]:
# Display 'nbfeatures', the number of columns kept in 'X_new' by the Extra Trees based
# feature selection.

nbfeatures

14

######  Hold-out validation
The dataset is divided into random train and test subsets (a single hold-out split, not k-fold cross-validation).
`test_size = 0.2` is the proportion of the dataset to include in the test split.

In [91]:
# Hold out 20% of the rows of 'X_new' / 'y' as a test set. The fixed random_state makes
# the split, and every metric computed from it, reproducible across runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=42,
)

In [92]:
# Start with an empty 'features' list, then sort the feature indices by Extra Trees
# importance (most important first) and keep the top 'nbfeatures' of them.

import numpy as np  
features = []
index = np.flip(np.argsort(extratrees.feature_importances_))[:nbfeatures]


The features identified by ExtraTreesClassifier

In [93]:
# Print the selected features ranked by importance. A column position in 'X' maps to
# 'dataset.columns' with an offset of 2, which assumes 'Name' and 'md5' are the first
# two columns of 'dataset' (as shown by dataset.head()).
importances = extratrees.feature_importances_
for rank, idx in enumerate(index[:nbfeatures], start=1):
    print(f"{rank}. feature {dataset.columns[2 + idx]} ({importances[idx]})")

# Store the feature names in COLUMN order, not importance order: SelectFromModel keeps
# the selected columns of 'X' in their original order, so 'X_new' (and every classifier
# trained on it) expects the values in that order at prediction time. Rebuilding the
# list here, instead of appending to it, also keeps the cell idempotent on re-run.
features = [dataset.columns[2 + idx] for idx in sorted(index[:nbfeatures])]

1. feature Characteristics (0.15764588953434422)
2. feature DllCharacteristics (0.13290546776005654)
3. feature Machine (0.0976805205224497)
4. feature VersionInformationSize (0.05539161241235599)
5. feature Subsystem (0.05509997319785346)
6. feature SectionsMaxEntropy (0.05259281754615559)
7. feature ImageBase (0.049965571754027466)
8. feature SizeOfOptionalHeader (0.04129106517975645)
9. feature MajorSubsystemVersion (0.04110416253448962)
10. feature ResourcesMinEntropy (0.03888417531912302)
11. feature ResourcesMaxEntropy (0.038484705570560274)
12. feature SizeOfStackReserve (0.031745039629579225)
13. feature SectionsMinEntropy (0.024098240962738934)
14. feature MajorOperatingSystemVersion (0.02005149687445117)


Building the machine learning models below

In [94]:
# Define a dictionary of classification models for malware detection, 
# including Decision Tree, Random Forest, Gaussian Naive Bayes, ExtraTree, and AdaBoost.

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate classifiers, keyed by a short name used when reporting results.
model = dict(
    DT=DecisionTreeClassifier(max_depth=10),         # Decision Tree
    RF=RandomForestClassifier(n_estimators=50),      # Random Forest
    GNB=GaussianNB(),                                # Gaussian Naive Bayes
    ExtraTree=ExtraTreesClassifier(),                # ExtraTree Classifier
    AdaBoost=AdaBoostClassifier(n_estimators=50),    # AdaBoost
)


Each model is trained on X_train and evaluated on X_test.
The model with the best accuracy is selected as the winner.

In [98]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


# Replace the ExtraTree and GNB entries with fresh, unfitted estimators before training.
model["ExtraTree"] = ExtraTreesClassifier()
model["GNB"] = GaussianNB()

# Metrics of every evaluated classifier, keyed by its name in 'model'.
results = {}

for algo, clf in model.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # precision/recall/F1 use scikit-learn's default positive label (1, i.e. the
    # 'legitimate' class of this dataset).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("%s Metrics:" % algo)
    print("Accuracy: %.2f" % accuracy)
    print("Precision: %.2f" % precision)
    print("Recall: %.2f" % recall)
    print("F1 Score: %.2f" % f1)

    classification_rep = classification_report(y_test, y_pred)
    print("Classification Report:\n", classification_rep)

    results[algo] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Classification Report': classification_rep
    }


DT Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.98
F1 Score: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     19304
           1       0.99      0.98      0.98      8306

    accuracy                           0.99     27610
   macro avg       0.99      0.99      0.99     27610
weighted avg       0.99      0.99      0.99     27610

RF Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19304
           1       0.99      0.99      0.99      8306

    accuracy                           0.99     27610
   macro avg       0.99      0.99      0.99     27610
weighted avg       0.99      0.99      0.99     27610

GNB Metrics:
Accuracy: 0.70
Precision: 1.00
Recall: 0.00
F1 Score: 0.00
Classification Report:
               precision    recall  f1-score   support

     

In [99]:
# Candidate classifiers for malware detection, keyed by a human-readable name. Each
# algorithm appears exactly once so that no model is trained and reported twice.
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "Random Forest": RandomForestClassifier(n_estimators=50),
    "Gaussian Naive Bayes": GaussianNB(),
    "ExtraTree": ExtraTreesClassifier(),
    "AdaBoost": AdaBoostClassifier(n_estimators=50)
}

# Metrics of every evaluated classifier, keyed by its name in 'models'.
results = {}

# Fit each classifier on the training split and report its metrics on the test split.
for model_name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # precision/recall/F1 use scikit-learn's default positive label (1, i.e. the
    # 'legitimate' class of this dataset).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"{model_name} Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    classification_rep = classification_report(y_test, y_pred)
    print("Classification Report:\n", classification_rep)

    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Classification Report': classification_rep
    }



Decision Tree Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.98
F1 Score: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     19304
           1       0.99      0.98      0.98      8306

    accuracy                           0.99     27610
   macro avg       0.99      0.99      0.99     27610
weighted avg       0.99      0.99      0.99     27610

Random Forest Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1 Score: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19304
           1       0.99      0.99      0.99      8306

    accuracy                           0.99     27610
   macro avg       0.99      0.99      0.99     27610
weighted avg       0.99      0.99      0.99     27610

Gaussian Naive Bayes Metrics:
Accuracy: 0.70
Precision: 1.00
Recall: 0.00
F1 Score: 0.00
Classification Report:
               precisi

In [17]:
# Pick the name of the model with the highest test accuracy. Every value in 'results'
# is a dict of metrics, and dicts cannot be ordered, so the comparison key has to be a
# single metric rather than the whole entry.
winner = max(results, key=lambda name: results[name]['Accuracy'])

Saving the model

In [18]:
import os
import joblib

# Create the output directory if it is missing; exist_ok=True makes this a no-op when
# the directory already exists.
directory = 'classifier'
os.makedirs(directory, exist_ok=True)

# 'winner' is a key of 'results', which was last filled from the 'models' dictionary,
# so the fitted estimator has to be looked up in 'models' (not in 'model').
joblib.dump(models[winner], os.path.join(directory, 'classifier.pkl'))


['classifier\\classifier.pkl']

In [19]:
# Persist the list of selected feature names next to the classifier.
with open('classifier/features.pkl', mode='wb') as features_file:
    pickle.dump(features, features_file)


Calculating the False positive and negative on the dataset

In [20]:
# 'winner' is a key of 'results', which was last filled from the 'models' dictionary,
# so the fitted estimator is looked up there.
clf = models[winner]

# NOTE(review): 'X_new' is the whole dataset, including the rows the classifier was
# trained on, so these rates are more optimistic than rates measured on X_test only.
res = clf.predict(X_new)

# confusion_matrix rows are the true labels and columns the predicted labels, in label
# order (0, 1). With label 1 meaning 'legitimate', mt[0][1] counts label-0 files that
# were predicted as legitimate and mt[1][0] counts legitimate files predicted as label 0.
mt = confusion_matrix(y, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

False positive rate : 0.085811 %
False negative rate : 0.154877 %


In [21]:
import os
import joblib
import pickle

# Reload the persisted classifier from disk.
clf = joblib.load('classifier/classifier.pkl')

# Reload the list of feature names the classifier expects.
with open('classifier/features.pkl', mode='rb') as features_file:
    features = pickle.load(features_file)


##### Testing with unseen file
Given any unseen test file, it's required to extract the characteristics of the given file.  

To do so, Python's `pefile` library (`pefile.PE`) is used to parse the file and build the feature vector, and the previously trained ML model predicts the class of the given file.

In [22]:
import pefile
import os
import array
import math
import pickle
import joblib
import sys

def get_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per symbol.

    ``data`` may be a bytes-like object (iteration yields ints) or a str (each
    character is converted with ``ord``). Empty input yields 0.0.
    """
    if not data:
        return 0.0

    # One counter per possible byte value.
    histogram = array.array('L', [0] * 256)
    for symbol in data:
        code = symbol if isinstance(symbol, int) else ord(symbol)
        histogram[code] += 1

    total = len(data)
    terms = []
    for count in histogram:
        p_x = float(count) / total
        if p_x:  # byte values that never occur contribute nothing
            terms.append(-p_x * math.log(p_x, 2))
    return sum(terms)

def get_resources(pe):
    """Return ``[entropy, size]`` for every resource entry found in ``pe``.

    Walks the three nested levels of ``pe.DIRECTORY_ENTRY_RESOURCE``. The walk is
    best-effort: if anything raises part-way through, the entries collected up to
    that point are returned. A ``pe`` without a resource directory yields ``[]``.
    """
    resources = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return resources

    try:
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    info = lang_entry.data.struct
                    blob = pe.get_data(info.OffsetToData, info.Size)
                    resources.append([get_entropy(blob), info.Size])
    except Exception:
        # Deliberately swallowed: keep whatever was collected before the failure.
        pass
    return resources

def get_version_info(pe):
    """Collect the version-information entries of ``pe`` into one dict.

    The result holds every StringFileInfo string-table entry, the first item of
    each VarFileInfo variable and, when ``pe.VS_FIXEDFILEINFO`` is present, seven
    fixed-file-info fields. Callers only use the number of entries.

    Both ``FileInfo`` layouts are accepted: a flat list of entries, or a list of
    lists of entries; ``VS_FIXEDFILEINFO`` may be a single structure or a list.
    ``Key`` values may be ``str`` or ``bytes``.

    Raises:
        AttributeError: if ``pe`` has no ``FileInfo`` attribute (callers treat
            this as "no version information").
    """
    res = {}

    # Flatten one level so both FileInfo layouts are walked the same way.
    file_infos = []
    for item in pe.FileInfo:
        if isinstance(item, (list, tuple)):
            file_infos.extend(item)
        else:
            file_infos.append(item)

    for fileinfo in file_infos:
        key = fileinfo.Key
        if isinstance(key, bytes):
            key = key.decode('ascii', 'replace')

        if key == 'StringFileInfo':
            # Only StringFileInfo entries carry a StringTable.
            for st in fileinfo.StringTable:
                res.update(st.entries)
        elif key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict views are not indexable on Python 3, so materialise first.
                entry_items = list(var.entry.items())
                if entry_items:
                    name, value = entry_items[0]
                    res[name] = value

    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        fixed = pe.VS_FIXEDFILEINFO
        if isinstance(fixed, (list, tuple)):
            fixed = fixed[0] if fixed else None
        if fixed is not None:
            res.update({
                'flags': fixed.FileFlags,
                'os': fixed.FileOS,
                'type': fixed.FileType,
                'file_version': fixed.FileVersionLS,
                'product_version': fixed.ProductVersionLS,
                'signature': fixed.Signature,
                'struct_version': fixed.StrucVersion
            })
    return res

def _mean_min_max(values):
    """Return ``(mean, min, max)`` of ``values``, or ``(0, 0, 0)`` when empty."""
    values = list(values)
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


def extract_infos(fpath):
    """Parse the PE file at ``fpath`` and return its features as a dict.

    The dict maps feature names (header fields plus section, import, export,
    resource, load-config and version-info statistics) to numeric values.
    Statistics over an empty collection (no sections, no resources) are 0.
    The PE handle is closed before returning, also when extraction fails.
    """
    pe = pefile.PE(fpath)
    try:
        optional_header = pe.OPTIONAL_HEADER
        file_header = pe.FILE_HEADER

        res = {
            'Machine': file_header.Machine,
            'SizeOfOptionalHeader': file_header.SizeOfOptionalHeader,
            'Characteristics': file_header.Characteristics,
            'MajorLinkerVersion': optional_header.MajorLinkerVersion,
            'MinorLinkerVersion': optional_header.MinorLinkerVersion,
            'SizeOfCode': optional_header.SizeOfCode,
            'SizeOfInitializedData': optional_header.SizeOfInitializedData,
            'SizeOfUninitializedData': optional_header.SizeOfUninitializedData,
            'AddressOfEntryPoint': optional_header.AddressOfEntryPoint,
            'BaseOfCode': optional_header.BaseOfCode,
            # Falls back to 0 when the optional header has no BaseOfData attribute.
            'BaseOfData': getattr(optional_header, 'BaseOfData', 0),
            'ImageBase': optional_header.ImageBase,
            'SectionAlignment': optional_header.SectionAlignment,
            'FileAlignment': optional_header.FileAlignment,
            'MajorOperatingSystemVersion': optional_header.MajorOperatingSystemVersion,
            'MinorOperatingSystemVersion': optional_header.MinorOperatingSystemVersion,
            'MajorImageVersion': optional_header.MajorImageVersion,
            'MinorImageVersion': optional_header.MinorImageVersion,
            'MajorSubsystemVersion': optional_header.MajorSubsystemVersion,
            'MinorSubsystemVersion': optional_header.MinorSubsystemVersion,
            'SizeOfImage': optional_header.SizeOfImage,
            'SizeOfHeaders': optional_header.SizeOfHeaders,
            'CheckSum': optional_header.CheckSum,
            'Subsystem': optional_header.Subsystem,
            'DllCharacteristics': optional_header.DllCharacteristics,
            'SizeOfStackReserve': optional_header.SizeOfStackReserve,
            'SizeOfStackCommit': optional_header.SizeOfStackCommit,
            'SizeOfHeapReserve': optional_header.SizeOfHeapReserve,
            'SizeOfHeapCommit': optional_header.SizeOfHeapCommit,
            'LoaderFlags': optional_header.LoaderFlags,
            'NumberOfRvaAndSizes': optional_header.NumberOfRvaAndSizes,
        }

        # Sections: a file with no sections gets 0 for every statistic instead of
        # failing with a division by zero / min() of an empty sequence.
        sections = pe.sections
        entropy_mean, entropy_min, entropy_max = _mean_min_max(
            x.get_entropy() for x in sections)
        raw_mean, raw_min, raw_max = _mean_min_max(
            x.SizeOfRawData for x in sections)
        virt_mean, virt_min, virt_max = _mean_min_max(
            x.Misc_VirtualSize for x in sections)
        res.update({
            'SectionsNb': len(sections),
            'SectionsMeanEntropy': entropy_mean,
            'SectionsMinEntropy': entropy_min,
            'SectionsMaxEntropy': entropy_max,
            'SectionsMeanRawsize': raw_mean,
            'SectionsMinRawsize': raw_min,
            'SectionsMaxRawsize': raw_max,
            'SectionsMeanVirtualsize': virt_mean,
            'SectionsMinVirtualsize': virt_min,
            # NOTE(review): key spelling ('Section', not 'Sections') kept as-is;
            # presumably it matches the training dataset column - confirm.
            'SectionMaxVirtualsize': virt_max,
        })

        # Imports: a missing import directory raises AttributeError.
        try:
            imports = sum((x.imports for x in pe.DIRECTORY_ENTRY_IMPORT), [])
            res.update({
                'ImportsNbDLL': len(pe.DIRECTORY_ENTRY_IMPORT),
                'ImportsNb': len(imports),
                'ImportsNbOrdinal': len([x for x in imports if x.name is None]),
            })
        except AttributeError:
            res.update({'ImportsNbDLL': 0, 'ImportsNb': 0, 'ImportsNbOrdinal': 0})

        # Exports
        try:
            res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        except AttributeError:
            res['ExportNb'] = 0

        # Resources: each item is [entropy, size].
        resources = get_resources(pe)
        res_entropy_mean, res_entropy_min, res_entropy_max = _mean_min_max(
            x[0] for x in resources)
        res_size_mean, res_size_min, res_size_max = _mean_min_max(
            x[1] for x in resources)
        res.update({
            'ResourcesNb': len(resources),
            'ResourcesMeanEntropy': res_entropy_mean,
            'ResourcesMinEntropy': res_entropy_min,
            'ResourcesMaxEntropy': res_entropy_max,
            'ResourcesMeanSize': res_size_mean,
            'ResourcesMinSize': res_size_min,
            'ResourcesMaxSize': res_size_max,
        })

        # Load configuration size
        try:
            res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
        except AttributeError:
            res['LoadConfigurationSize'] = 0

        # Version information "size" is the number of collected entries.
        try:
            version_infos = get_version_info(pe)
            res['VersionInformationSize'] = len(version_infos)
        except AttributeError:
            res['VersionInformationSize'] = 0

        return res
    finally:
        # Release the file handle / memory map held by pefile.
        pe.close()

if __name__ == '__main__':
    # Load the persisted classifier and the feature names it expects.
    # NOTE: pickle/joblib files can execute code when loaded - only load artifacts
    # produced by this notebook.
    clf = joblib.load('classifier/classifier.pkl')
    with open('classifier/features.pkl', 'rb') as features_file:
        features = pickle.load(features_file)

    # Provide the file path directly here in the notebook
    file_path = 'mspaint.exe'  # Replace with the actual path
    data = extract_infos(file_path)
    pe_features = [data[x] for x in features]
    res = clf.predict([pe_features])[0]

    # The classifier was trained on the 'legitimate' column, so 1 means legitimate
    # and 0 means malicious.
    print(f'The file {os.path.basename(file_path)} is {"legitimate" if res else "malicious"}')



The file mspaint.exe is legitimate


In [36]:
import pefile
import os
import array
import math
import pickle
import joblib

def get_entropy(data):
    """Return the Shannon entropy of ``data`` in bits per symbol (0 if empty)."""
    # Relative frequency of every distinct symbol, evaluated lazily.
    probabilities = (float(data.count(symbol)) / len(data) for symbol in set(data))
    return sum(-p_x * math.log(p_x, 2) for p_x in probabilities if p_x)

def get_resources(pe):
    """Return ``[entropy, size]`` for every resource entry found in ``pe``.

    Walks the three nested levels of ``pe.DIRECTORY_ENTRY_RESOURCE``. The result
    is all-or-nothing: if anything raises during the walk, ``[]`` is returned.
    A ``pe`` without a resource directory also yields ``[]``.
    """
    resources = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return resources

    try:
        collected = []
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    info = lang_entry.data.struct
                    blob = pe.get_data(info.OffsetToData, info.Size)
                    collected.append([get_entropy(blob), info.Size])
        # Only publish the entries once the whole walk has succeeded.
        resources = collected
    except Exception:
        pass
    return resources

def get_version_info(pe):
    """Collect the version-information entries of ``pe`` into one dict.

    The result holds every StringFileInfo string-table entry, the first item of
    each VarFileInfo variable and, when ``pe.VS_FIXEDFILEINFO`` is present, seven
    fixed-file-info fields keyed by their attribute names. Callers only use the
    number of entries.

    Both ``FileInfo`` layouts are accepted: a flat list of entries, or a list of
    lists of entries; ``VS_FIXEDFILEINFO`` may be a single structure or a list.
    ``Key`` values may be ``str`` or ``bytes``.

    Raises:
        AttributeError: if ``pe`` has no ``FileInfo`` attribute (callers treat
            this as "no version information").
    """
    res = {}

    # Flatten one level so both FileInfo layouts are walked the same way.
    file_infos = []
    for item in pe.FileInfo:
        if isinstance(item, (list, tuple)):
            file_infos.extend(item)
        else:
            file_infos.append(item)

    for fileinfo in file_infos:
        key = fileinfo.Key
        if isinstance(key, bytes):
            key = key.decode('ascii', 'replace')

        if key == 'StringFileInfo':
            # Only StringFileInfo entries carry a StringTable.
            for st in fileinfo.StringTable:
                res.update(st.entries)
        elif key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict views are not indexable on Python 3, so materialise first.
                entry_items = list(var.entry.items())
                if entry_items:
                    name, value = entry_items[0]
                    res[name] = value

    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        fixed = pe.VS_FIXEDFILEINFO
        if isinstance(fixed, (list, tuple)):
            fixed = fixed[0] if fixed else None
        if fixed is not None:
            res.update({field: getattr(fixed, field) for field in ['FileFlags', 'FileOS', 'FileType',
                                                                    'FileVersionLS', 'ProductVersionLS',
                                                                    'Signature', 'StrucVersion']})
    return res

# Header fields copied verbatim into the feature dict, in output order.
_FILE_HEADER_FIELDS = ('Machine', 'SizeOfOptionalHeader', 'Characteristics')
_OPTIONAL_HEADER_FIELDS = (
    'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
    'SizeOfInitializedData', 'SizeOfUninitializedData', 'AddressOfEntryPoint',
    'BaseOfCode', 'BaseOfData', 'ImageBase', 'SectionAlignment', 'FileAlignment',
    'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion',
    'MajorImageVersion', 'MinorImageVersion', 'MajorSubsystemVersion',
    'MinorSubsystemVersion', 'SizeOfImage', 'SizeOfHeaders', 'CheckSum',
    'Subsystem', 'DllCharacteristics', 'SizeOfStackReserve', 'SizeOfStackCommit',
    'SizeOfHeapReserve', 'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes',
)


def _mean_min_max(values):
    """Return ``(mean, min, max)`` of ``values``, or ``(0, 0, 0)`` when empty."""
    values = list(values)
    if not values:
        return 0, 0, 0
    return sum(values) / float(len(values)), min(values), max(values)


def extract_infos(fpath):
    """Parse the PE file at ``fpath`` and return its features as a dict.

    The dict maps feature names (header fields plus section, import, export,
    resource, load-config and version-info statistics) to numeric values.
    Statistics over an empty collection (no sections, no resources) are 0.
    The PE handle is closed before returning, also when extraction fails.
    """
    pe = pefile.PE(fpath)
    try:
        optional_header, file_header = pe.OPTIONAL_HEADER, pe.FILE_HEADER

        res = {name: getattr(file_header, name) for name in _FILE_HEADER_FIELDS}
        for name in _OPTIONAL_HEADER_FIELDS:
            if name == 'BaseOfData':
                # Falls back to 0 when the optional header has no such attribute.
                res[name] = getattr(optional_header, name, 0)
            else:
                res[name] = getattr(optional_header, name)

        # Sections: a file with no sections gets 0 for every statistic instead of
        # failing with a division by zero / min() of an empty sequence.
        sections = pe.sections
        entropy_mean, entropy_min, entropy_max = _mean_min_max(
            x.get_entropy() for x in sections)
        raw_mean, raw_min, raw_max = _mean_min_max(
            x.SizeOfRawData for x in sections)
        virt_mean, virt_min, virt_max = _mean_min_max(
            x.Misc_VirtualSize for x in sections)
        res.update({
            'SectionsNb': len(sections),
            'SectionsMeanEntropy': entropy_mean,
            'SectionsMinEntropy': entropy_min,
            'SectionsMaxEntropy': entropy_max,
            'SectionsMeanRawsize': raw_mean,
            'SectionsMinRawsize': raw_min,
            'SectionsMaxRawsize': raw_max,
            'SectionsMeanVirtualsize': virt_mean,
            'SectionsMinVirtualsize': virt_min,
            # NOTE(review): key spelling ('Section', not 'Sections') kept as-is;
            # presumably it matches the training dataset column - confirm.
            'SectionMaxVirtualsize': virt_max,
        })

        # Imports: a missing import directory raises AttributeError.
        try:
            imports = sum((x.imports for x in pe.DIRECTORY_ENTRY_IMPORT), [])
            res.update({
                'ImportsNbDLL': len(pe.DIRECTORY_ENTRY_IMPORT),
                'ImportsNb': len(imports),
                'ImportsNbOrdinal': len([x for x in imports if x.name is None]),
            })
        except AttributeError:
            res.update({'ImportsNbDLL': 0, 'ImportsNb': 0, 'ImportsNbOrdinal': 0})

        try:
            res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
        except AttributeError:
            res['ExportNb'] = 0

        # Resources: each item is [entropy, size].
        resources = get_resources(pe)
        res_entropy_mean, res_entropy_min, res_entropy_max = _mean_min_max(
            x[0] for x in resources)
        res_size_mean, res_size_min, res_size_max = _mean_min_max(
            x[1] for x in resources)
        res.update({
            'ResourcesNb': len(resources),
            'ResourcesMeanEntropy': res_entropy_mean,
            'ResourcesMinEntropy': res_entropy_min,
            'ResourcesMaxEntropy': res_entropy_max,
            'ResourcesMeanSize': res_size_mean,
            'ResourcesMinSize': res_size_min,
            'ResourcesMaxSize': res_size_max,
        })

        try:
            res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
        except AttributeError:
            res['LoadConfigurationSize'] = 0

        # Version information "size" is the number of collected entries.
        try:
            version_infos = get_version_info(pe)
            res['VersionInformationSize'] = len(version_infos)
        except AttributeError:
            res['VersionInformationSize'] = 0

        return res
    finally:
        # Release the file handle / memory map held by pefile.
        pe.close()

if __name__ == '__main__':
    # Load the persisted classifier and the feature names it expects.
    # NOTE: pickle/joblib files can execute code when loaded - only load artifacts
    # produced by this notebook.
    clf = joblib.load('classifier/classifier.pkl')
    with open('classifier/features.pkl', 'rb') as features_file:
        features = pickle.load(features_file)

    file_path = 'mspaint.exe'  # Replace with the actual path
    res = clf.predict([[extract_infos(file_path)[x] for x in features]])[0]

    # The classifier was trained on the 'legitimate' column, so 1 means legitimate
    # and 0 means malicious.
    print(f'The file {os.path.basename(file_path)} is {"legitimate" if res else "malicious"}')


The file mspaint.exe is legitimate


Let's run the program to test the file - mspaint.exe

In [None]:
# NOTE(review): requires malware_test.py in the notebook's working directory; the saved
# output of this cell shows the script was not found when it was last run.
%run malware_test.py "mspaint.exe"

Exception: File `'malware_test.py'` not found.

To test a malicious file, a sample application was downloaded from malwr.com

In [None]:
# NOTE(review): the sample path is an absolute path on the original author's machine;
# point it at a local sample before running. Also requires malware_test.py in the
# notebook's working directory.
%run malware_test.py "/home/surajr/Downloads/BCN12ui49823.exe"

The file BCN12ui49823.exe is malicious
