In [1]:
pip install pefile


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pefile
import hashlib
import re

def extract_features(file_path):
    features = {}
    
    # FileName
    features['FileName'] = file_path
  # md5Hash
    with open(file_path, "rb") as f:
        file_content = f.read()
        features['md5Hash'] = hashlib.md5(file_content).hexdigest()
    
    pe = pefile.PE(file_path)
    
    # Machine
    features['Machine'] = pe.FILE_HEADER.Machine
    
    # DebugSize and DebugRVA
    features['DebugSize'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']].Size

    features['DebugRVA'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_DEBUG']].VirtualAddress
    
    # MajorImageVersion
    features['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    
    # MajorOSVersion
    features['MajorOSVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    
    # ExportRVA and ExportSize
    features['ExportRVA'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']].VirtualAddress
    features['ExportSize'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_EXPORT']].Size
        # IatVRA
    features['IatVRA'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_IAT']].VirtualAddress
    
    # MajorLinkerVersion and MinorLinkerVersion
    features['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    features['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    
    # NumberOfSections
    features['NumberOfSections'] = pe.FILE_HEADER.NumberOfSections
    
    # SizeOfStackReserve
    features['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
        # DllCharacteristics
    features['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    
    # ResourceSize
    features['ResourceSize'] = pe.OPTIONAL_HEADER.DATA_DIRECTORY[pefile.DIRECTORY_ENTRY['IMAGE_DIRECTORY_ENTRY_RESOURCE']].Size
    
    # BitcoinAddresses
    bitcoin_regex = re.compile(r'([13][a-km-zA-HJ-NP-Z1-9]{25,34})')
    features['BitcoinAddresses'] = bitcoin_regex.findall(file_content.decode('utf-8', errors='ignore'))
    
    # Benign (this would normally be provided by your labeled dataset)
    features['Benign'] = 0  # Example default value
    
    return features

# Example usage
file_path = r'C:\Program Files (x86)\RapidTyping 5\RapidTyping.exe' #a single exe is tested just as an example
features = extract_features(file_path)
print(features)


{'FileName': 'C:\\Program Files (x86)\\RapidTyping 5\\RapidTyping.exe', 'md5Hash': 'bf2cd6de94b4ed9d407eb713d0b19161', 'Machine': 332, 'DebugSize': 28, 'DebugRVA': 1738592, 'MajorImageVersion': 0, 'MajorOSVersion': 5, 'ExportRVA': 0, 'ExportSize': 0, 'IatVRA': 1552384, 'MajorLinkerVersion': 14, 'MinorLinkerVersion': 0, 'NumberOfSections': 6, 'SizeOfStackReserve': 1048576, 'DllCharacteristics': 33024, 'ResourceSize': 185624, 'BitcoinAddresses': ['333333333333333333333333333333n3333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333333333333333333', '33333333333333333333GV3333333333333', '3333333333kj533333333333333333333L', '333333333333333333P3333333333333333', '3333333333333333aK3333333333333333', '33333333333333ya33333fo833333333333', '3333337gVis633333333333333333333333', '333333

# For saving extracted features in Custom Dataset

In [10]:
#save the features to a file data.csv
import csv
custom_datasetPath= r'C:\Users\admin\Desktop\RansomwareDetection\Dataset\data_file.csv\customDataset.csv'
with open(custom_datasetPath, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=features.keys())
    writer.writeheader()
    writer.writerow(features)


In [None]:
!pip install xgboost



In [30]:
!pip install joblib

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Load and preprocess Data

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import joblib
from scipy.sparse import hstack

# Load dataset
df = pd.read_csv(r'C:\Users\admin\Desktop\RansomwareDetection\Dataset\data_file.csv\data_file.csv')

# # Inspect the dataset
# print(df.head())
# print(df.info())

# Separate features and labels
X = df.drop('Benign', axis=1)  # Assuming 'Benign' is the label column
y = df['Benign']

# Identify non-numeric columns
non_numeric_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numeric data
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, non_numeric_cols)])

# Apply preprocessing to the dataset
X_preprocessed = preprocessor.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

print('The shape of X_train is {0} and shape of y_train is {1}'.format(X_train.shape, y_train.shape))

# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,  # Number of boosting rounds
    max_depth=5,       # Maximum depth of a tree
    learning_rate=0.1, # Step size shrinkage
    subsample=0.8,     # Subsample ratio of the training instances
    colsample_bytree=0.8 # Subsample ratio of columns when constructing each tree
)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))

# Save the model
joblib.dump(model, 'xgboost_ransomware_model.pkl')

# Load the model
model = joblib.load('xgboost_ransomware_model.pkl')


The shape of X_train is (49988, 124985) and shape of y_train is (49988,)
Confusion Matrix:
[[7102   23]
 [  37 5335]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7125
           1       1.00      0.99      0.99      5372

    accuracy                           1.00     12497
   macro avg       1.00      0.99      1.00     12497
weighted avg       1.00      1.00      1.00     12497


Accuracy Score:
0.9951988477234537
