In [3]:
import pandas as pd  

# Load dataset (update the path if needed)
df = pd.read_csv("BitcoinHeistData.csv")

# Display the first few rows
print(df.head())

# Check dataset info.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the summary.
df.info()


                              address  year  day  length    weight  count  \
0   111K8kZAEnJg245r2cM6y9zgJGHZtJPy6  2017   11      18  0.008333      1   
1  1123pJv8jzeFQaCV4w644pzQJzVWay2zcA  2016  132      44  0.000244      1   
2  112536im7hy6wtKbpH1qYDWtTyMRAcA2p7  2016  246       0  1.000000      1   
3  1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7  2016  322      72  0.003906      1   
4  1129TSjKtx65E35GiUo4AYVeyo48twbrGX  2016  238     144  0.072848    456   

   looped  neighbors       income            label  
0       0          2  100050000.0  princetonCerber  
1       0          1  100000000.0   princetonLocky  
2       0          2  200000000.0  princetonCerber  
3       0          2   71200000.0  princetonCerber  
4       0          1  200000000.0   princetonLocky  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 10 columns):
 #   Column     Dtype  
---  ------     -----  
 0   address    object 
 1   year       int64  
 2   day  

In [4]:
# Data-quality pass over `df`: missing values, duplicated rows, label frequencies.

# Per-column count of missing entries
missing_per_column = df.isna().sum()
print(missing_per_column)

# Number of rows flagged as duplicates of an earlier row
duplicate_rows = df.duplicated().sum()
print(f"Total Duplicates: {duplicate_rows}")

# How many rows carry each distinct value of the 'label' column
label_counts = df['label'].value_counts()
print(label_counts)


address      0
year         0
day          0
length       0
weight       0
count        0
looped       0
neighbors    0
income       0
label        0
dtype: int64
Total Duplicates: 0
label
white                          2875284
paduaCryptoWall                  12390
montrealCryptoLocker              9315
princetonCerber                   9223
princetonLocky                    6625
montrealCryptXXX                  2419
montrealNoobCrypt                  483
montrealDMALockerv3                354
montrealDMALocker                  251
montrealSamSam                      62
montrealCryptoTorLocker2015         55
montrealGlobeImposter               55
montrealGlobev3                     34
montrealGlobe                       32
montrealWannaCry                    28
montrealRazy                        13
montrealAPT                         11
paduaKeRanger                       10
montrealFlyper                       9
montrealXTPLocker                    8
montrealVenusLocker            

In [5]:
import numpy as np

# Convert 'label' to a binary target: 'white' -> 0, every other label value -> 1.
# Guarded on dtype so the cell can be re-run safely: once the column is numeric,
# `df['label'] == 'white'` is False for every row and an unguarded re-run would
# silently relabel the whole dataset as class 1.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = np.where(df['label'] == 'white', 0, 1)

# Drop 'address' column (not needed for ML).
# errors='ignore' keeps a second run from raising KeyError after the column is gone.
df = df.drop(columns=['address'], errors='ignore')

# Display updated dataset info.
# DataFrame.info() prints the summary itself and returns None; the previous
# print(df.info()) emitted an extra "None" line.
df.info()

# Check label distribution
print(df['label'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2916697 entries, 0 to 2916696
Data columns (total 9 columns):
 #   Column     Dtype  
---  ------     -----  
 0   year       int64  
 1   day        int64  
 2   length     int64  
 3   weight     float64
 4   count      int64  
 5   looped     int64  
 6   neighbors  int64  
 7   income     float64
 8   label      int64  
dtypes: float64(2), int64(7)
memory usage: 200.3 MB
None
label
0    2875284
1      41413
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the target column from the predictor columns
y = df['label']                  # Target
X = df.drop(columns=['label'])   # Features

# 80/20 hold-out split; stratify=y keeps the label proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler's statistics come from the training split
# only and are then reused unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the resulting array shapes
train_shape = X_train_scaled.shape
test_shape = X_test_scaled.shape
print("Train set size:", train_shape)
print("Test set size:", test_shape)

Train set size: (2333357, 8)
Test set size: (583340, 8)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: random forest with class_weight="balanced"
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)

# Fit on the standardized training split
rf_model.fit(X_train_scaled, y_train)

# Score the held-out split
y_pred = rf_model.predict(X_test_scaled)

# Overall accuracy first, then the per-class precision / recall / f1 table
accuracy = accuracy_score(y_test, y_pred)
print("🔹 Model Accuracy:", accuracy)

rf_report = classification_report(y_test, y_pred)
print("\n🔹 Classification Report:\n", rf_report)


🔹 Model Accuracy: 0.9882195632049919

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    575057
           1       0.71      0.29      0.41      8283

    accuracy                           0.99    583340
   macro avg       0.85      0.64      0.70    583340
weighted avg       0.99      0.99      0.99    583340



In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Resample the training split only; the test split is left untouched
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Second random forest, trained on the resampled data
rf_model_balanced = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    class_weight="balanced",
    random_state=42,
)
rf_model_balanced.fit(X_train_balanced, y_train_balanced)

# Predictions on the original (non-resampled) test split
y_pred_balanced = rf_model_balanced.predict(X_test_scaled)

# Overall accuracy, then the per-class report
balanced_accuracy_value = accuracy_score(y_test, y_pred_balanced)
print("🔹 Balanced Model Accuracy:", balanced_accuracy_value)

balanced_report = classification_report(y_test, y_pred_balanced)
print("\n🔹 Balanced Classification Report:\n", balanced_report)


🔹 Balanced Model Accuracy: 0.9152432543628073

🔹 Balanced Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96    575057
           1       0.12      0.76      0.20      8283

    accuracy                           0.92    583340
   macro avg       0.56      0.84      0.58    583340
weighted avg       0.98      0.92      0.94    583340



In [9]:
from xgboost import XGBClassifier

# Gradient-boosted trees; scale_pos_weight=10 up-weights the positive class
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.05,
    scale_pos_weight=10,
    random_state=42,
)
xgb_model.fit(X_train_scaled, y_train)

# Score the held-out split
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Overall accuracy, then the per-class report
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("🔹 XGBoost Model Accuracy:", xgb_accuracy)

xgb_report = classification_report(y_test, y_pred_xgb)
print("\n🔹 XGBoost Classification Report:\n", xgb_report)


🔹 XGBoost Model Accuracy: 0.9826396269756917

🔹 XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    575057
           1       0.42      0.58      0.49      8283

    accuracy                           0.98    583340
   macro avg       0.71      0.78      0.74    583340
weighted avg       0.99      0.98      0.98    583340



In [10]:
# Second XGBoost configuration.
# NOTE: this rebinds `xgb_model` and `y_pred_xgb`, replacing the objects created
# by the previous XGBoost cell.
final_xgb_params = {
    "n_estimators": 700,
    "max_depth": 20,
    "learning_rate": 0.03,
    "scale_pos_weight": 15,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_model = XGBClassifier(**final_xgb_params)

# Fit on the standardized training split
xgb_model.fit(X_train_scaled, y_train)

# Score the held-out split
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Overall accuracy, then the per-class report
final_xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("🔹 Final XGBoost Model Accuracy:", final_xgb_accuracy)

final_xgb_report = classification_report(y_test, y_pred_xgb)
print("\n🔹 Final XGBoost Classification Report:\n", final_xgb_report)


🔹 Final XGBoost Model Accuracy: 0.9854133095621764

🔹 Final XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    575057
           1       0.49      0.47      0.48      8283

    accuracy                           0.99    583340
   macro avg       0.74      0.73      0.74    583340
weighted avg       0.99      0.99      0.99    583340



In [None]:
# Persist the trained model to disk
import joblib

# Single source of truth for the output path, so the confirmation message
# always names the file that was actually written (it previously reported
# "ransomware_model.pkl" while saving to "bitcoinHeist.pkl").
MODEL_PATH = 'bitcoinHeist.pkl'

# Save the trained model
# NOTE(review): xgb_model was fit on X_train_scaled, so inference on new data
# also needs the fitted `scaler`; it is not persisted by this cell.
joblib.dump(xgb_model, MODEL_PATH)

print(f"✅ Model saved successfully as {MODEL_PATH}")

✅ Model saved successfully as ransomware_model.pkl
