# Improved Malware Detection Model (v2)

**Improvements over v1 (ember_malware_detection_clean.ipynb):**
1. **Full training data** - Uses every labeled sample from the 800k-sample training set (unlabeled rows are dropped) instead of 300k
2. **No PCA** - Keeps all 2,381 features (PCA was losing detection-critical info)
3. **Better hyperparameters** - More trees, regularization, early stopping
4. **LightGBM option** - Faster training, often better performance

**Expected Results:**
- EMBER test accuracy: 95-98%
- Training time: ~45-60 min on CPU, ~15 min on GPU

In [None]:
# Install dependencies
# All packages are requested by name except ember, which is installed
# directly from the elastic/ember GitHub repository.
!pip install joblib numpy pandas matplotlib seaborn scikit-learn xgboost lightgbm lief git+https://github.com/elastic/ember.git

In [None]:
import os
import shutil
import time
import json
import joblib
from pathlib import Path

import ember
import ember.features
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, roc_curve, classification_report
)
import xgboost as xgb
import lightgbm as lgb

# Monkey patch for ember compatibility
def fixed_section_info_process_raw_features(self, raw_obj):
    """Build the section-information feature vector for one sample.

    Written as a drop-in method body: ``self`` is accepted but never read.

    Args:
        raw_obj: mapping indexed with ``'sections'`` and ``'entry'``. Each
            section is read through the keys ``name``, ``size``, ``entropy``,
            ``vsize`` and ``props``.

    Returns:
        1-D ``float32`` array: five summary counts followed by five hashed
        groups (sizes, entropies, virtual sizes, entry name, entry-section
        properties), each produced by a 50-feature ``FeatureHasher``.
    """
    def hash_row(items, kind):
        # One sample in, one dense row out.
        hasher = FeatureHasher(50, input_type=kind)
        return hasher.transform([items]).toarray()[0]

    secs = raw_obj['sections']
    entry_name = raw_obj['entry']

    # Summary counts; booleans add up as 0/1.
    counts = [
        len(secs),
        sum(sec['size'] == 0 for sec in secs),
        sum(sec['name'] == "" for sec in secs),
        sum(('MEM_READ' in sec['props']) and ('MEM_EXECUTE' in sec['props']) for sec in secs),
        sum('MEM_WRITE' in sec['props'] for sec in secs),
    ]

    # (name, value) pairs hashed per attribute.
    sizes_row = hash_row([(sec['name'], sec['size']) for sec in secs], "pair")
    entropy_row = hash_row([(sec['name'], sec['entropy']) for sec in secs], "pair")
    vsize_row = hash_row([(sec['name'], sec['vsize']) for sec in secs], "pair")

    # The entry name is wrapped in a list so it is hashed as a single token.
    entry_row = hash_row([entry_name], "string")

    # Properties of every section whose name equals the entry name.
    entry_props = []
    for sec in secs:
        if sec['name'] == entry_name:
            entry_props.extend(sec['props'])
    props_row = hash_row(entry_props, "string")

    pieces = [counts, sizes_row, entropy_row, vsize_row, entry_row, props_row]
    return np.hstack(pieces).astype(np.float32)

# Swap in the replacement extractor only when this ember build exposes
# a SectionInfo class; otherwise leave ember untouched.
if hasattr(ember.features, 'SectionInfo'):
    setattr(
        ember.features.SectionInfo,
        'process_raw_features',
        fixed_section_info_process_raw_features,
    )
    print("Monkey patch applied.")

# Notebook-wide configuration
RANDOM_SEED = 42
DATA_DIR = Path("./ember_data")
RESULTS_DIR = Path("./results_improved")
# Create the output directory (and any missing parents) up front.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

sns.set_style("whitegrid")
print("Setup complete.")

In [None]:
# Download and setup EMBER dataset (same as v1)
def setup_ember_dataset():
    if not DATA_DIR.exists():
        DATA_DIR.mkdir(parents=True)
    
    if not (DATA_DIR / "train_features.jsonl").exists() and not (DATA_DIR / "y_train.dat").exists():
        print("Downloading EMBER 2018 dataset...")
        if not os.path.exists("ember_dataset_2018_2.tar.bz2"):
            !wget https://ember.elastic.co/ember_dataset_2018_2.tar.bz2
        print("Extracting...")
        !tar -xvf ember_dataset_2018_2.tar.bz2
        source_dir = Path("ember2018")
        if source_dir.exists():
            for file_path in source_dir.iterdir():
                shutil.move(str(file_path), str(DATA_DIR / file_path.name))
            shutil.rmtree(str(source_dir))
        if Path("ember_dataset_2018_2.tar.bz2").exists():
            os.remove("ember_dataset_2018_2.tar.bz2")
    else:
        print("Dataset present.")
    
    required_files = ["X_train.dat", "y_train.dat", "X_test.dat", "y_test.dat"]
    if not all((DATA_DIR / f).exists() for f in required_files):
        print("Vectorizing features...")
        ember.create_vectorized_features(str(DATA_DIR))
    else:
        print("Vectorized features exist.")

setup_ember_dataset()

In [None]:
# Load ALL training data (no subsampling!)
print("Loading EMBER data (full dataset)...")

X_train, y_train = ember.read_vectorized_features(str(DATA_DIR), subset="train")
X_test, y_test = ember.read_vectorized_features(str(DATA_DIR), subset="test")

# Filter unlabeled (y != -1). Boolean-mask indexing builds new arrays, so
# the objects returned by read_vectorized_features are left untouched.
train_mask = y_train != -1
test_mask = y_test != -1

X_train = X_train[train_mask]
y_train = y_train[train_mask]
X_test = X_test[test_mask]
y_test = y_test[test_mask]

# Handle NaNs in place: the filtered matrices are already private copies,
# so a second full-size copy (the nan_to_num default) only raises peak memory.
X_train = np.nan_to_num(X_train, copy=False, nan=0.0)
X_test = np.nan_to_num(X_test, copy=False, nan=0.0)

print(f"Training samples: {len(y_train):,}")
print(f"Test samples: {len(y_test):,}")
print(f"Features: {X_train.shape[1]:,}")
print(f"Class distribution: {dict(zip(*np.unique(y_train, return_counts=True)))}")

In [None]:
# Scale features (NO PCA - keep all features!)
print("Scaling features (keeping all 2,381 features)...")

# Fit the scaler on the training matrix only, then push both splits
# through the same fitted transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# No dimensionality reduction: the scaled matrices are used as they are.
X_train_final, X_test_final = X_train_scaled, X_test_scaled

for split_name, matrix in (("training", X_train_final), ("test", X_test_final)):
    print(f"Final {split_name} shape: {matrix.shape}")

## Model Training Options

Choose ONE of the following cells to run (both assign `model`, so the evaluation and save cells use whichever one ran last):
- **Option A: XGBoost** - More accurate, slower
- **Option B: LightGBM** - Faster, similar accuracy

In [None]:
# OPTION A: XGBoost with improved hyperparameters
print("Training XGBoost (improved params)...")

# Hold out a stratified 10% of the training rows; it is used only as the
# early-stopping evaluation set.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_final,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=RANDOM_SEED,
)

xgb_params = {
    # Ensemble size and step size: many trees, small learning rate
    "n_estimators": 1500,
    "learning_rate": 0.03,
    # Tree complexity
    "max_depth": 12,
    "min_child_weight": 5,
    "gamma": 0.1,
    # Row / feature sampling per tree
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # L1 / L2 penalties
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    # Objective and early stopping on the validation log-loss
    "objective": 'binary:logistic',
    "eval_metric": 'logloss',
    "early_stopping_rounds": 50,
    # Reproducibility and parallelism
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
    # Original note: use 'gpu_hist' if a GPU is available.
    # NOTE(review): newer XGBoost releases select the GPU with
    # device='cuda' instead - confirm against the installed version.
    "tree_method": 'hist',
}
model = xgb.XGBClassifier(**xgb_params)

start_time = time.time()
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=100)
train_time = time.time() - start_time

print(f"\nTraining completed in {train_time/60:.1f} minutes.")
print(f"Best iteration: {model.best_iteration}")

In [None]:
# OPTION B: LightGBM (faster alternative)
print("Training LightGBM...")

# Hold out a stratified 10% of the training rows for early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_final, y_train, test_size=0.1, random_state=RANDOM_SEED, stratify=y_train
)

model = lgb.LGBMClassifier(
    n_estimators=2000,
    max_depth=15,
    learning_rate=0.03,
    num_leaves=256,
    subsample=0.8,
    # Row subsampling is only active when the bagging frequency is > 0;
    # with the default of 0 the subsample=0.8 setting is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbose=-1
)

start_time = time.time()
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    # Stop after 50 rounds without validation improvement; log every 100.
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)
train_time = time.time() - start_time
print(f"\nTraining completed in {train_time/60:.1f} minutes.")
# Same report as the XGBoost option, so the two runs can be compared.
print(f"Best iteration: {model.best_iteration_}")

In [None]:
# Evaluation on EMBER test set
print("Evaluating on EMBER test set...")

# Hard labels feed the threshold metrics; positive-class scores feed ROC-AUC.
y_pred = model.predict(X_test_final)
y_pred_proba = model.predict_proba(X_test_final)[:, 1]

# Metrics
acc, prec, rec, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_pred_proba)

rule = "=" * 50
print("\n" + rule)
print("EMBER TEST SET RESULTS")
print(rule)
print(f"Accuracy:  {acc:.4f} ({acc*100:.2f}%)")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"ROC-AUC:   {auc:.4f}")
print(rule)

# Detailed report
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=['Benign', 'Malicious'])
print(report)

In [None]:
# Visualization: confusion matrix on the left, ROC curve on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
cm_ax, roc_ax = axes
class_names = ['Benign', 'Malicious']

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=cm_ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
cm_ax.set(
    title=f'Confusion Matrix (Accuracy: {acc*100:.1f}%)',
    xlabel='Predicted',
    ylabel='Actual',
)

# ROC Curve, with the chance diagonal for reference
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_ax.plot(fpr, tpr, 'b-', linewidth=2, label=f'ROC (AUC = {auc:.3f})')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1)
roc_ax.fill_between(fpr, tpr, alpha=0.3)
roc_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
roc_ax.legend(loc='lower right')
roc_ax.grid(True, alpha=0.3)

plt.tight_layout()
# Write the figure to disk before displaying it.
plt.savefig(RESULTS_DIR / 'evaluation_plots_improved.png', dpi=150)
plt.show()

In [None]:
# Save improved model (to separate directory)
print("Saving model to ./results_improved/")

# Determine model type: anything that is not an XGBClassifier is recorded
# as LightGBM.
if isinstance(model, xgb.XGBClassifier):
    model_type = "XGBoost"
else:
    model_type = "LightGBM"

# Persist the fitted estimator together with the scaler fitted earlier.
for artifact, filename in ((model, 'model_improved.pkl'), (scaler, 'scaler_improved.pkl')):
    joblib.dump(artifact, RESULTS_DIR / filename)

# Save config (keys are inserted in the order they appear in the JSON file)
improvements = [
    "Full training data (800k vs 300k)",
    "No PCA (all 2381 features)",
    "Better hyperparameters",
    "Early stopping",
]
config = {
    "model_type": model_type,
    "feature_set": "EMBER 2018",
    "preprocessing": "StandardScaler (NO PCA)",
    "n_features": X_train.shape[1],
    "training_samples": len(y_train),
}
# Metrics are cast to plain floats so they serialize as JSON numbers.
config.update({
    "test_accuracy": float(acc),
    "test_precision": float(prec),
    "test_recall": float(rec),
    "test_f1": float(f1),
    "test_auc": float(auc),
})
config["training_time_minutes"] = train_time / 60
config["improvements_over_v1"] = improvements

config_path = RESULTS_DIR / "experiment_params_improved.json"
config_path.write_text(json.dumps(config, indent=4))

print(f"\nModel saved! Results in {RESULTS_DIR}/")
print("\nTo use this model in the backend, update main.py to load from 'results_improved/'")

## Comparison: v1 vs v2

| Aspect | v1 (clean notebook) | v2 (improved) |
|--------|---------------------|---------------|
| Training samples | 300,000 | 800,000 (minus unlabeled rows) |
| Features used | 1,312 (PCA) | 2,381 (all) |
| Expected accuracy | ~93% | ~96-98% |
| Training time | ~32 min | ~45-60 min |
| Model size | ~40 MB | ~60-80 MB |