# 🛠️ OS-Based Ransomware Model (Local Compatible Version)
This notebook retrains the OS-level detection model using your current environment's `scikit-learn` version to ensure compatibility.

In [1]:
# Load Data
import pandas as pd

api = pd.read_csv("API_Test.csv")
labels = pd.read_csv("labels.csv")

assert len(api) == len(labels), "Mismatch between API and label rows"

print(f"Loaded {len(api)} API sequences and labels")

✅ Loaded 7106 API sequences and labels


In [2]:
# Convert API sequences to TF-IDF and map labels
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(api.iloc[:, 0])  # Assuming 1st column = API calls

# Convert 'Benign' to 0, everything else to 1
y = labels.iloc[:, 0].str.lower().map(lambda x: 0 if "benign" in x else 1)

print(f" TF-IDF shape: {X.shape}, Labels: {y.nunique()} classes")

✅ TF-IDF shape: (7106, 278), Labels: 1 classes


In [3]:
# Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f" Training set: {X_train.shape}, Test set: {X_test.shape}")

✅ Training set: (5684, 278), Test set: (1422, 278)


In [4]:
# ✅ Train the Random Forest model
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model trained")

✅ Model trained


In [5]:
# ✅ Evaluate the model
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

🎯 Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1422

    accuracy                           1.00      1422
   macro avg       1.00      1.00      1.00      1422
weighted avg       1.00      1.00      1.00      1422



In [8]:
# Save model and vectorizer for hybrid detection
import joblib

joblib.dump(model, "ransomware_model_OS_BASED.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Saved: ransomware_model_OS_BASED.pkl & tfidf_vectorizer.pkl")

✅ Saved: ransomware_model_OS_BASED.pkl & tfidf_vectorizer.pkl


In [9]:
import sklearn, joblib
print("scikit-learn:", sklearn.__version__)
print("joblib:", joblib.__version__)


scikit-learn: 1.6.1
joblib: 1.4.2


In [10]:
import joblib

# Load models
os_model = joblib.load("ransomware_model_OS_BASED.pkl")
tfidf = joblib.load("tfidf_vectorizer.pkl")

print("Model and vectorizer loaded successfully!")


✅ Model and vectorizer loaded successfully!


In [13]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Step 2: Load Dataset
df = pd.read_csv('NF-ToN-IoT.csv', low_memory=False)

print(f"Loaded dataset with shape: {df.shape}")
df = df.dropna()

Loaded dataset with shape: (1379274, 14)


In [15]:
# Step 3: Feature Selection
features = [
    'L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'IN_BYTES', 'OUT_BYTES',
    'IN_PKTS', 'OUT_PKTS', 'TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS'
]
df = df[features + ['Label']]
df = df[df['Label'].isin([0, 1])]  # Ensure binary classification
print(df['Label'].value_counts())

Label
1    1108995
0     270279
Name: count, dtype: int64


In [16]:
#  Step 4: Scale Features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])
X = df[features]
y = df['Label']

In [29]:
"Model 2: Isolation Forest (Unsupervised)"

'Model 2: Isolation Forest (Unsupervised)'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print(" Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred_rf))

🎯 Random Forest Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81084
           1       1.00      1.00      1.00    332699

    accuracy                           1.00    413783
   macro avg       1.00      1.00      1.00    413783
weighted avg       1.00      1.00      1.00    413783



In [28]:
"Model 2: Isolation Forest (Unsupervised)"

'Model 2: Isolation Forest (Unsupervised)'

In [19]:
X_train_unsup = X[y == 0]  # Only normal traffic
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso_forest.fit(X_train_unsup)
y_pred_iso = iso_forest.predict(X)
y_pred_iso = np.where(y_pred_iso == -1, 1, 0)
print("🔍 Isolation Forest Classification Report:\n")
print(classification_report(y, y_pred_iso))

🔍 Isolation Forest Classification Report:

              precision    recall  f1-score   support

           0       0.20      0.95      0.33    270279
           1       0.88      0.09      0.16   1108995

    accuracy                           0.26   1379274
   macro avg       0.54      0.52      0.25   1379274
weighted avg       0.75      0.26      0.20   1379274



In [20]:
joblib.dump(rf, 'network_rf_model.pkl')
joblib.dump(iso_forest, 'isolation_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Models and scaler saved.")

✅ Models and scaler saved.


In [21]:
net_rf = joblib.load('network_rf_model.pkl')
iso_forest = joblib.load('isolation_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

In [22]:
# Example API call sequence (OS-level)
api_sequence = "ldrloaddll ldrgetprocedureaddress regopenkeyexa ntopenkey ntqueryvaluekey ntclose"

# Example network flow features (same order as training):
# [L4_SRC_PORT, L4_DST_PORT, PROTOCOL, IN_BYTES, OUT_BYTES, IN_PKTS, OUT_PKTS, TCP_FLAGS, FLOW_DURATION_MILLISECONDS]
net_features = [123, 80, 6, 1500, 1200, 50, 40, 2, 300]

In [23]:
# Vectorize the API sequence
api_vector = tfidf.transform([api_sequence])
os_pred = os_model.predict(api_vector)[0]

# Scale and predict for network flow
net_scaled = scaler.transform([net_features])
net_rf_pred = net_rf.predict(net_scaled)[0]
iso_pred = iso_forest.predict(net_scaled)[0]
iso_anomaly = 1 if iso_pred == -1 else 0

In [25]:
# Combine the verdict
if os_pred == 1 or net_rf_pred == 1 or iso_anomaly == 1:
    verdict = "⚠️ THREAT DETECTED"
else:
    verdict = "✅ Clean"

print("OS Model Prediction       :", os_pred)
print("Network RF Prediction     :", net_rf_pred)
print("Isolation Forest Anomaly  :", iso_anomaly)
print("🔒 Final Verdict           :", verdict)

OS Model Prediction       : 1
Network RF Prediction     : 0
Isolation Forest Anomaly  : 1
🔒 Final Verdict           : ⚠️ THREAT DETECTED
