In [None]:
# Step 1: Install Hugging Face Hub if not already installed
!pip install -q huggingface_hub

# Step 2: Log in using your Hugging Face token
from huggingface_hub import login
login(token="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")

# Step 3: Import pandas
import pandas as pd

# Step 4: Read the JSON file from the dataset
df = pd.read_json("hf://datasets/sunlab/patch_db/patch_db.json")

# Step 5: Display first few rows
print(df.head())


  CVE_ID CWE_ID      category                                 commit_id  \
0     NA     NA  non-security  540958e2f5a87b81aa5f55ce40b3e2869754f97d   
1     NA     NA  non-security  64d240b721b21e266ffde645ec965c3b6d1c551f   
2     NA     NA  non-security  f181dd278274f50e689ebd13237010a90b430164   
3     NA     NA  non-security  0abdc3723b5d33dde698ab941325edec2819c128   
4     NA     NA  non-security  d7930d7f820e5dd6b07b823f155aeb943b525e16   

                                      commit_message  \
0  commit 540958e2f5a87b81aa5f55ce40b3e2869754f97...   
1  commit 64d240b721b21e266ffde645ec965c3b6d1c551...   
2  commit f181dd278274f50e689ebd13237010a90b43016...   
3  commit 0abdc3723b5d33dde698ab941325edec2819c12...   
4  commit d7930d7f820e5dd6b07b823f155aeb943b525e1...   

                                           diff_code       owner        repo  \
0  diff --git a/drivers/staging/comedi/drivers/cb...  stoth68000  media-tree   
1  diff --git a/drivers/target/target_core_file.c...

In [None]:
# Combine commit_message + diff_code as text input
df["text"] = df["commit_message"].fillna('') + " " + df["diff_code"].fillna('')

# Target variable: 'category' (security or non-security)
X = df["text"]
y = df["category"]

# Split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# =========================================================
# STEP 4: Text Vectorization + ML Model Training
# =========================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create a pipeline
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('rf', RandomForestClassifier(n_estimators=200, random_state=42))
])

# Train
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("\n📊 Model Evaluation:")
print(classification_report(y_test, y_pred))

# Save model
import joblib
joblib.dump(model, "patch_classifier.pkl")

# =========================================================
# STEP 5: Blockchain Simulation for Patch Verification
# =========================================================
# =========================================================
# STEP 5: Blockchain Simulation for Patch Verification
# =========================================================
!pip install pycryptodome  # <--- Add this line once

import hashlib, json, time
from Crypto.PublicKey import RSA
from Crypto.Signature import pkcs1_15
from Crypto.Hash import SHA256

import hashlib, json, time
from Crypto.PublicKey import RSA
from Crypto.Signature import pkcs1_15
from Crypto.Hash import SHA256

# Generate key pair for digital signing (simulated vendor)
key = RSA.generate(2048)
private_key = key
public_key = key.publickey()

class Block:
    def __init__(self, index, patch_id, patch_data, prev_hash):
        self.index = index
        self.timestamp = time.time()
        self.patch_id = patch_id
        self.patch_data = patch_data
        self.prev_hash = prev_hash
        self.hash = self.compute_hash()
        self.signature = self.sign_block()

    def compute_hash(self):
        block_string = json.dumps({
            "index": self.index,
            "timestamp": self.timestamp,
            "patch_id": self.patch_id,
            "patch_data": self.patch_data,
            "prev_hash": self.prev_hash
        }, sort_keys=True).encode()
        return hashlib.sha256(block_string).hexdigest()

    def sign_block(self):
        h = SHA256.new(self.hash.encode())
        signature = pkcs1_15.new(private_key).sign(h)
        return signature.hex()

    def verify_signature(self):
        h = SHA256.new(self.hash.encode())
        try:
            pkcs1_15.new(public_key).verify(h, bytes.fromhex(self.signature))
            return True
        except (ValueError, TypeError):
            return False

# Create genesis block
blockchain = [Block(0, "GENESIS", "Initial Block", "0")]

# Function to add verified patch
def add_patch_to_chain(patch_id, patch_info):
    prev_hash = blockchain[-1].hash
    block = Block(len(blockchain), patch_id, patch_info, prev_hash)
    if block.verify_signature():
        blockchain.append(block)
        print(f"✅ Patch {patch_id} added to blockchain.")
    else:
        print(f"❌ Patch {patch_id} verification failed!")

# =========================================================
# STEP 6: Add ML-verified patches to Blockchain
# =========================================================
# Example: classify 3 patches from test data
sample_patches = df.sample(3, random_state=42)

for _, row in sample_patches.iterrows():
    patch_text = row["commit_message"] + " " + row["diff_code"]
    predicted_cat = model.predict([patch_text])[0]
    patch_info = {
        "CVE_ID": row["CVE_ID"],
        "CWE_ID": row["CWE_ID"],
        "category": predicted_cat,
        "repo": row["repo"],
        "commit_id": row["commit_id"]
    }

    add_patch_to_chain(row["commit_id"], patch_info)

# =========================================================
# STEP 7: Display Blockchain Records
# =========================================================
print("\n📘 Blockchain Ledger:")
for block in blockchain:
    print(f"Block {block.index} | Patch ID: {block.patch_id} | Category: {block.patch_data if block.index==0 else block.patch_data['category']} | Hash: {block.hash[:10]}...")


📊 Model Evaluation:
              precision    recall  f1-score   support

non-security       0.88      0.99      0.93      4748
    security       0.97      0.73      0.83      2415

    accuracy                           0.90      7163
   macro avg       0.92      0.86      0.88      7163
weighted avg       0.91      0.90      0.90      7163

Collecting pycryptodome
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycryptodome
Successfully installed pycryptodome-3.23.0
✅ Patch 3927c3aa28ea2ed8dbb5228de3e69928972da412 added to blockchain.
✅ Patch 9842df62004f366b9fed2423e24df10542ee0dc5 added to blockchain.
✅ Patch 2475f1a83ccf313d828b25f1769e3a37442ecf64 added to blockchain.



In [8]:
# =========================================================
# PATCH CLASSIFICATION + BLOCKCHAIN STORAGE SYSTEM
# =========================================================

# Install dependencies
!pip install pycryptodome -q

# ---------------- IMPORTS ----------------
import pandas as pd
import numpy as np
import hashlib, json, time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from Crypto.PublicKey import RSA
from Crypto.Signature import pkcs1_15
from Crypto.Hash import SHA256

# ---------------- LOAD DATA ----------------
# Make sure you're logged in to Hugging Face if this dataset is gated
from huggingface_hub import login
login(token="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")  # ⚠️ Replace with your HF token safely

# Load the PatchDB dataset
df = pd.read_json("hf://datasets/sunlab/patch_db/patch_db.json")

# Combine text fields for model input
df["text"] = df["commit_message"] + " " + df["diff_code"]

# ---------------- TRAIN-TEST SPLIT ----------------
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["category"], test_size=0.2, random_state=42
)

# ---------------- TF-IDF + MODEL ----------------
tfidf = TfidfVectorizer(max_features=5000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vec)
print("\n📊 MODEL EVALUATION:\n")
print(classification_report(y_test, y_pred))

# =========================================================
# BLOCKCHAIN IMPLEMENTATION
# =========================================================
class Block:
    def __init__(self, index, data, previous_hash):
        self.index = index
        self.timestamp = time.time()
        self.data = data
        self.previous_hash = previous_hash
        self.hash = self.compute_hash()

    def compute_hash(self):
        block_string = json.dumps({
            'index': self.index,
            'timestamp': self.timestamp,
            'data': self.data,
            'previous_hash': self.previous_hash
        }, sort_keys=True).encode()
        return hashlib.sha256(block_string).hexdigest()

class Blockchain:
    def __init__(self):
        self.chain = []
        self.create_genesis_block()

    def create_genesis_block(self):
        genesis_block = Block(0, {"message": "Genesis Block"}, "0")
        self.chain.append(genesis_block)

    def add_block(self, data):
        previous_hash = self.chain[-1].hash
        block = Block(len(self.chain), data, previous_hash)
        self.chain.append(block)
        return block

# Initialize blockchain
blockchain = Blockchain()

# =========================================================
# ML PREDICTION + BLOCKCHAIN STORAGE
# =========================================================
def predict_patch_and_save(patch_message, diff_code):
    X_input = tfidf.transform([patch_message + " " + diff_code])
    prediction = model.predict(X_input)[0]

    if prediction == "non-security":
        patch_data = {
            "patch_message": patch_message,
            "diff_code": diff_code,
            "status": "SAFE (Non-Security Patch)"
        }
        block = blockchain.add_block(patch_data)

        print("\n✅ This patch is SAFE and has been stored in the blockchain.\n")
        print("Block Details:")
        print(json.dumps({
            "index": block.index,
            "timestamp": block.timestamp,
            "data": block.data,
            "hash": block.hash,
            "previous_hash": block.previous_hash
        }, indent=4))
    else:
        print("\n⚠️ This patch is a SECURITY patch — not added to blockchain (requires review).")

# =========================================================
# RUNTIME INPUT SECTION
# =========================================================
print("\n🔹 Enter Patch Details Below:")
patch_message = input("Enter the patch commit message: ")
diff_code = input("Enter the patch diff code: ")

predict_patch_and_save(patch_message, diff_code)



📊 MODEL EVALUATION:

              precision    recall  f1-score   support

non-security       0.84      0.96      0.90      4777
    security       0.89      0.63      0.74      2386

    accuracy                           0.85      7163
   macro avg       0.86      0.80      0.82      7163
weighted avg       0.86      0.85      0.84      7163


🔹 Enter Patch Details Below:
Enter the patch diff code: Fix SQL injection vulnerability in login form input validation

⚠️ This patch is a SECURITY patch — not added to blockchain (requires review).
