In [None]:
# ✅ STEP 0: Install libraries
!pip install transformers accelerate scikit-learn tqdm matplotlib --quiet

# ✅ STEP 1: Imports
import os, glob, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaModel, pipeline

# ✅ STEP 2: Setup
# Use the GPU for CodeBERT and the VAE when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🖥️ Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# CodeBERT tokenizer + encoder; the encoder is moved to `device` and put in eval mode
# because it is only used for inference (see get_embeddings).
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base").to(device).eval()
# Text-generation pipeline used by obfuscate_java_code.
# NOTE(review): device=-1 is the transformers convention for CPU, so generation does not
# use the GPU even when one is available — confirm this is intentional.
llm = pipeline("text-generation", model="tiiuae/falcon-rw-1b", device=-1)

# ✅ STEP 3: Load code
def load_java_files(base_path, max_files=100):
    """Read Java sources under ``base_path`` and label each with its parent directory.

    Args:
        base_path: Root directory searched recursively for ``*.java`` files.
        max_files: Maximum number of files to *attempt* (in sorted path order).
            Files that are unreadable or too short still count towards this
            limit, so fewer than ``max_files`` snippets may be returned.

    Returns:
        ``(code, authors)``: two parallel lists. ``code[i]`` is the stripped
        file content and ``authors[i]`` is the name of the directory that
        directly contains the file.
    """
    # Sort so the subset selected by `max_files` is reproducible; glob order is arbitrary.
    files = sorted(glob.glob(os.path.join(base_path, "**", "*.java"), recursive=True))
    code, authors = [], []
    for path in tqdm(files[:max_files], desc="📂 Loading .java files"):
        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read().strip()
        except OSError:
            # Best effort: skip unreadable files, but let KeyboardInterrupt etc. propagate.
            continue
        # Ignore near-empty files; they carry too little signal to embed.
        if len(content) > 50:
            code.append(content)
            authors.append(os.path.basename(os.path.dirname(path)))
    return code, authors

# ✅ STEP 4: Obfuscation
def obfuscate_java_code(snippets):
    """Produce an LLM-generated "obfuscated" variant for every snippet.

    Only the first 400 characters of each snippet are placed in the prompt.
    From the generated text, everything after the last
    ``"Obfuscated Java Code:"`` marker is kept.

    Args:
        snippets: Iterable of source-code strings.

    Returns:
        List of strings, one per input snippet and in the same order. If
        generation fails for a snippet, the original (untruncated) snippet is
        used as its own "obfuscated" version.
    """
    obf = []
    for code in tqdm(snippets, desc="🤖 Obfuscating"):
        prompt = f"You are a code obfuscator. Rename vars, reorder blocks, add dead code. Keep logic.\nJava Code:\n{code[:400]}\nObfuscated Java Code:"
        try:
            result = llm(prompt, max_new_tokens=100, do_sample=True, temperature=0.9)[0]["generated_text"]
            obfuscated = result.split("Obfuscated Java Code:")[-1].strip()
        except Exception:
            # Best-effort fallback. `Exception` (not a bare except) so that interrupting
            # the notebook (KeyboardInterrupt) actually stops this long-running loop.
            obfuscated = code
        obf.append(obfuscated)
    return obf

# ✅ STEP 5: Embedding
@torch.no_grad()
def get_embeddings(snippets, batch_size=4, max_len=256):
    """Embed code snippets with CodeBERT using attention-masked mean pooling.

    Every snippet is padded/truncated to ``max_len`` tokens, so a plain mean
    over the sequence axis would average in the hidden states of pad tokens.
    The mean is therefore taken over real (attention-mask == 1) tokens only.

    Args:
        snippets: Sequence of source-code strings.
        batch_size: Number of snippets encoded per forward pass.
        max_len: Token length each snippet is padded/truncated to.

    Returns:
        float32 array of shape ``(n_embedded, hidden_size)``. A batch that
        fails to encode is skipped with a warning, so ``n_embedded`` can be
        smaller than ``len(snippets)``; callers pairing rows with per-snippet
        labels should compare the lengths.
    """
    import warnings

    embs = []
    for i in tqdm(range(0, len(snippets), batch_size), desc="🔗 Embedding"):
        batch = snippets[i:i+batch_size]
        try:
            tokens = tokenizer(batch, return_tensors="pt", padding="max_length", truncation=True, max_length=max_len).to(device)
            hidden = model(**tokens).last_hidden_state
            # (batch, seq, 1) mask so pad positions contribute nothing to the sum.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            embs.extend(pooled.cpu().numpy())
        except Exception as err:
            # Keep the best-effort behaviour but make the dropped rows visible.
            warnings.warn(f"Skipping snippets {i}..{i + len(batch) - 1}: embedding failed ({err!r})")
            continue
    if not embs:
        # Keep a 2-D shape so downstream np.vstack / model code still works.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embs, dtype=np.float32)

# ✅ STEP 6: VAE Class
class BetaVAE(nn.Module):
    """Fully-connected β-VAE: input -> 256 -> 128 -> latent -> 128 -> 256 -> input.

    The loss is the summed squared reconstruction error plus ``beta`` times
    the KL divergence between the encoder's diagonal Gaussian and N(0, I).
    """

    def __init__(self, input_dim=768, latent_dim=64, beta=4.0):
        super().__init__()
        self.beta = beta

        encoder_layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Two heads on the shared 128-d encoding: latent mean and log-variance.
        self.mu = nn.Linear(128, latent_dim)
        self.logvar = nn.Linear(128, latent_dim)

        decoder_layers = [
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def reparam(self, mu, logvar):
        """Sample z = mu + eps * std with eps ~ N(0, I) (reparameterization trick)."""
        noise = torch.randn_like(logvar)
        std = torch.exp(0.5 * logvar)
        return mu + noise * std

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        hidden = self.encoder(x)
        mu = self.mu(hidden)
        logvar = self.logvar(hidden)
        latent = self.reparam(mu, logvar)
        reconstruction = self.decoder(latent)
        return reconstruction, mu, logvar

    def loss_fn(self, x, recon, mu, logvar):
        """Summed MSE reconstruction loss plus ``beta``-weighted KL term (both summed over the batch)."""
        reconstruction_term = nn.functional.mse_loss(recon, x, reduction='sum')
        kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
        kl_term = -0.5 * torch.sum(kl_elements)
        return reconstruction_term + self.beta * kl_term

# ✅ STEP 7: Train VAE
def train_vae(X, beta=4.0, epochs=10):
    """Train a new BetaVAE on the rows of ``X`` and return it.

    Uses Adam (lr=1e-3) with shuffled mini-batches of 16 and prints the
    summed loss of every epoch.

    Args:
        X: Array-like of float feature vectors, one row per sample.
        beta: KL weight passed to ``BetaVAE``.
        epochs: Number of passes over ``X``.

    Returns:
        The trained ``BetaVAE`` (on ``device``).
    """
    vae = BetaVAE(beta=beta).to(device)
    opt = torch.optim.Adam(vae.parameters(), lr=1e-3)
    samples = torch.tensor(X, dtype=torch.float32)
    loader = torch.utils.data.DataLoader(samples, batch_size=16, shuffle=True)

    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for minibatch in loader:
            minibatch = minibatch.to(device)
            recon, mu, logvar = vae(minibatch)
            loss = vae.loss_fn(minibatch, recon, mu, logvar)

            opt.zero_grad()
            loss.backward()
            opt.step()

            running_loss += loss.item()
        print(f"📘 Epoch {epoch_no}: Loss = {running_loss:.2f}")
    return vae

# ✅ STEP 8: Feature Extraction
@torch.no_grad()
def extract_features(vae, X):
    """Run ``X`` through the VAE and return latent features plus reconstruction errors.

    The reconstruction comes from ``vae(X)``, which decodes a *sampled*
    latent (see ``BetaVAE.reparam``), so the errors are not identical
    between calls.

    Args:
        vae: Trained model whose forward pass returns ``(recon, mu, logvar)``.
        X: Array-like of float feature vectors, one row per sample.

    Returns:
        ``(features, err)`` where ``features`` is ``mu`` with the per-sample
        error appended as a last column, and ``err`` is the per-sample mean
        squared reconstruction error.
    """
    inputs = torch.tensor(X, dtype=torch.float32).to(device)
    recon, mu, logvar = vae(inputs)

    squared_diff = (inputs - recon) ** 2
    err = squared_diff.mean(dim=1).cpu().numpy()

    latent_means = mu.cpu().numpy()
    features = np.hstack([latent_means, err.reshape(-1, 1)])
    return features, err

# ✅ STEP 9: Evaluate
def evaluate_all(features, labels, errors, authors, embeddings):
    """Report anomaly-detection and authorship-attribution results.

    Part 1 thresholds the reconstruction ``errors`` at the ROC point that
    maximises Youden's J (``tpr - fpr``), prints a classification report and
    confusion matrix against ``labels``, and plots the ROC curve.

    Part 2 trains a RandomForest on ``embeddings`` to predict ``authors``
    using a stratified 80/20 split and prints accuracy, a classification
    report and a confusion matrix.

    Args:
        features: Accepted for interface compatibility; not used here.
        labels: Binary ground truth for ``errors`` (1 = anomalous).
        errors: Per-sample anomaly scores (higher = more anomalous).
        authors: Author label per row of ``embeddings``.
        embeddings: Feature matrix used for authorship attribution.
    """
    # 🔍 Anomaly Detection
    fpr, tpr, thresholds = roc_curve(labels, errors)
    best_idx = np.argmax(tpr - fpr)
    threshold = thresholds[best_idx]
    # roc_curve counts a sample as positive when score >= threshold; use the same rule
    # so the predictions reproduce the (fpr, tpr) point that was selected.
    pred = (errors >= threshold).astype(int)

    print(f"\n📈 Optimal Anomaly Threshold: {threshold:.4f} (Youden)")
    print("📊 Anomaly Detection Report:\n", classification_report(labels, pred))
    print("📉 Confusion Matrix:\n", confusion_matrix(labels, pred))

    auc_score = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}", color='blue')
    plt.plot([0,1],[0,1],'--',color='gray')
    plt.title("ROC Curve"); plt.xlabel("FPR"); plt.ylabel("TPR")
    plt.grid(True); plt.legend(); plt.show()

    # 🧠 Authorship Attribution
    le = LabelEncoder()
    y = le.fit_transform(authors)
    X_train, X_test, y_train, y_test = train_test_split(embeddings, y, stratify=y, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=300).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pass every class id explicitly: an author with few samples may be absent from the
    # test split, and target_names must then still line up with the reported labels.
    class_ids = np.arange(len(le.classes_))

    print(f"\n🧠 Author Attribution Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("📊 Classification Report:\n", classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_, zero_division=0))
    print("📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ✅ STEP 10: Run Pipeline
!unzip -q /content/GPT-Java-GCJ-Dataset-main.zip -d /content/java_data

# Load up to 100 Java files; each file's parent directory name becomes its author label.
code, authors = load_java_files("/content/java_data/GPT-Java-GCJ-Dataset-main", max_files=100)
# One LLM-generated "obfuscated" counterpart per loaded snippet.
obf_code = obfuscate_java_code(code)

# CodeBERT embeddings for the original and the obfuscated snippets.
X_clean = get_embeddings(code)
X_obf = get_embeddings(obf_code)

# A single β-VAE is trained on the clean embeddings only, then used to score
# clean + obfuscated embeddings stacked in that order.
beta_vae = train_vae(X_clean, beta=4.0, epochs=10)
features, errors = extract_features(beta_vae, np.vstack([X_clean, X_obf]))
# Binary ground truth matching the stacking order: 0 = clean, 1 = obfuscated.
labels = np.array([0]*len(X_clean) + [1]*len(X_obf))

# NOTE(review): the clean samples scored here are the same ones the VAE was trained on
# (no held-out clean split), which favours low clean reconstruction error.
evaluate_all(features, labels, errors, authors, X_clean)


In [None]:
# ✅ STEP 0: Install libraries
!pip install transformers accelerate scikit-learn tqdm matplotlib pandas --quiet

# ✅ STEP 1: Imports
import os, time, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from transformers import RobertaTokenizer, RobertaModel, pipeline
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV folder used by the pipeline lives under it.
drive.mount('/content/drive')

# ✅ STEP 2: Device setup
# Use the GPU for CodeBERT and the VAE when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# ✅ STEP 3: Load models
# CodeBERT tokenizer + encoder; the encoder is moved to `device` and put in eval mode
# because it is only used for inference (see get_embeddings).
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
codebert = RobertaModel.from_pretrained("microsoft/codebert-base").to(device).eval()
# Text-generation pipeline used by obfuscate_code.
# NOTE(review): device=-1 is the transformers convention for CPU, so generation does not
# use the GPU even when one is available — confirm this is intentional.
llm = pipeline("text-generation", model="tiiuae/falcon-rw-1b", device=-1)

# ✅ STEP 4: Load multilingual code from GCJ-style CSVs
def load_code_from_csvs(folder_path, max_per_file=100):
    """Load code snippets and author labels from every ``*.csv`` in a folder.

    Each CSV must have the columns ``flines`` (source text) and ``full_path``.
    Rows with a missing value in either column, or whose stripped snippet is
    50 characters or shorter, are ignored and do not count towards the cap.

    Args:
        folder_path: Directory containing the CSV files (not searched recursively).
        max_per_file: Maximum number of snippets taken from each CSV file.

    Returns:
        ``(code, authors)``: two parallel lists. The author label is the
        first ``/``-separated component of ``full_path``.
        NOTE(review): that this component is the author name is an assumption
        about the dataset layout — verify against the CSVs.
    """
    code, authors = [], []
    # Glob once (the original re-globbed per row) and sort for a reproducible order.
    for csv_path in sorted(glob.glob(f"{folder_path}/*.csv")):
        df = pd.read_csv(csv_path)
        df = df[df["flines"].notna() & df["full_path"].notna()]
        taken = 0  # snippets accepted from *this* file
        for raw_path, raw_snippet in zip(df["full_path"], df["flines"]):
            if taken >= max_per_file:
                break
            snippet = str(raw_snippet).strip()
            if len(snippet) <= 50:
                continue
            code.append(snippet)
            authors.append(os.path.basename(str(raw_path).strip().split("/")[0]))
            taken += 1
    return code, authors

# ✅ STEP 5: Falcon-RW-1B LLM-based obfuscation
def obfuscate_code(codes):
    """Produce an LLM-generated "obfuscated" variant for every snippet.

    Only the first 200 characters of each snippet are placed in the prompt.
    From the generated text, everything after the last
    ``"Obfuscated Java Code:"`` marker is kept.

    Args:
        codes: Iterable of source-code strings.

    Returns:
        List of strings, one per input snippet and in the same order. If
        generation fails for a snippet, the original (untruncated) snippet is
        used as its own "obfuscated" version.
    """
    obf = []
    for c in tqdm(codes, desc="Obfuscating"):
        prompt = f"You are a code obfuscator. Rename vars, reorder blocks, add dead code. Keep logic.\nJava Code:\n{c[:200]}\nObfuscated Java Code:"
        try:
            result = llm(prompt, max_new_tokens=200, do_sample=True, temperature=0.9)[0]['generated_text']
            obf_code = result.split("Obfuscated Java Code:")[-1].strip()
        except Exception:
            # Best-effort fallback. `Exception` (not a bare except) so that interrupting
            # the notebook (KeyboardInterrupt) actually stops this long-running loop.
            obf_code = c
        obf.append(obf_code)
    return obf

# ✅ STEP 6: CodeBERT Embeddings
@torch.no_grad()
def get_embeddings(snippets, batch_size=4, max_len=256):
    """Embed code snippets with CodeBERT using attention-masked mean pooling.

    Every snippet is padded/truncated to ``max_len`` tokens, so a plain mean
    over the sequence axis would average in the hidden states of pad tokens.
    The mean is therefore taken over real (attention-mask == 1) tokens only.

    Args:
        snippets: Sequence of source-code strings.
        batch_size: Number of snippets encoded per forward pass.
        max_len: Token length each snippet is padded/truncated to.

    Returns:
        float32 array of shape ``(n_embedded, hidden_size)``. A batch that
        fails to encode is skipped with a warning, so ``n_embedded`` can be
        smaller than ``len(snippets)``; callers pairing rows with per-snippet
        labels should compare the lengths.
    """
    import warnings

    embs = []
    for i in tqdm(range(0, len(snippets), batch_size), desc="Embedding"):
        batch = snippets[i:i+batch_size]
        try:
            tokens = tokenizer(batch, return_tensors="pt", padding="max_length", truncation=True, max_length=max_len).to(device)
            hidden = codebert(**tokens).last_hidden_state
            # (batch, seq, 1) mask so pad positions contribute nothing to the sum.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            embs.extend(pooled.cpu().numpy())
        except Exception as err:
            # Keep the best-effort behaviour but make the dropped rows visible.
            warnings.warn(f"Skipping snippets {i}..{i + len(batch) - 1}: embedding failed ({err!r})")
            continue
    if not embs:
        # Keep a 2-D shape so downstream np.vstack / model code still works.
        return np.empty((0, codebert.config.hidden_size), dtype=np.float32)
    return np.array(embs, dtype=np.float32)

# ✅ STEP 7: Beta-VAE
class BetaVAE(nn.Module):
    """Fully-connected β-VAE: input -> 256 -> 128 -> latent -> 128 -> 256 -> input.

    The loss is the summed squared reconstruction error plus ``beta`` times
    the KL divergence between the encoder's diagonal Gaussian and N(0, I).
    """

    def __init__(self, input_dim=768, latent_dim=64, beta=4.0):
        super().__init__()
        self.beta = beta

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
        )

        # Two heads on the shared 128-d encoding: latent mean and log-variance.
        self.mu = nn.Linear(128, latent_dim)
        self.logvar = nn.Linear(128, latent_dim)

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
        )

    def reparam(self, mu, logvar):
        """Sample z = mu + eps * std with eps ~ N(0, I) (reparameterization trick)."""
        eps = torch.randn_like(logvar)
        sigma = torch.exp(0.5 * logvar)
        return mu + eps * sigma

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        encoded = self.encoder(x)
        mu = self.mu(encoded)
        logvar = self.logvar(encoded)
        sampled = self.reparam(mu, logvar)
        return self.decoder(sampled), mu, logvar

    def loss_fn(self, x, recon, mu, logvar):
        """Summed MSE reconstruction loss plus ``beta``-weighted KL term (both summed over the batch)."""
        recon_loss = nn.functional.mse_loss(recon, x, reduction='sum')
        kl_inner = 1 + logvar - mu.pow(2) - logvar.exp()
        kl_loss = -0.5 * torch.sum(kl_inner)
        return recon_loss + self.beta * kl_loss

def train_vae(X, beta=4.0, epochs=10):
    """Train a new BetaVAE on the rows of ``X`` and return it.

    Uses Adam (lr=1e-3) with shuffled mini-batches of 16 and prints the
    summed loss of every epoch.

    Args:
        X: Array-like of float feature vectors, one row per sample.
        beta: KL weight passed to ``BetaVAE``.
        epochs: Number of passes over ``X``.

    Returns:
        The trained ``BetaVAE`` (on ``device``).
    """
    vae = BetaVAE(beta=beta).to(device)
    optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)
    samples = torch.tensor(X, dtype=torch.float32)
    loader = torch.utils.data.DataLoader(samples, batch_size=16, shuffle=True)

    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for minibatch in loader:
            minibatch = minibatch.to(device)
            recon, mu, logvar = vae(minibatch)
            loss = vae.loss_fn(minibatch, recon, mu, logvar)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch_no}, Loss: {running_loss:.2f}")
    return vae

# ✅ STEP 8: Extract features
@torch.no_grad()
def extract_features(vae, X):
    """Run ``X`` through the VAE and return latent features plus reconstruction errors.

    The reconstruction comes from ``vae(X)``, which decodes a *sampled*
    latent (see ``BetaVAE.reparam``), so the errors are not identical
    between calls.

    Args:
        vae: Trained model whose forward pass returns ``(recon, mu, logvar)``.
        X: Array-like of float feature vectors, one row per sample.

    Returns:
        ``(features, errors)`` where ``features`` is ``mu`` with the
        per-sample error appended as a last column, and ``errors`` is the
        per-sample mean squared reconstruction error.
    """
    inputs = torch.tensor(X, dtype=torch.float32).to(device)
    recon, mu, _ = vae(inputs)

    squared_diff = (inputs - recon) ** 2
    errors = squared_diff.mean(dim=1).cpu().numpy()

    latent_means = mu.cpu().numpy()
    error_column = errors.reshape(-1, 1)
    return np.hstack([latent_means, error_column]), errors

# ✅ STEP 9: Evaluation
def evaluate_all(features, labels, errors, authors, embeddings):
    """Report anomaly-detection and authorship-attribution results.

    Part 1 thresholds the reconstruction ``errors`` at the ROC point that
    maximises Youden's J (``tpr - fpr``), prints a classification report and
    confusion matrix against ``labels``, and plots the ROC curve.

    Part 2 trains a RandomForest on ``embeddings`` to predict ``authors``
    using a stratified 80/20 split and prints accuracy, a classification
    report and a confusion matrix.

    Args:
        features: Accepted for interface compatibility; not used here.
        labels: Binary ground truth for ``errors`` (1 = anomalous).
        errors: Per-sample anomaly scores (higher = more anomalous).
        authors: Author label per row of ``embeddings``.
        embeddings: Feature matrix used for authorship attribution.
    """
    fpr, tpr, thresholds = roc_curve(labels, errors)
    best_idx = np.argmax(tpr - fpr)
    threshold = thresholds[best_idx]
    # roc_curve counts a sample as positive when score >= threshold; use the same rule
    # so the predictions reproduce the (fpr, tpr) point that was selected.
    pred = (errors >= threshold).astype(int)

    print(f"\nAnomaly Detection (Threshold={threshold:.4f}):")
    print(classification_report(labels, pred))
    print("Confusion Matrix:\n", confusion_matrix(labels, pred))

    auc_score = auc(fpr, tpr)
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}", color='blue')
    plt.plot([0,1],[0,1],'--',color='gray')
    plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curve")
    plt.grid(True); plt.legend(); plt.tight_layout(); plt.show()

    # Authorship Attribution
    le = LabelEncoder()
    y = le.fit_transform(authors)
    X_train, X_test, y_train, y_test = train_test_split(embeddings, y, stratify=y, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=300, max_depth=20)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pass every class id explicitly: an author with few samples may be absent from the
    # test split, and target_names must then still line up with the reported labels.
    class_ids = np.arange(len(le.classes_))

    print("\nAuthor Attribution:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ✅ STEP 10: Run Full Pipeline
csv_folder = "/content/drive/MyDrive/Dataset1"  # Your folder with gcj2017.csv etc.

# Load snippets and author labels from every CSV in the folder.
code, authors = load_code_from_csvs(csv_folder, max_per_file=100)
# One LLM-generated "obfuscated" counterpart per loaded snippet.
obf_code = obfuscate_code(code)

# CodeBERT embeddings for the original and the obfuscated snippets.
X_clean = get_embeddings(code)
X_obf = get_embeddings(obf_code)

# The β-VAE is trained on the clean embeddings only.
vae_clean = train_vae(X_clean, beta=4.0, epochs=10)
# Stack clean then obfuscated; y_bin follows the same order (0 = clean, 1 = obfuscated).
X_all = np.vstack([X_clean, X_obf])
y_bin = np.array([0]*len(X_clean) + [1]*len(X_obf))

# NOTE(review): the clean samples scored here are the same ones the VAE was trained on
# (no held-out clean split), which favours low clean reconstruction error.
features, errors = extract_features(vae_clean, X_all)
evaluate_all(features, y_bin, errors, authors, X_clean)
