In [None]:
import os
import json
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from pathlib import Path

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples

In [None]:
 # embedding

# Model identifier handed to AutoTokenizer.from_pretrained / AutoModel.from_pretrained
# in save_embeddings_to_json. The blank value is a placeholder: set it to a real
# checkpoint name or local path before running this cell.
NAME = ' '

def load_malicious_code(base_path):
    """Recursively read every non-blank ``.py`` file under ``base_path``.

    Files whose name starts with ``._`` or does not end in ``.py`` are skipped.
    Each file is decoded as UTF-8; if that fails with ``UnicodeDecodeError`` it
    is decoded as latin1, which accepts every byte value and therefore always
    succeeds. Files whose decoded text is empty or whitespace-only are skipped.

    Directories and files are visited in sorted order so the returned lists are
    reproducible across runs.

    Args:
        base_path: Root directory to walk.

    Returns:
        A tuple ``(malicious_codes, file_paths)`` of two parallel lists: the
        text of each loaded file and the path it was read from.
    """
    malicious_codes = []
    file_paths = []

    for root, dirs, files in os.walk(base_path):
        # Sorting ``dirs`` in place also fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if file.startswith('._') or not file.endswith('.py'):
                continue
            file_path = os.path.join(root, file)
            code = None
            try:
                for encoding in ('utf-8', 'latin1'):
                    try:
                        with open(file_path, 'r', encoding=encoding) as f:
                            code = f.read()
                    except UnicodeDecodeError:
                        continue
                    # Stop at the first encoding that decodes cleanly, even if
                    # the content is blank: retrying a valid UTF-8 file as
                    # latin1 would only turn its bytes into mojibake.
                    break
            except Exception as e:
                # Best effort: report unreadable files and keep going.
                print(f"Error reading {file_path}: {e}")
                continue

            if code is not None and code.strip():
                malicious_codes.append(code)
                file_paths.append(file_path)

    print(f"Successfully loaded {len(malicious_codes)} files")
    return malicious_codes, file_paths

def split_code_into_segments(code, max_length=512):
    """Split ``code`` into consecutive line groups of at most ``max_length`` words.

    Length is measured in whitespace-separated words, not model tokens. Lines
    are never split: a single line with more than ``max_length`` words becomes
    a segment on its own.

    Args:
        code: Source text to split.
        max_length: Maximum number of whitespace-separated words per segment.

    Returns:
        A list of segments, each a newline-joined run of original lines. An
        empty ``code`` string yields ``['']``.
    """
    segments = []
    current_lines = []
    current_words = 0

    for line in code.split('\n'):
        line_words = len(line.split())
        # Flush only when something is buffered; otherwise an over-long first
        # line would push an empty segment into the output.
        if current_lines and current_words + line_words > max_length:
            segments.append('\n'.join(current_lines))
            current_lines = []
            current_words = 0
        current_lines.append(line)
        # A running count replaces re-joining the whole buffer for every line.
        current_words += line_words

    if current_lines:
        segments.append('\n'.join(current_lines))

    return segments

def attention_based_encoding(code, tokenizer, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Embed one piece of code as the mean of the model's last hidden states.

    The text is tokenized with truncation at 512 tokens, run through ``model``
    without gradient tracking, and the last hidden state is averaged over
    dimension 1. Despite the function name, no attention weighting is applied;
    the pooling is a plain mean.

    Args:
        code: Source text to embed.
        tokenizer: Callable returning a mapping of tensors for ``code``.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the input tensors are moved to.

    Returns:
        The pooled embedding as a NumPy array, or ``None`` if any step raised
        (the error is printed).
    """
    try:
        tokenized = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
        batch = {}
        for key, tensor in tokenized.items():
            batch[key] = tensor.to(device)

        with torch.no_grad():
            model_output = model(**batch)

        pooled = model_output.last_hidden_state.mean(dim=1)
        return pooled.squeeze().cpu().numpy()
    except Exception as e:
        print(f"Error encoding: {e}")
        return None

def attention_based_encoding_segmented(code, tokenizer, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Embed ``code`` segment by segment and average the segment embeddings.

    The code is split with ``split_code_into_segments``, each segment is
    embedded with ``attention_based_encoding``, and segments that fail to
    encode are dropped.

    Args:
        code: Source text to embed.
        tokenizer: Tokenizer forwarded to ``attention_based_encoding``.
        model: Model forwarded to ``attention_based_encoding``.
        device: Device forwarded to ``attention_based_encoding``. Defaults to
            CUDA only when it is available, matching that function; a fixed
            ``'cuda'`` default makes every segment fail on CPU-only hosts.

    Returns:
        The element-wise mean of the segment embeddings, or ``None`` when no
        segment could be encoded.
    """
    encoded_segments = []
    for segment in split_code_into_segments(code):
        encoding = attention_based_encoding(segment, tokenizer, model, device)
        if encoding is not None:
            encoded_segments.append(encoding)

    if not encoded_segments:
        return None
    return np.mean(encoded_segments, axis=0)

def save_embeddings_to_json(base_path, output_path):
    """Embed every loadable ``.py`` file under ``base_path`` and save the result.

    Loads the tokenizer and model named by ``NAME``, embeds each file with
    ``attention_based_encoding_segmented``, and writes
    ``malware_embeddings_demo.json`` into ``output_path``. The JSON object has
    two parallel lists, ``"file_paths"`` and ``"embeddings"``; files that could
    not be encoded are left out of both so the lists stay aligned.

    NOTE(review): ``load_embeddings_from_json`` in this notebook reads a list
    of per-file records (``embedding`` / ``file_path`` / ``source_code``), not
    this dict layout — confirm which consumer is meant to read this file.

    Args:
        base_path: Directory searched recursively for source files.
        output_path: Directory the JSON file is written to; created if missing.

    Raises:
        ValueError: If no file could be encoded.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(NAME)
    model = AutoModel.from_pretrained(NAME).to(device)

    print("Loading malicious code...")
    codes, file_paths = load_malicious_code(base_path)

    print("Encoding code samples...")
    encodings = []
    valid_indices = []

    for i in tqdm(range(len(codes))):
        encoding = attention_based_encoding_segmented(codes[i], tokenizer, model, device)
        if encoding is not None:
            encodings.append(encoding)
            valid_indices.append(i)

    if not encodings:
        # np.vstack([]) would raise a ValueError that says nothing about the cause.
        raise ValueError(f"No code sample under {base_path!r} could be encoded")

    X = np.vstack(encodings)

    # Keep only the paths whose file produced an embedding; writing the full
    # list would shift every path after the first failed file.
    embedding_output = {
        "file_paths": [file_paths[i] for i in valid_indices],
        "embeddings": X.tolist(),
    }

    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "malware_embeddings_demo.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embedding_output, f, indent=2, ensure_ascii=False)

    print(f"Embeddings saved to {output_file}")

# Placeholder paths: set base_path to the directory of source files to embed and
# output_path to the directory that should receive the embeddings JSON.
base_path = " "
output_path = " "

# Create the output directory up front, then run the embedding pipeline.
os.makedirs(output_path, exist_ok=True)
save_embeddings_to_json(base_path, output_path)


In [None]:
def load_embeddings_from_json(file_path):
    """Read per-file embedding records from a JSON file.

    The file must hold a sequence of objects, each with the keys
    ``"embedding"``, ``"file_path"`` and ``"source_code"``.

    Args:
        file_path: Path of the UTF-8 JSON file to read.

    Returns:
        A tuple ``(embeddings, file_paths, source_codes)``: a NumPy array built
        from the embeddings, plus two lists in the same record order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Pull the three fields out of each record in a single pass.
    rows = [
        (record["embedding"], record["file_path"], record["source_code"])
        for record in records
    ]

    vectors = [row[0] for row in rows]
    paths = [row[1] for row in rows]
    sources = [row[2] for row in rows]

    return np.array(vectors), paths, sources

def calculate_cluster_similarity(X_scaled, cluster_labels, cluster_id):
    """Return the mean pairwise cosine similarity inside one cluster.

    Args:
        X_scaled: Array of sample vectors, one row per sample.
        cluster_labels: Array giving the cluster label of each row.
        cluster_id: Label of the cluster to evaluate.

    Returns:
        The mean cosine similarity over all distinct pairs of rows labelled
        ``cluster_id``, or ``1.0`` when the cluster has fewer than two rows.
    """
    members = X_scaled[cluster_labels == cluster_id]
    if len(members) < 2:
        return 1.0

    pairwise = cosine_similarity(members)
    # k=1 selects the strict upper triangle: each pair once, no self-similarity.
    rows, cols = np.triu_indices(len(pairwise), k=1)
    return np.mean(pairwise[rows, cols])



def numpy_default(o):
    """``json.dump`` ``default`` hook that converts NumPy values to plain Python.

    Handles every NumPy boolean, integer and floating scalar width (not only
    the 64-bit ones) as well as arrays.

    Args:
        o: Object the JSON encoder could not serialize on its own.

    Returns:
        ``bool``, ``int`` or ``float`` for NumPy scalars, or a nested list for
        an ``ndarray``.

    Raises:
        TypeError: If ``o`` is not a supported NumPy type.
    """
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f"Object of type {o.__class__.__name__} is not JSON serializable")

def save_cluster_results(cluster_summary, output_file):
    """Write ``cluster_summary`` to ``output_file`` as indented UTF-8 JSON.

    NumPy values the JSON encoder cannot serialize itself are converted by
    ``numpy_default``.

    Args:
        cluster_summary: JSON-compatible structure to persist.
        output_file: Destination path; an existing file is overwritten.
    """
    dump_options = {
        "indent": 2,
        "ensure_ascii": False,
        # Fallback converter for objects json cannot encode natively.
        "default": numpy_default,
    }
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(cluster_summary, handle, **dump_options)

def perform_clustering(X_scaled, file_paths, source_codes, n_clusters=500, min_similarity=0.7, max_cluster_size=100, min_silhouette=0.3):
    """Cluster embeddings with KMeans and keep only tight, well-separated clusters.

    A k-means cluster is reported when all of the following hold:

    * it has at least two members,
    * its mean pairwise cosine similarity is at least ``min_similarity``,
    * at least two members have a silhouette score above ``min_silhouette``.

    Reported clusters are renumbered consecutively from ``"0"``. Their
    ``"samples"`` are the members that passed the silhouette filter, ordered by
    descending silhouette score and then ascending distance to the centre.
    ``"average_similarity"`` and ``"average_silhouette"`` are computed over all
    members of the k-means cluster, before the silhouette filter; ``"size"``
    counts the members after it.

    Args:
        X_scaled: NumPy array of sample vectors, one row per sample.
        file_paths: File path of each sample, in row order.
        source_codes: Source text of each sample, in row order.
        n_clusters: Number of k-means clusters to fit.
        min_similarity: Minimum mean cosine similarity for a cluster to be kept.
        max_cluster_size: Accepted for interface compatibility; not used by
            this function.
        min_silhouette: A member is kept only if its silhouette score is
            strictly greater than this value.

    Returns:
        A tuple ``(cluster_summary, total_clustered_nodes,
        average_similarity_all_clusters, valid_cluster_count,
        average_cluster_size)``. The two averages are ``0`` when no cluster
        qualifies.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when there are fewer
            samples than ``n_clusters``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20, max_iter=500)
    clusters = kmeans.fit_predict(X_scaled)
    sample_silhouette_values = silhouette_samples(X_scaled, clusters)

    cluster_summary = {}
    total_clustered_nodes = 0
    total_similarity = 0  # sum of the mean similarity of every reported cluster

    for cluster_id in tqdm(range(n_clusters), desc="Processing Clusters", unit="cluster"):
        cluster_indices = np.where(clusters == cluster_id)[0]

        if len(cluster_indices) < 2:
            continue

        cluster_similarity = calculate_cluster_similarity(X_scaled, clusters, cluster_id)

        if cluster_similarity < min_similarity:
            continue

        center = kmeans.cluster_centers_[cluster_id]
        cluster_members = [
            {
                "source_code": source_codes[idx],
                "file_path": file_paths[idx],
                "distance_to_center": float(np.linalg.norm(X_scaled[idx] - center)),
                "silhouette_score": float(sample_silhouette_values[idx]),
            }
            for idx in cluster_indices
        ]
        cluster_silhouette_values = [sample_silhouette_values[idx] for idx in cluster_indices]

        # Best-separated members first; ties broken by closeness to the centre.
        cluster_members.sort(key=lambda x: (-x["silhouette_score"], x["distance_to_center"]))
        filtered_members = [m for m in cluster_members if m["silhouette_score"] > min_silhouette]

        if len(filtered_members) < 2:
            continue

        # The next consecutive id is simply the number of clusters reported so far.
        cluster_summary[str(len(cluster_summary))] = {
            "size": len(filtered_members),
            "average_similarity": float(cluster_similarity),
            "average_silhouette": float(np.mean(cluster_silhouette_values)),
            "samples": filtered_members
        }

        total_similarity += cluster_similarity
        total_clustered_nodes += len(filtered_members)

    valid_cluster_count = len(cluster_summary)

    if valid_cluster_count > 0:
        average_similarity_all_clusters = total_similarity / valid_cluster_count
        average_cluster_size = total_clustered_nodes / valid_cluster_count
    else:
        average_similarity_all_clusters = 0
        average_cluster_size = 0

    return cluster_summary, total_clustered_nodes, average_similarity_all_clusters, valid_cluster_count, average_cluster_size


# Clustering driver: load stored embeddings, cluster them, print summary
# statistics and write the per-cluster report to JSON.
if __name__ == "__main__":
    # Placeholder: path of the JSON file read by load_embeddings_from_json.
    embedding_file_path = " "
    # NOTE(review): despite the name, X_scaled holds the embeddings exactly as
    # loaded; no scaling is applied in this cell.
    X_scaled, file_paths, source_codes = load_embeddings_from_json(embedding_file_path)

    cluster_summary, total_clustered_nodes, average_similarity_all_clusters, total_clusters, average_cluster_size = perform_clustering(X_scaled, file_paths, source_codes, n_clusters=500)

    print(f"Total number of nodes clustered: {total_clustered_nodes}")
    print(f"Average similarity of all clusters: {average_similarity_all_clusters:.4f}")
    print(f"Total number of valid clusters: {total_clusters}")
    print(f"Average number of packages per cluster: {average_cluster_size:.4f}")

    # Placeholder: destination path for the cluster report.
    output_file = " "
    save_cluster_results(cluster_summary, output_file)


In [None]:
# NPM
# 
# Total number of nodes clustered: 2994
# Average similarity of all clusters: 0.9999
# Total number of valid clusters: 157
# Average number of packages per cluster: 19.0701

# PyPI
# 
# Total number of nodes clustered: 4365
# Average similarity of all clusters: 0.9993
# Total number of valid clusters: 295
# Average number of packages per cluster: 14.7966

# Ruby
# 
# Total number of nodes clustered: 83
# Average similarity of all clusters: 0.9994
# Total number of valid clusters: 37
# Average number of packages per cluster: 2.2432