### Loading Preprocessed Data

In [8]:
import json
import numpy as np

# Load preprocessed data.
# JSON is UTF-8 by specification, so do not depend on the platform default encoding.
with open('processed_data.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

X = np.array(data['sequences'])  # Tokenized and padded sequences
y = np.array(data['labels'])     # Corresponding labels (1 for vulnerable)

# Fail early, with a clear message, if sequences and labels cannot be paired row by row.
if len(X) != len(y):
    raise ValueError(
        f"processed_data.json is inconsistent: {len(X)} sequences but {len(y)} labels"
    )

### Split Data Into Training and Validation

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Defining CNN Model

In [10]:
import tensorflow as tf
from keras.api.models import Sequential
from keras.api.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Define the CNN model
def build_cnn_model(vocab_size, embedding_dim=128, max_seq_length=100):
    """Build and compile a 1-D CNN binary classifier over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table; every token id fed
            to the model must be smaller than this value.
        embedding_dim: Size of each token embedding vector.
        max_seq_length: Length of the input sequences. The model is built for
            inputs of shape ``(batch, max_seq_length)``. Pass ``None`` to leave
            the sequence dimension unspecified.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) whose weights have already been created.

    Raises:
        ValueError: If ``max_seq_length`` is shorter than the convolution kernel.
    """
    kernel_size = 5
    if max_seq_length is not None and max_seq_length < kernel_size:
        raise ValueError(
            f"max_seq_length ({max_seq_length}) must be at least the Conv1D "
            f"kernel size ({kernel_size})"
        )

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=128, kernel_size=kernel_size, activation='relu'),  # Learn local patterns
        GlobalMaxPooling1D(),  # Reduce sequence dimension
        Dense(64, activation='relu'),  # Fully connected layer
        Dropout(0.5),  # Regularization
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    # max_seq_length used to be ignored, which left the model unbuilt until the
    # first batch arrived. Building here creates the weights up front so that
    # summary() can report output shapes and parameter counts.
    model.build(input_shape=(None, max_seq_length))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Get vocab size from the vocab file (JSON is UTF-8 by specification)
with open('vocab.json', 'r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)
vocab_size = len(vocab)

# The Embedding layer can only look up ids in [0, vocab_size). If the data uses
# an id that is not counted in vocab.json (for example a padding id), widen the
# table so every lookup stays in range.
if X.size and int(X.max()) >= vocab_size:
    vocab_size = int(X.max()) + 1
    print(f"vocab_size raised to {vocab_size} to cover the largest token id in X")

# Build the model
cnn_model = build_cnn_model(vocab_size=vocab_size, embedding_dim=128, max_seq_length=X.shape[1])

# Print model summary
cnn_model.summary()

### Training CNN

In [11]:
# Train the model.
# Keep the returned History object: it holds the per-epoch loss/accuracy values,
# and assigning it stops the notebook from echoing the object's repr as output.
history = cnn_model.fit(
    X_train, y_train,
    epochs=10,  # Number of training epochs
    batch_size=32,  # Batch size for gradient updates
    validation_data=(X_val, y_val)
)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - accuracy: 0.9224 - loss: 0.6265 - val_accuracy: 1.0000 - val_loss: 0.4976
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.4813 - val_accuracy: 1.0000 - val_loss: 0.3698
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 1.0000 - loss: 0.3543 - val_accuracy: 1.0000 - val_loss: 0.2500
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.2592 - val_accuracy: 1.0000 - val_loss: 0.1514
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.1455 - val_accuracy: 1.0000 - val_loss: 0.0828
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 1.0000 - loss: 0.0708 - val_accuracy: 1.0000 - val_loss: 0.0420
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x75b1f0bb7e90>

### Evaluating Model

In [12]:
# Score the trained model on the validation split
results = cnn_model.evaluate(X_val, y_val)
val_loss, val_accuracy = results  # loss first, then the accuracy metric
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0025
Validation Loss: 0.002496663946658373
Validation Accuracy: 1.0


### Testing (Predictions on the Validation Set)

In [13]:
# Run the model on the validation split
probabilities = cnn_model.predict(X_val)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
predictions = (probabilities > 0.5).astype(int)

# Show the first ten predicted labels
print("Sample Predictions:", predictions[:10])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Sample Predictions: [[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]


### Saving Model

In [14]:
# Save the trained model to a .keras file
cnn_model.save('cnn_vulnerability_detector.keras')

# To load the model later (use the same path that was saved above):
# cnn_model = tf.keras.models.load_model('cnn_vulnerability_detector.keras')