<a href="https://colab.research.google.com/github/Sreelakshmi2023/MAIN/blob/main/CNN_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Dependencies**

In [2]:

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.callbacks import EarlyStopping


**Loading Dataset**

In [3]:
# Load the dataset
data = pd.read_csv("Benign_malicious.csv")

**Preprocessing the data**

In [6]:
# Determine the maximum length of the URLs
max_len = max(len(url) for url in data['url'])

# Preprocessing the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['url'])
X = tokenizer.texts_to_sequences(data['url'])
X = pad_sequences(X, maxlen=max_len)

**Encoding the labels**

In [7]:
# Encoding the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['type'])

**Spliting the test and training sets**

In [8]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Define CNN Model**

In [11]:
# Define vocabulary size
vocab_size = len(tokenizer.word_index) + 1
# Define embedding dimension
embedding_dim = 100

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

**Compile the model**

In [12]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


**Define early stopping callback**

In [13]:
# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

**Training the model**

In [14]:
# Train the model
model.fit(X_train, y_train, batch_size=128, epochs=20, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.src.callbacks.History at 0x7eb085d08ee0>

**Evaluate the model**

In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.9668819308280945


**Function to predict the type of URL**

In [16]:
# Function to predict the type of URL
def predict_url_type(url):
    # Tokenize and pad the input URL
    encoded_url = tokenizer.texts_to_sequences([url])
    padded_url = pad_sequences(encoded_url, maxlen=max_len)
    # Make predictions
    prediction = model.predict(padded_url)
    # Decode the prediction
    if prediction[0] > 0.5:
        return "Malicious"
    else:
        return "Benign"

**Test the model with user input**

In [18]:
# Test the model with user input
user_input = input("Enter a URL: ")
prediction = predict_url_type(user_input)
print("Predicted URL Type:", prediction)

Enter a URL: http://www.pashminaonline.com/pure-pashminas
Predicted URL Type: Malicious
