In [2]:
import pandas as pd
import requests
from tensorflow import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset
data = pd.read_csv('spam.csv')
data = data.rename(columns={'v1': 'Label', 'v2': 'EmailText'})

In [4]:
# Perform URL analysis and domain reputation check using an API (replace API_KEY with your actual API key)
def analyze_url(url):
    api_key = 'API_KEY'
    endpoint = f'https://url-analysis.com/api/v1/analyze/{url}?apikey={api_key}'
    response = requests.get(endpoint)
    result = response.json()
    return result['result']

In [None]:
data['URLAnalysis'] = data['EmailText'].apply(lambda x: analyze_url(x) if 'http' in x else None)
data['DomainReputation'] = data['URLAnalysis'].apply(lambda x: x['reputation'] if x else None)

In [6]:
# Split the dataset into features (email content) and labels (spam or ham)
X = data['EmailText']
y = data['Label']

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Define the preprocessing steps for text data
text_preprocessing = Pipeline([
    ('vect', CountVectorizer()),
])

In [9]:
# Define the preprocessing steps for numerical data
numeric_preprocessing = Pipeline([
    ('scale', StandardScaler()),
])

In [10]:
# Combine the preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer([
    ('text_pre', text_preprocessing, 'EmailText'),
    ('num_pre', numeric_preprocessing, ['DomainReputation'])
])


In [11]:
# Define the deep learning model using Keras
def create_model():
    model = Sequential()
    model.add(Embedding(5000, 32, input_length=100))
    model.add(LSTM(32))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [12]:
# Wrap the Keras model using KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=64, verbose=0)


In [13]:
# Define the pipeline combining the preprocessor and the deep learning model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

In [14]:
# Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'model__epochs': [5, 10],
    'model__batch_size': [32, 64],
}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

IndexError: tuple index out of range

In [None]:
# Predict the labels for the test set
y_pred = grid_search.predict(X_test)


In [16]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Best parameters:", grid_search.best_params_)

NameError: name 'y_pred' is not defined