In [None]:
from google.colab import drive
# Attach Google Drive at /content/drive so the dataset path below resolves.
drive.mount('/content/drive')
import pandas as pd

# Read the spam/ham CSV from Drive into `df`. Later statements read its
# 'text' and 'label_num' columns.
dataset_path = '/content/drive/MyDrive/AI/RAG/spam_detection/spam_ham_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows as a sanity check. This is a bare expression: it is
# rendered only when it is the last statement of a notebook cell and prints
# nothing when the file runs as a plain script.
df.head()

from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence encoder that maps each email to a fixed-length vector.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the whole corpus in a single call. Passing the list lets the library
# batch the texts internally instead of running one forward pass per email,
# and the result already comes back as one 2-D array (one row per email).
email_embeddings = model.encode(df['text'].tolist(), convert_to_numpy=True)

# Pin the dtype explicitly: the FAISS index built from this matrix works on
# float32 data, and this keeps the block consistent with the later
# `embeddings` construction that also casts to float32.
email_embeddings = np.asarray(email_embeddings, dtype='float32')

!pip install transformers sentence-transformers langchain pinecone-client[grpc] faiss-cpu

import faiss

# The index must be created with the vector width, which is the size of the
# second axis of the (n_emails, width) embedding matrix.
dimension = email_embeddings.shape[1]

# Flat index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(dimension)

# Store every email embedding; row i of the matrix becomes vector id i.
# NOTE(review): a later statement rebinds `index` to a new index built from
# the cleaned-text embeddings, so this one is superseded once that runs.
index.add(email_embeddings)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Target vector taken from the numeric label column (0 = ham, 1 = spam).
labels = df['label_num'].to_numpy()

# Hold out 20% of the embedded emails; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    email_embeddings,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training portion. `fit` returns the
# estimator itself, so construction and training can be chained.
classifier = LogisticRegression(max_iter=1000).fit(X_train, y_train)

from google.colab import drive
import os
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import requests
import json

# Mount Google Drive so files under /content/drive are readable.
drive.mount('/content/drive')

# Project folder on Drive and the CSV inside it.
base_dir = "/content/drive/MyDrive/AI/RAG/spam_detection"
dataset_path = f"{base_dir}/spam_ham_dataset.csv"

# Load the dataset into `data`; the retrieval and evaluation code below reads
# its 'text' and 'label' columns and adds 'cleaned_text' / 'embeddings'.
data = pd.read_csv(dataset_path)

# Normalise every email before it is embedded.
def _normalize_email(raw_text):
    """Lower-case the text, drop non-word/non-space characters, collapse whitespace."""
    # Remove every character that is neither a word character nor whitespace.
    stripped = re.sub(r'[^\w\s]', '', raw_text.lower())
    # Squash each run of whitespace (spaces, tabs, newlines) into one space.
    return re.sub(r'\s+', ' ', stripped)


data['cleaned_text'] = data['text'].apply(_normalize_email)

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient

# Encode all cleaned emails in one batched call rather than one
# `model.encode` call per row; the library batches a list input internally and
# returns a single 2-D array. The float32 cast is kept because that is the
# dtype the FAISS index below is fed with.
embeddings = np.asarray(
    model.encode(data['cleaned_text'].tolist(), convert_to_numpy=True),
    dtype='float32',
)

# Keep the per-row 'embeddings' column (one 1-D vector per email) so anything
# that reads it from the DataFrame still finds it.
data['embeddings'] = list(embeddings)

# Create a FAISS index sized to the embedding width (second axis of the matrix).
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # L2 distance (Euclidean)

# Add embeddings to the index; row i of `embeddings` becomes vector id i, which
# is what lets search results be mapped back to rows of `data` by position.
index.add(embeddings)
# `ntotal` is the number of stored vectors -- a quick check that every email was added.
print(f"Number of vectors in the index: {index.ntotal}")

# Nearest-neighbour lookup against the module-level FAISS index
def search_similar_emails(query_embedding, k=5):
    """Look up the ``k`` nearest indexed vectors for a single query vector.

    Args:
        query_embedding: One embedding vector for the query email.
        k: Number of neighbours to request from the index.

    Returns:
        A tuple ``(indices, distances)``. Note the order: it is the reverse of
        the ``(distances, indices)`` pair produced by ``index.search``.
    """
    # The index searches batches, so wrap the single vector as a one-row
    # float32 matrix.
    query_batch = np.array([query_embedding], dtype='float32')
    neighbour_distances, neighbour_ids = index.search(query_batch, k)
    return neighbour_ids, neighbour_distances

# Sample spam classification function using Gemini API
def gemini_llm(prompt):
    """Send ``prompt`` to the Gemini ``generateContent`` endpoint and return the reply text.

    The API key is read from the ``GEMINI_API_KEY`` environment variable so
    that no credential lives in the source file.

    Args:
        prompt: Text sent as the single content part of the request.

    Returns:
        The text of the first part of the first candidate, or the literal
        string ``"Error in response"`` when the reply is not JSON or does not
        contain that field (for example an API error payload or a candidate
        without content).

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is not set.
        requests.RequestException: On connection failure or timeout.
    """
    api_key = os.environ.get('GEMINI_API_KEY')
    if not api_key:
        raise RuntimeError(
            "GEMINI_API_KEY is not set; export it before calling gemini_llm()."
        )
    gemini_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent"

    # Send request to Gemini API. Without an explicit timeout `requests` waits
    # indefinitely, which would stall a long evaluation loop on one bad call.
    response = requests.post(
        gemini_url,
        headers={'Content-Type': 'application/json'},
        params={'key': api_key},
        json={
            "contents": [{
                "parts": [{"text": prompt}]
            }]
        },
        timeout=60,
    )

    # Extract the response text. Any unexpected shape maps to the same
    # sentinel string the callers already handle, instead of raising from the
    # middle of the nested lookup.
    try:
        response_data = response.json()
    except ValueError:
        return "Error in response"
    try:
        return response_data['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError):
        return "Error in response"

# Function to classify an email and build context using the retrieved similar emails
def classify_email(email_text):
    """Classify one email with Gemini, using similar indexed emails as context.

    Args:
        email_text: Raw text of the email to classify.

    Returns:
        Whatever ``gemini_llm`` returns for the assembled prompt: the model's
        free-text answer, or its error sentinel string.
    """
    # The index holds embeddings of the *cleaned* corpus text (lower-cased,
    # non-word characters removed, whitespace collapsed). Apply the same
    # normalisation to the query so both sides are embedded the same way.
    normalized_query = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', email_text.lower()))
    query_embedding = model.encode(normalized_query)
    indices, _ = search_similar_emails(query_embedding)

    # Build context from retrieved similar emails. FAISS pads the result with
    # -1 when it has fewer than k vectors; `iloc[-1]` would silently pick the
    # last row, so those placeholders are skipped.
    context = " ".join(data['text'].iloc[idx] for idx in indices[0] if idx >= 0)

    # Create the classification prompt (the original, uncleaned email is shown
    # to the model).
    prompt = f"Based on the following context and email:\nContext: {context}\nEmail: {email_text}\nClassify as spam or not spam."

    # Use the Gemini API to classify the email
    return gemini_llm(prompt)

# Smoke test with one obviously promotional message. This runs the full path
# (embed -> nearest-neighbour search -> prompt -> Gemini request), so it makes
# a live API call.
sample_email = "Click here to claim your free gift card now!"
classified_result = classify_email(sample_email)
# The result is the model's free-text reply (or the error sentinel string).
print("Spam Classification Result:", classified_result)

# Persist the sentence encoder to Google Drive
import pickle

# Destination file on Drive (adjust to your desired path)
model_save_path = '/content/drive/MyDrive/AI/RAG/spam_detection/RAGgpu_model.pkl'  # Adjust to your desired path

# Only `model` (the SentenceTransformer encoder) is serialized here; the FAISS
# index and the DataFrame are not part of the pickle.
with open(model_save_path, 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"RAG model saved to {model_save_path}")

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle

# Load the saved model (the pickled sentence encoder written earlier).
# NOTE(review): pickle.load can execute arbitrary code contained in the file.
# Only load a pickle you produced yourself; do not point this at a file from
# an untrusted source.
with open('/content/drive/MyDrive/AI/RAG/spam_detection/RAGgpu_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Function to classify an email with an explicitly supplied encoder
def classify_email(email_text, model, k=5):
    """Classify one email with Gemini, using its ``k`` nearest indexed emails as context.

    Args:
        email_text: Raw text of the email to classify.
        model: Encoder exposing ``encode(text)``; used to embed the query.
        k: Number of similar emails to retrieve for the context.

    Returns:
        Whatever ``gemini_llm`` returns for the assembled prompt: the model's
        free-text answer, or its error sentinel string.
    """
    # The index holds embeddings of the *cleaned* corpus text (lower-cased,
    # non-word characters removed, whitespace collapsed). Apply the same
    # normalisation to the query so both sides are embedded the same way.
    normalized_query = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', email_text.lower()))
    query_embedding = model.encode(normalized_query)
    indices, _ = search_similar_emails(query_embedding, k)

    # Build context from retrieved similar emails. FAISS pads the result with
    # -1 when it has fewer than k vectors; `iloc[-1]` would silently pick the
    # last row, so those placeholders are skipped.
    context = " ".join(data['text'].iloc[idx] for idx in indices[0] if idx >= 0)

    # Create the classification prompt (the original, uncleaned email is shown
    # to the model).
    prompt = f"Based on the following context and email:\nContext: {context}\nEmail: {email_text}\nClassify as spam or not spam."

    # Use the Gemini API to classify the email
    return gemini_llm(prompt)

# Generate predictions for the entire dataset
def _verdict_to_label(response_text):
    """Reduce the model's free-text reply to 'spam' or 'ham'."""
    reply = response_text.lower()
    # Negative forms must be tested first: the prompt asks for "spam or not
    # spam", and "not spam" itself contains the substring "spam", so a plain
    # substring test would label every such reply as spam.
    if re.search(r"\b(?:not|non)[\s-]+spam\b|\bham\b", reply):
        return 'ham'
    # Anything that does not mention spam at all (including the API error
    # sentinel) falls back to 'ham', as before.
    return 'spam' if 'spam' in reply else 'ham'


# A plain loop (not a comprehension) so that predictions collected so far
# survive if one of the many API calls raises part-way through.
predictions = []
for email in data['text']:
    result = classify_email(email, model=loaded_model)
    predictions.append(_verdict_to_label(result))

# Compare predictions with actual labels. The numeric labels live in
# 'label_num' (0 for ham, 1 for spam); comparing the 'label' column to 1
# would mark every row as ham if that column holds text labels.
y_true = ['spam' if value == 1 else 'ham' for value in data['label_num']]

# Calculate evaluation metrics, treating 'spam' as the positive class
accuracy = accuracy_score(y_true, predictions)
precision = precision_score(y_true, predictions, pos_label='spam')
recall = recall_score(y_true, predictions, pos_label='spam')
f1 = f1_score(y_true, predictions, pos_label='spam')

# Display evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


# Debugging aid: show the first ten emails next to their true and predicted labels
preview_rows = zip(data['text'][:10], y_true[:10], predictions[:10])
for email, true_label, pred in preview_rows:
    print(f"Email: {email}\nTrue Label: {true_label}\nPredicted: {pred}\n")

# How many rows carry each value of the 'label' column
print("Label Distribution:")
label_counts = data['label'].value_counts()
print(label_counts)

# Look at one raw reply to see what the API actually sends back
first_email = data['text'][0]
sample_response = classify_email(first_email, model=loaded_model)
print("Sample API Response:", sample_response)
