In [None]:
# Import libraries for data manipulation
import pandas as pd  # For handling data in DataFrame format
import numpy as np  # For numerical operations and array handling

# Import libraries for text preprocessing
import re  # For regular expression operations to clean text
import emoji  # For handling and removing emojis
from nltk.corpus import stopwords  # For accessing English stop words
from nltk.tokenize import word_tokenize  # For splitting text into words
from nltk.stem import WordNetLemmatizer  # For lemmatizing words

# Import libraries for feature extraction and modeling
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to TF-IDF features
from sklearn.model_selection import train_test_split  # For splitting data into train/test sets
from sklearn.model_selection import cross_val_score  # For performing cross-validation
from sklearn.pipeline import Pipeline  # For creating a modeling pipeline
from sklearn.linear_model import LogisticRegression  # For Logistic Regression model
from sklearn.ensemble import RandomForestClassifier  # For Random Forest model
from xgboost import XGBClassifier  # For XGBoost model

# Import libraries for model evaluation
from sklearn.metrics import classification_report  # For detailed classification metrics
from sklearn.metrics import confusion_matrix  # For confusion matrix computation
from sklearn.metrics import roc_curve, auc  # For ROC curve and AUC calculation
from sklearn.metrics import precision_recall_curve  # For precision-recall curve

# Import libraries for visualization
import matplotlib.pyplot as plt  # For creating plots
import seaborn as sns  # For enhanced visualizations
from wordcloud import WordCloud  # For generating word clouds
from collections import Counter  # For counting word frequencies

# Import library for model persistence
import joblib  # For saving and loading models

# Import NLTK for NLP tasks
import nltk  # For natural language processing utilities

# Download required NLTK data.
# Newer NLTK releases look up the 'punkt_tab' resource for word_tokenize,
# while older releases use 'punkt'; both are requested so the tokenizer works
# on either. NOTE(review): on releases that do not know 'punkt_tab' the
# download call is expected to report the miss and carry on -- confirm
# against the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)  # Tokenizer data, stop words, lemmatizer data

# Set random seed for reproducibility
np.random.seed(42)  # Seed NumPy's global RNG so runs are repeatable

# Load the dataset
df = pd.read_csv('merged_spam_dataset.csv')  # Read the CSV file into a DataFrame

# Work on a small random subset. The size is capped at the number of rows
# available because DataFrame.sample raises ValueError when asked for more
# rows than exist (sampling without replacement).
SAMPLE_SIZE = 100
df = df.sample(n=min(SAMPLE_SIZE, len(df)))

# Display basic dataset information
print("Dataset Info:")  # Print header for dataset info
df.info()  # info() prints its own report and returns None, so it is not wrapped in print()

# Check for missing values
print("\nMissing Values:")  # Print header for missing values
print(df.isnull().sum())  # Display count of missing values per column

# Handle missing values
df = df.dropna(subset=['message', 'label'])  # Drop rows with missing message or label
print("\nMissing Values After Handling:")  # Print header for updated missing values
print(df.isnull().sum())  # Display updated missing values count

# Check for duplicates
print("\nDuplicate Rows:")  # Print header for duplicates
print(df.duplicated().sum())  # Display count of duplicate rows
df = df.drop_duplicates()  # Remove duplicate rows
print("Duplicate Rows After Handling:", df.duplicated().sum())  # Confirm duplicates removed

# Display class distribution
print("\nClass Distribution:")  # Print header for class distribution
print(df['label'].value_counts())  # Show how many rows carry each label value

# Plot 1: Class distribution bar plot
plt.figure(figsize=(6, 4))  # Set figure size for plot
sns.countplot(x='label', data=df)  # Create bar plot of label counts
plt.title('Class Distribution (0: Not Spam, 1: Spam)')  # Set plot title
plt.xlabel('Label')  # Set x-axis label
plt.ylabel('Count')  # Set y-axis label
plt.savefig('class_distribution.png')  # Save plot as PNG
plt.show()  # Display the plot

# Define comprehensive text preprocessing function

# Patterns used by preprocess_text, compiled once instead of on every call
_URL_RE = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_EMAIL_RE = re.compile(r'\S+@\S+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Filled on first use so the lemmatizer and stop-word set are built only once
# rather than once per message.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (lemmatizer, stop_words) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize a raw message into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, strip emojis, strip URLs, strip e-mail
    addresses, drop every character that is not an ASCII letter or
    whitespace, collapse whitespace, tokenize, drop English stop words and
    tokens of two characters or fewer, lemmatize.

    Args:
        text: The raw message. Any non-string value is treated as empty.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning or
        when ``text`` is not a string.
    """
    # Non-string input (e.g. NaN from pandas) has no tokens to keep
    if not isinstance(text, str):
        return ''

    lemmatizer, stop_words = _get_text_resources()

    # Convert text to lowercase for consistency
    text = text.lower()

    # Remove emojis to clean text
    text = emoji.replace_emoji(text, replace='')

    # Remove URLs and e-mail addresses before punctuation is stripped,
    # while they are still recognizable as single whitespace-free runs
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)

    # Remove digits, punctuation and other special characters in one pass
    # (digits are not in [a-zA-Z\s], so no separate digit pass is needed)
    text = _NON_LETTER_RE.sub('', text)

    # Remove extra whitespace
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Tokenize text into words
    tokens = word_tokenize(text)

    # Remove stop words and very short tokens, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Apply preprocessing to the message column.
# assign() returns a new DataFrame, which avoids pandas' SettingWithCopyWarning
# that column assignment on a previously filtered frame can trigger.
df = df.assign(cleaned_message=df['message'].apply(preprocess_text))

# Remove messages that became empty after preprocessing and report how many
# rows were actually dropped (plus how many are left).
rows_before_filter = len(df)
df = df[df['cleaned_message'] != ''].copy()  # copy() makes df an independent frame
print("\nRows with Empty Messages Removed:", rows_before_filter - len(df))  # Number of dropped rows
print("Rows Remaining After Cleaning:", len(df))  # Updated row count

# Display sample of cleaned text
print("\nSample of Cleaned Messages:")  # Print header for cleaned samples
print(df[['message', 'cleaned_message']].head())  # Show original vs cleaned text

# Define features and target
X = df['cleaned_message']  # Feature column (cleaned text)
y = df['label']  # Target column

# Split data into training and testing sets (80-20, stratified on the label
# so both splits keep the same class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Print dataset shapes
print("\nTraining Set Shape:", X_train.shape)  # Show training data dimensions
print("Testing Set Shape:", X_test.shape)  # Show testing data dimensions

# Candidate models: every pipeline pairs its own TF-IDF vectorizer
# (unigrams + bigrams, at most 5000 features) with one classifier.
def _new_tfidf():
    """Return a fresh, unfitted TF-IDF vectorizer with the shared settings."""
    return TfidfVectorizer(max_features=5000, ngram_range=(1, 2))


# Logistic Regression pipeline
lr_pipeline = Pipeline([
    ('tfidf', _new_tfidf()),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Random Forest pipeline
rf_pipeline = Pipeline([
    ('tfidf', _new_tfidf()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# XGBoost pipeline
xgb_pipeline = Pipeline([
    ('tfidf', _new_tfidf()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
])

# Display name -> pipeline, in the order the models are evaluated
pipelines = dict([
    ('Logistic Regression', lr_pipeline),
    ('Random Forest', rf_pipeline),
    ('XGBoost', xgb_pipeline),
])

# Per-model arrays of fold accuracies from 5-fold cross-validation
cv_scores = {}
for name, candidate in pipelines.items():
    scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores[name] = scores
    # Report the individual folds, then their mean and spread
    print(f"\n{name} Cross-Validation Accuracy Scores: {scores}")
    print(f"{name} Mean CV Accuracy: {scores.mean():.4f}")
    print(f"{name} Standard Deviation: {scores.std():.4f}")

# Pick the model whose mean fold accuracy is highest
mean_accuracy = {label: fold_scores.mean() for label, fold_scores in cv_scores.items()}
best_model_name = max(mean_accuracy, key=mean_accuracy.get)
best_pipeline = pipelines[best_model_name]
print(f"\nBest Model: {best_model_name} with Mean CV Accuracy: {cv_scores[best_model_name].mean():.4f}")

# Fit the selected pipeline on the full training split, then predict the test split
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)

# Precision, recall and F1 per class
print("\nClassification Report for Best Model:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Plot 2: Confusion matrix heatmap
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)  # Integer counts in each cell
ax_cm.set_title('Confusion Matrix')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
fig_cm.savefig('confusion_matrix.png')
plt.show()

# Plot 3: ROC curve
# Column 1 of predict_proba is the probability of the second class in classes_
y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
fig_roc, ax_roc = plt.subplots(figsize=(6, 4))
ax_roc.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # Chance-level diagonal for reference
ax_roc.set_title('ROC Curve')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.legend()
fig_roc.savefig('roc_curve.png')
plt.show()

# Plots 4-7: word clouds and top-10 word bar charts for each class.
def _render_wordcloud(text, title, filename):
    """Build a word cloud from *text*, show it, save it to *filename*, and return it."""
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # A word cloud has no meaningful axes
    plt.title(title)
    plt.savefig(filename)
    plt.show()
    return cloud


def _plot_top_words(word_list, title, filename):
    """Plot the 10 most frequent words; return (counter, top pairs, words, counts)."""
    frequency = Counter(word_list)
    top_pairs = frequency.most_common(10)
    top_words, top_counts = zip(*top_pairs)  # Split (word, count) pairs into two tuples
    plt.figure(figsize=(8, 4))
    sns.barplot(x=list(top_counts), y=list(top_words))
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.savefig(filename)
    plt.show()
    return frequency, top_pairs, top_words, top_counts


# Row selectors for the two classes (label 1 = spam, label 0 = not spam)
_spam_rows = df['label'] == 1
_non_spam_rows = df['label'] == 0

# Plot 4: Word cloud for spam emails
spam_text = ' '.join(df.loc[_spam_rows, 'cleaned_message'])
wordcloud = _render_wordcloud(spam_text, 'Word Cloud for Spam Emails', 'spam_wordcloud.png')

# Plot 5: Word cloud for non-spam emails
non_spam_text = ' '.join(df.loc[_non_spam_rows, 'cleaned_message'])
wordcloud = _render_wordcloud(non_spam_text, 'Word Cloud for Non-Spam Emails', 'non_spam_wordcloud.png')

# Plot 6: Top 10 words in spam emails
spam_words = spam_text.split()
spam_word_counts, top_spam_words, words, counts = _plot_top_words(
    spam_words, 'Top 10 Words in Spam Emails', 'top_spam_words.png'
)

# Plot 7: Top 10 words in non-spam emails
non_spam_words = non_spam_text.split()
non_spam_word_counts, top_non_spam_words, words, counts = _plot_top_words(
    non_spam_words, 'Top 10 Words in Non-Spam Emails', 'top_non_spam_words.png'
)

# Plot 8: Email length distribution by class
# assign() builds a new DataFrame instead of writing a column into df in place;
# df was produced by boolean filtering earlier, and in-place column assignment
# on such a frame can raise pandas' SettingWithCopyWarning.
df = df.assign(email_length=df['cleaned_message'].apply(len))  # Length of each cleaned message in characters
plt.figure(figsize=(8, 4))  # Set figure size
sns.histplot(data=df, x='email_length', hue='label', multiple='stack')  # Stacked histogram per label
plt.title('Email Length Distribution by Class')  # Set plot title
plt.xlabel('Email Length (Characters)')  # Set x-axis label
plt.ylabel('Count')  # Set y-axis label
plt.savefig('email_length_distribution.png')  # Save plot as PNG
plt.show()  # Display the plot

# Plot 9: Feature importance (Top 10 TF-IDF features)
feature_names = best_pipeline.named_steps['tfidf'].get_feature_names_out()  # Get TF-IDF feature names
fitted_classifier = best_pipeline.named_steps['classifier']
if best_model_name == 'Logistic Regression':
    # Linear model: signed coefficients (first row of coef_)
    coef = fitted_classifier.coef_[0]
else:
    # Tree-based models expose feature_importances_ instead of coefficients
    coef = fitted_classifier.feature_importances_
# argsort is ascending, so the last 10 positions hold the largest magnitudes
top_indices = np.argsort(np.abs(coef))[-10:]
plt.figure(figsize=(8, 4))  # Set figure size
sns.barplot(x=coef[top_indices], y=[feature_names[i] for i in top_indices])  # Create bar plot
plt.title('Top 10 TF-IDF Features by Importance')  # Set plot title
plt.xlabel('Importance Value')  # Set x-axis label
plt.ylabel('Feature')  # Set y-axis label
plt.savefig('feature_importance.png')  # Save plot as PNG
plt.show()  # Display the plot

# Plot 10: Precision-recall curve, from the same test-set probabilities as the ROC plot
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
fig_pr, ax_pr = plt.subplots(figsize=(6, 4))
ax_pr.plot(recall, precision, label='Precision-Recall Curve')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.legend()
fig_pr.savefig('precision_recall_curve.png')
plt.show()

# Persist the fitted best pipeline (vectorizer + classifier) to disk
model_path = 'spam_classifier_pipeline.pkl'
joblib.dump(best_pipeline, model_path)

# Confirm the model was written
print("\nBest Pipeline Saved Successfully.")

ModuleNotFoundError: No module named 'xgboost'