# Install necessary packages

In [None]:
!pip install scikit-learn pandas numpy matplotlib seaborn

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the Dataset

In [None]:
# Read the e-mail dataset from a CSV file in the current working directory.
dataset_path = 'spam_ham_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the top of the table; as the cell's last expression it is rendered
# as the cell output.
df.head()


# Data Exploration

In [None]:
# Number of missing values per column. Only the last expression of a notebook
# cell is echoed automatically, so this intermediate result is printed
# explicitly instead of being silently discarded.
print(df.isnull().sum())

# Column names, dtypes and non-null counts (info() prints its own report).
df.info()

# Class balance: how many rows carry each label (spam vs non-spam).
# Left as the final expression so it is rendered as the cell output.
df['label'].value_counts()


# Preprocessing Data

In [None]:
# Convert the labels (spam, ham) to numerical values. LabelEncoder assigns
# codes in sorted order of the class names, so this yields 0 for ham and
# 1 for spam (assuming the column holds exactly those two strings).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Target variable
y = df['label']

# Split the raw text *before* vectorizing so that the held-out 20% never
# influences the TF-IDF vocabulary or IDF weights (avoids train/test leakage).
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Learn the TF-IDF mapping (English stop words removed, at most 5000 terms)
# from the training text only, then apply that same mapping to the test text.
# The results are kept as sparse matrices: scikit-learn's linear models accept
# them directly, and a dense copy would consist mostly of zeros.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


# Train a Classification Model

In [None]:
# Build a Logistic Regression classifier with the library's default settings
# and train it on the training split. fit() returns the estimator itself, so
# construction and training are chained into a single statement.
model = LogisticRegression().fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
model


# Model Evaluation

In [None]:
# Predict a label for every sample in the held-out test split.
y_pred = model.predict(X_test)

# Share of test samples whose predicted label matches the true label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Draw the confusion matrix as an annotated heatmap. The tick labels are
# positional: they rely on class 0 being ham and class 1 being spam.
class_names = ['Ham', 'Spam']
conf_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Text summary of the per-class metrics, using the same display names.
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


# Model Improvement

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for Logistic Regression.
param_grid = {
    'C': [0.1, 1, 10],  # Inverse of regularization strength (smaller = stronger)
    'solver': ['liblinear', 'saga'],  # Optimization algorithms
}

# Base estimator for the search. Both searched solvers use random_state, so
# fixing it makes the tuning results reproducible between runs (42 matches the
# seed used for the train/test split). The iteration cap is raised above the
# default of 100 to give 'saga' room to converge.
base_estimator = LogisticRegression(random_state=42, max_iter=1000)

# Exhaustively evaluate every C/solver combination with 3-fold
# cross-validation on the training split, ranking candidates by accuracy.
grid_search = GridSearchCV(base_estimator, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Display the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the tuned model: best_estimator_ is the winning configuration
# refitted on the whole training split; it is scored on the untouched test set.
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print(f'Accuracy after tuning: {accuracy_tuned * 100:.2f}%')
