## 📦 Import Required Libraries  
Importing all the necessary libraries for data processing, visualization, natural language processing (NLP), and model building.


In [None]:
#Importing all the libraries to be used
import string
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")

## 📥 Load Dataset and Preview Shape  
Loading the SMS Spam Collection dataset and selecting only the relevant columns (`v1` as label and `v2` as message text). Displaying the shape of the dataset.


In [None]:
# Read the latin-1 encoded CSV and keep only the two columns used in this
# notebook: `v1` (label) and `v2` (message text).
df = pd.read_csv('data/spam.csv', encoding='latin-1').loc[:, ['v1', 'v2']]

# Show (rows, columns) as the cell output.
df.shape


## 📊 Visualize Class Distribution  
Plotting the count of each class (ham vs spam) to visualize data imbalance and distribution of target labels.

In [None]:
# One colour per class bar.
cols = ["#4CAF50", "#E598D8"]

# Check whether the target is imbalanced: draw one bar per distinct label in
# `v1`, on an explicitly created 12x8 inch Axes rather than the implicit
# "current axes".
fg = plt.figure(figsize=(12, 8)).add_subplot()
sns.countplot(x=df["v1"], palette=cols, ax=fg)

# Title and axis labels share the same text colour.
fg.set_title("Count Plot of Classes", color="#58508d")
fg.set_xlabel("Classes", color="#58508d")
fg.set_ylabel("Number of Data points", color="#58508d")

plt.show()

## ⚠️ Precision Over Accuracy  
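Since the dataset is imbalanced, accuracy alone can be misleading: a model that always predicts the majority class would still score highly. We therefore prioritize **precision** (the share of messages flagged as spam that really are spam) over accuracy during evaluation.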


In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and
# memory usage. Useful here to confirm neither column has missing values.
df.info()

## 🔍 Sample Records  
Previewing 5 random samples from the dataset to get a better understanding of the data.


In [None]:
# Preview 5 randomly chosen rows. The fixed seed keeps the preview identical
# across "Restart & Run All", so the displayed output is reproducible.
df.sample(5, random_state=42)

## 🏷️ Rename Columns  
Renaming the columns for clarity: `v1` becomes `target`, and `v2` becomes `text`.


In [None]:
# Replace the raw column names with descriptive ones. `df` is rebound to the
# renamed frame rather than being mutated in place.
df = df.rename(
    columns={
        'v1': 'target',
        'v2': 'text',
    }
)

# Show the first rows to confirm the new column names.
df.head()

## 🔢 Encode Target Labels  
Encoding the categorical labels into numeric format: `ham` becomes 0 and `spam` becomes 1.


In [None]:
# Learn the set of distinct labels, then overwrite the `target` column with
# their integer codes. The fitted encoder is kept so the codes can be mapped
# back to the original label strings later.
encoder = LabelEncoder()
df['target'] = encoder.fit(df['target']).transform(df['target'])

# Show the first rows to confirm the labels are now numeric.
df.head()

## 📋 Check and Remove Duplicates  
Identifying and removing duplicate rows to ensure data quality and avoid data leakage (the same message appearing in both the training and the test set).


In [None]:
# Count rows that are exact repeats of an earlier row (every column equal).
df.duplicated().sum()

In [10]:
# Keep only the first occurrence of each repeated row. `df` is rebound to the
# de-duplicated frame; the original row index labels are retained.
df = df.drop_duplicates()

## 🧠 Text Vectorization  
Converting the message text into a bag-of-words model using CountVectorizer, which turns text into numerical vectors.


In [19]:
# Labels: the numeric `target` column.
y = df['target']

# Features: tokenize each message and build a sparse bag-of-words count
# matrix with one row per message and one column per vocabulary term.
# NOTE(review): the vocabulary is learned from every message, including the
# ones that later end up in the test split. Fitting on the training messages
# only would keep the test set fully unseen.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])


## 🔀 Split Dataset  
Splitting the data into training and testing sets with an 80-20 ratio to evaluate model generalization.


In [20]:
# Train/Test Split
# Hold out 20% of the messages for testing, with a fixed seed for
# reproducibility. The classes are imbalanced, so stratify on the label to
# give the train and test sets the same class proportions as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## 🌲 Train Random Forest Model  
Training a Random Forest Classifier with 100 estimators on the training data to predict spam messages.


In [None]:
# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the training split in one step; `fit` returns the estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Display the fitted estimator as the cell output.
model

## 📈 Evaluate Model Performance  
Evaluating model predictions using accuracy, confusion matrix, and classification report. Visualizing the confusion matrix with a heatmap.


In [None]:
# Evaluate the model on the held-out test split.
y_pred = model.predict(X_test)

# Class names in encoded order (position i is the name of label i), so the
# report rows are labelled with the original class names rather than 0/1.
class_names = [str(label) for label in encoder.classes_]

print("Accuracy:", accuracy_score(y_test, y_pred))
# Rows of the confusion matrix are actual classes, columns are predictions.
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Per-class precision, recall and F1; precision is the metric of interest here.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=class_names),
)

In [None]:
# Draw the confusion matrix as an annotated heatmap (integer cell counts) and
# label it through the Axes object that seaborn returns.
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap='Blues',
).set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
plt.show()
