# Spam Email Classification

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [1]:
# Read the raw mail dataset into a DataFrame
df = pd.read_csv("mail_data.csv")

# Preview the first rows, then list the column names
for preview in (df.head(), df.columns):
    print(preview)

NameError: name 'pd' is not defined

In [None]:
# Report how many values are missing in each column
print(df.isnull().sum())

# Discard rows with missing values first, then exact duplicate rows
df = df.dropna().drop_duplicates()

In [None]:
# Bar chart of how many messages fall into each category
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df["Category"], palette='coolwarm', ax=ax)
ax.set_title("Class Distribution: Ham vs Spam")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
plt.show()

Encoding the labels for the classification model: ham as 0 and spam as 1

In [None]:
# Numeric label encoding: ham -> 0, spam -> 1 (any other value becomes NaN)
label_codes = {'ham': 0, 'spam': 1}
df["Category"] = df["Category"].map(label_codes)

Sampling a few messages

In [None]:
# Draw 5 messages with a fixed seed so the same rows are shown on every run
sampled_messages = df["Message"].sample(n=5, random_state=42)
print(sampled_messages)

Checking for special characters

In [None]:
import re  

# Flag messages that contain at least one of the listed special characters
has_special_char = df["Message"].str.contains(r"[@#$%^&*()<>?/|}{~:]", regex=True)
special_characters = df[has_special_char]

# Show the first few matching rows
print(special_characters.head(5))


Checking for HTML tags

In [None]:
# Flag messages containing something shaped like a tag: "<", any text, ">"
has_html_tag = df["Message"].str.contains(r"<.*?>", regex = True)
html_tags = df[has_html_tag]

# Show the first few matching rows
print(html_tags.head(5))

Detecting non-English messages

In [None]:
from langdetect import detect
  
# Function to detect language
def detect_language(text):
    """Return the language that ``detect`` reports for ``text``.

    Parameters
    ----------
    text : the message to classify; it is passed to ``detect`` unchanged.

    Returns
    -------
    Whatever ``detect(text)`` returns, or the sentinel string ``"error"``
    when the call raises an exception.
    """
    try:
        return detect(text)
    except Exception:
        # Catch ordinary failures only. A bare ``except`` would also trap
        # KeyboardInterrupt/SystemExit, making a long ``.apply`` over the
        # whole column impossible to interrupt.
        return "error"
    
# langdetect is non-deterministic by default. Fixing the factory seed before
# the first detection makes the detected languages reproducible across runs.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# Detect the language of every message and store it in a new column
df["Language"] = df["Message"].apply(detect_language)

# Summarise how many messages were assigned to each language
print(df["Language"].value_counts())

Detecting irrelevant characters

In [None]:
# Select messages containing at least one character that is not an ASCII
# letter, digit, or whitespace (punctuation, symbols, non-ASCII text, ...)
weird_chars = df[df["Message"].str.contains(r"[^a-zA-Z0-9\s]", regex=True)]

# Show up to 5 matching rows. The sample size is capped at the number of
# matches so sampling cannot fail when fewer than 5 rows match, and the seed
# is fixed so the same rows are shown on every run.
print(weird_chars.sample(n=min(5, len(weird_chars)), random_state=42))


## Combine Multiple Datasets for Diversity

In [None]:
# Read both source files with header=None so pandas assigns positional
# column labels instead of taking names from the first row
df1, df2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("mail_data.csv", "spam after new update.csv")
)

# Show the column labels of each file for inspection
print("File 1 Columns:", df1.columns)
print("File 2 Columns:", df2.columns)

In [None]:
# Load the two source files, keeping only their first two columns and giving
# both the same schema: Category, Message.

# mail_data.csv carries its own "Category"/"Message" header row (the first EDA
# section reads it with the default header and uses those column names).
# header=0 makes `names` replace that row; with header=None the header line
# would be loaded as a bogus data record.
df1 = pd.read_csv("mail_data.csv", header=0, usecols=[0, 1], names=['Category', 'Message'], encoding="ISO-8859-1")

# NOTE(review): the layout of this file is not visible here, so it is still
# read as headerless. If it also starts with a header row, switch to header=0
# as well — TODO confirm.
df2 = pd.read_csv("spam after new update.csv", header=None, usecols=[0, 1], names=['Category', 'Message'], encoding="ISO-8859-1")

# Remove rows where Category or Message is missing
df1 = df1.dropna(subset=['Category', 'Message'])
df2 = df2.dropna(subset=['Category', 'Message'])

# Stack both datasets into one frame with a fresh 0..n-1 index
df = pd.concat([df1, df2], ignore_index=True)

# Persist the combined dataset (without the index column)
df.to_csv("data.csv", index=False)

Repeating the EDA steps above for the combined dataset

In [None]:
# Report how many values are missing in each column of the combined data
print(df.isnull().sum())

# Discard rows with missing values first, then exact duplicate rows
df = df.dropna().drop_duplicates()

In [None]:
# Bar chart of how many messages fall into each category in the combined data
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df["Category"], palette='coolwarm', ax=ax)
ax.set_title("Class Distribution: Ham vs Spam")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
plt.show()

The data is imbalanced, so SMOTE is applied to balance the classes

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the combined dataset from disk
df = pd.read_csv("data.csv")

# Keep only rows that have both a label and a message
df = df.dropna(subset=['Category', 'Message'])

# Encode the labels (ham -> 0, spam -> 1); any other label maps to NaN ...
df = df.assign(Category=df['Category'].map({'ham': 0, 'spam': 1}))

# ... and those unmapped rows are removed here
df = df.dropna(subset=['Category'])

# Turn the message text into TF-IDF features; the labels are the target
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Message'])
y = df['Category']

# Sanity check before resampling: this should print 0
print("Missing values in y:", y.isna().sum())

# Balance the classes with SMOTE (seeded for reproducibility).
# NOTE(review): no train/test split is visible in this section. If one
# follows, resample only the training portion — TODO confirm.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Build a small frame with readable labels for the plot
df_resampled = pd.DataFrame({'Category': y_resampled})
df_resampled['Category'] = df_resampled['Category'].map({0: 'ham', 1: 'spam'})

# Class distribution after balancing
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df_resampled["Category"], palette='coolwarm', ax=ax)
ax.set_title("Class Distribution: Ham vs Spam (After Balancing)")
ax.set_xlabel("Category")
ax.set_ylabel("Count")
plt.show()
