In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the spam dataset with pandas.
# The location can be overridden with the SPAM_CSV_PATH environment variable so the
# notebook also runs on other machines; without it the original local path is used.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", "/home/sunail/Downloads/spam.csv")

df = pd.read_csv(DATA_PATH)

In [None]:
# Explore the dataset: preview the first five rows
df.head(5)

In [None]:
# Also look at the last rows of the dataset
df.tail()

In [None]:
# List the column names (the available features)
df.columns

In [None]:
# Summary statistics of the columns
df.describe()

In [None]:
# Check the data type of each column
df.dtypes

In [None]:
# Concise overview of the frame: columns, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Total number of cells in the frame (rows x columns), not the number of rows;
# use df.shape to see the row and column counts separately.
df.size

In [None]:
# Preview the first ten rows
df.head(10)

In [None]:
# Encode the label as a number so the models can use it:
# 1 when Category is 'spam', 0 for anything else.

df['spam'] = df['Category'].eq('spam').astype('int64')

In [None]:
# Display the frame to confirm the new 'spam' column
df

In [None]:
# Non-null entries per column for each Category value (shows the class balance)
df.groupby('Category').count()

In [None]:
# Hold out 25% of the messages for testing.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the same
# split (and therefore the same metrics); stratify keeps the spam / not-spam ratio
# the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

In [None]:
# Check how many samples ended up in each split: number of training messages

X_train.size

In [None]:
# Number of test messages
X_test.size

In [None]:
# Number of training labels (should equal X_train.size)
y_train.size

In [None]:
# Number of test labels (should equal X_test.size)
y_test.size

# Convert the text messages into numerical TF-IDF features that machine learning models can work with.

In [None]:
# Fit the TF-IDF vectorizer on the training messages only, then reuse the fitted
# vectorizer to transform the test messages, so nothing learned from the test set
# ends up in the features.
vectorizer=TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized= vectorizer.transform(X_test)

In [None]:
# Naive Bayes (multinomial) classifier

In [None]:
# Train Multinomial Naive Bayes on the TF-IDF features, timing only the fit.
nb_model = MultinomialNB()
# perf_counter() is the clock intended for measuring short durations;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
nb_model.fit(X_train_vectorized, y_train)
nb_time = time.perf_counter() - start_time  # seconds spent in fit(); predict() is not timed
nb_predictions = nb_model.predict(X_test_vectorized)

In [None]:
# Decision Tree, used here as a stand-in for Weka's J48 (C4.5).
# NOTE: scikit-learn implements an optimised CART algorithm, so this is an
# approximation of J48 rather than an exact equivalent.
# random_state fixes the tree's internal randomness so the results are reproducible.
dt_model = DecisionTreeClassifier(random_state=42)
# perf_counter() is the clock intended for measuring short durations.
start_time = time.perf_counter()
dt_model.fit(X_train_vectorized, y_train)
dt_time = time.perf_counter() - start_time  # seconds spent in fit(); predict() is not timed
dt_predictions = dt_model.predict(X_test_vectorized)


# Evaluate the performance of the models

In [None]:
def evaluate_model(predictions, model_name, y_true=None):
    """Print and return the accuracy of a model's predictions.

    Parameters
    ----------
    predictions : array-like
        Predicted labels, in the same order as the true labels.
    model_name : str
        Label used as the prefix of the two printed lines.
    y_true : array-like, optional
        True labels to score against. When omitted, the notebook-level
        ``y_test`` is used, so existing two-argument calls keep working.

    Returns
    -------
    float
        The accuracy computed by ``metrics.accuracy_score``. The error rate
        that is printed alongside it is ``1 - accuracy``.
    """
    if y_true is None:
        # Fall back to the held-out labels defined at notebook level.
        y_true = y_test
    accuracy = metrics.accuracy_score(y_true, predictions)
    error_rate = 1 - accuracy
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Error Rate: {error_rate:.4f}")
    return accuracy

# Naive Bayes: print accuracy and error rate on the test labels,
# followed by the time that was measured around its fit() call.
evaluate_model(nb_predictions, "Naive Bayes")
print(f"Naive Bayes Processing Time: {nb_time:.4f} seconds\n")

# Decision Tree: the same report, using the time measured around its fit() call.
evaluate_model(dt_predictions, "Decision Tree (J48)")
print(f"Decision Tree Processing Time: {dt_time:.4f} seconds")
