In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the SMS spam corpus and keep only the two columns that matter:
# v1 holds the class name, v2 holds the raw message. They are renamed to
# 'label' / 'text', and the class name is encoded as an integer
# (ham -> 0, spam -> 1) so it can be used directly as a model target.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [13]:
# Derive three simple numeric descriptors from each message in df['text'].
def _count_digits(message):
    """Return how many characters of `message` satisfy str.isdigit()."""
    return sum(1 for ch in message if ch.isdigit())


messages = df['text']
df['text_len'] = messages.map(len)  # characters per message
df['num_words'] = messages.map(lambda message: len(message.split()))  # whitespace-separated tokens
df['num_digits'] = messages.map(_count_digits)  # digit characters per message

# Preview the frame with the new feature columns appended.
df.head()
    

Unnamed: 0,label,text,text_len,num_words,num_digits
0,0,"Go until jurong point, crazy.. Available only ...",111,20,0
1,0,Ok lar... Joking wif u oni...,29,6,0
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,25
3,0,U dun say so early hor... U c already then say...,49,11,0
4,0,"Nah I don't think he goes to usf, he lives aro...",61,13,0


In [17]:
# Design matrix (the three engineered numeric columns) and target vector.
feature_cols = ['text_len', 'num_words', 'num_digits']
X, y = df[feature_cols], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit Gaussian Naive Bayes on the scaled training data and predict the held-out labels.
gnb = GaussianNB().fit(X_train_scaled, y_train)
y_pred_gnb = gnb.predict(X_test_scaled)

# Report overall accuracy, then the per-class precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, y_pred_gnb)
print(f"Gaussian Naive Bayes Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gnb))

Gaussian Naive Bayes Accuracy: 0.98

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.92      0.93      0.93       150

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

