In [6]:
import pandas as pd
import numpy as np

In [8]:
# Read the Excel file without the 'sep' argument
df = pd.read_csv("C:\\Users\\Pooja\\Downloads\\SMSSpamCollection",sep="\t" ,
                   names=["label", "message"]) 
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pooja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Text Cleaning

In [11]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [13]:
corpus = []

for i in range(len(df)):
    rp = re.sub('[^a-zA-Z]', " ", df["message"][i])  # Use re.sub on string
    rp = rp.lower()
    rp = rp.split()
    rp = [ps.stem(word) for word in rp if not word in set(stopwords.words("english"))]
    rp = " ".join(rp)
    corpus.append(rp)


# Vectorization

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(corpus).toarray()


In [16]:
y=pd.get_dummies(df["label"],drop_first=True)


**Train-Test Split**

In [17]:
from sklearn.model_selection import train_test_split
X_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=True)

# Modelling

**Navie Bayes Classifier with default parameter**

In [18]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


# Prediction

In [20]:
ypred_test=model.predict(x_test)
ypred_train=model.predict(X_train)

# Evaluation

In [21]:
from sklearn.metrics import accuracy_score
print("Train Accuracy:",accuracy_score(y_train,ypred_train))
print("Test Accuracy:",accuracy_score(y_test,ypred_test))

Train Accuracy: 0.9833333333333333
Test Accuracy: 0.9826555023923444


# 1. Save and Load the Model and Vectorizer:



In [25]:
import joblib

# Save the trained model and vectorizer
joblib.dump(model, "email_spam_classifier.pkl")  # Save the model
joblib.dump(cv, "email_vectorizer.pkl")          # Save the vectorizer

# Load the model and vectorizer when needed
loaded_model = joblib.load("email_spam_classifier.pkl")  # Load the model
loaded_vectorizer = joblib.load("email_vectorizer.pkl")  # Load the vectorizer



# 2. Clean New Text:


In [26]:
 def clean_email(email):
    ps = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    
    # Step 1: Remove email addresses
    email = re.sub(r'\b[\w.%+-]+@[a-zA-Z.-]+\.[a-zA-Z]{2,}\b', " ", email)
    
    # Step 2: Remove URLs
    email = re.sub(r'http[s]?://\S+|www\.\S+', " ", email)
    
    # Step 3: Remove non-alphabetic characters
    email = re.sub('[^a-zA-Z]', " ", email)
    
    # Step 4: Convert to lowercase
    email = email.lower()
    
    # Step 5: Tokenize the text
    email = email.split()
    
    # Step 6: Remove stopwords and apply stemming
    email = [ps.stem(word) for word in email if word not in stop_words]
    
    # Step 7: Rejoin the processed words into a single string
    return " ".join(email)


# 3. Predict New Comments:


In [30]:
# Function to predict whether an email is spam or ham
def predict_email_spam(email):
    # Step 1: Clean the email
    cleaned_email = clean_email(email)
    
    # Step 2: Transform the email text to the format required by the vectorizer
    vectorized_email = loaded_vectorizer.transform([cleaned_email])
    
    # Step 3: Predict spam or ham using the loaded model
    prediction = loaded_model.predict(vectorized_email)
    
    # Step 4: Map the prediction to 'Spam' or 'Ham'
    return "Spam" if prediction[0] == 1 else "Ham"

# Example usage
new_email = """
Hi John,
This is a reminder for our meeting scheduled tomorrow at 10 AM. Let me know if you need to reschedule.
Best regards,
Sarah


"""
print("Prediction:", predict_email_spam(new_email))


Prediction: Ham
