In [None]:
pip install pandas scikit-learn nltk


In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [None]:
import pandas as pd

url = "https://raw.githubusercontent.com/mohitgupta-1O1/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"

df = pd.read_csv(url, encoding="latin-1", on_bad_lines='skip')[['v1', 'v2']]
df.columns = ['label', 'message']  # Rename columns for clarity

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print(df.head())



   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [None]:
import nltk
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')



def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  
    words = word_tokenize(text)  # Tokenize
    words = [word for word in words if word not in stopwords.words('english')] 
    return " ".join(words)


df['cleaned_message'] = df['message'].apply(clean_text)

print(df[['message', 'cleaned_message']])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/innovapathinc/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/innovapathinc/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                                message  \
0     Go until jurong point, crazy.. Available only ...   
1                         Ok lar... Joking wif u oni...   
2     Free entry in 2 a wkly comp to win FA Cup fina...   
3     U dun say so early hor... U c already then say...   
4     Nah I don't think he goes to usf, he lives aro...   
...                                                 ...   
5567  This is the 2nd time we have tried 2 contact u...   
5568              Will Ì_ b going to esplanade fr home?   
5569  Pity, * was in mood for that. So...any other s...   
5570  The guy did some bitching but I acted like i'd...   
5571                         Rofl. Its true to its name   

                                        cleaned_message  
0     go jurong point crazy available bugis n great ...  
1                               ok lar joking wif u oni  
2     free entry wkly comp win fa cup final tkts st ...  
3                   u dun say early hor u c already say  
4

In [5]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_message'])  # Transform text data into vectors
y = df['label']


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))


Accuracy: 0.9677
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [10]:
import joblib

In [None]:
joblib.dump(model, "spam_classifier_model.pkl")

joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
loaded_model = joblib.load("spam_classifier_model.pkl")

loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")

print("Model and vectorizer loaded successfully!")


Model and vectorizer loaded successfully!


In [None]:
def predict_spam(text):
    text_vectorized = loaded_vectorizer.transform([text])

    prediction = loaded_model.predict(text_vectorized)

    return "Spam" if prediction[0] == 1 else "Ham"

new_message = "let schedule a meet for further round next week "
print(f"Message: {new_message}")
print(f"Prediction: {predict_spam(new_message)}")


Message: let schedule a meet for further round next week 
Prediction: Ham
