In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [3]:
# Read the raw SMS dataset into a DataFrame.
# The file is decoded as ISO-8859-1 (the encoding passed explicitly below).
url = '/content/spam.csv'
data = pd.read_csv(
    filepath_or_buffer=url,
    encoding='ISO-8859-1',
)

In [4]:
# Keep only the first two columns (label, message text) and give them
# readable names. Selecting by position instead of dropping columns[2:5]
# means the cell works no matter how many extra trailing columns the CSV
# carries, and it stays safe to re-run after later cells add columns.
# .copy() makes the result an independent frame so later column
# assignments do not act on a slice of the raw data.
data = data.iloc[:, :2].copy()
data.columns = ['Category', 'Message']

In [5]:
# Count missing values per column (displayed as the cell output).
data.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Binary target column: 1 where Category is exactly 'spam', 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# 0/1 integer values are the same.
data['spam'] = data['Category'].eq('spam').astype('int64')

In [7]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'],
    data['spam'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Bag-of-words features: the vectorizer is fitted on the training messages
# only, then used to transform them into a count matrix.
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(raw_documents=X_train)

In [9]:
# Fit a Multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [10]:
# Score the fitted model on the held-out split.
# The test messages go through the already-fitted vectorizer (transform
# only, no refit) so they are encoded with the training vocabulary.
X_test_count = vectorizer.transform(X_test)
accuracy = model.score(X=X_test_count, y=y_test)

print(f'Accuracy: {accuracy}')

Accuracy: 0.9838565022421525


In [11]:
# The same vectorize-then-classify workflow bundled into one Pipeline, so
# raw text can be passed straight to fit/score.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]).fit(X_train, y_train)

accuracy_pipeline = clf.score(X_test, y_test)
print(f'Accuracy using pipeline: {accuracy_pipeline}')

Accuracy using pipeline: 0.9838565022421525


In [12]:
# New, unseen message(s) to classify with the model fitted earlier in this
# notebook. No pre-trained model is loaded here; this cell only defines the
# input text.
new_sentences = [
    "Your account have 100 debeted, is waiting to be collected. Simply text the password 'MIX' to 85069 to verify. Get Usher and Britney. FML"
]

In [13]:
# Classify the new messages: encode them with the fitted vectorizer, then
# predict with the fitted Naive Bayes model (label 1 means spam).
new_sentences_count = vectorizer.transform(new_sentences)
predictions = model.predict(new_sentences_count)

for sentence, prediction in zip(new_sentences, predictions):
    verdict = (
        f"'{sentence}' is a spam message."
        if prediction == 1
        else f"'{sentence}' is not a spam message."
    )
    print(verdict)

'Your account have 100 debeted, is waiting to be collected. Simply text the password 'MIX' to 85069 to verify. Get Usher and Britney. FML' is a spam message.
