# SPAM SMS DETECTION
## Build an AI model that can classify SMS messages as spam orlegitimate. Use techniques like TF-IDF or word embeddings withclassifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages

In [12]:
# importing the necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.pipeline import Pipeline

In [13]:
# Loading the SMS spam dataset
df = pd.read_csv("D:\\Codesoft\\Spam\\spam.csv", encoding='latin-1')

In [14]:
# Preprocessing: Removing stop words and stemming
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

In [15]:
def preprocess_text(text):
    words = text.split()
    words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return " ".join(words)

In [16]:
df['SMS'] = df['SMS'].apply(preprocess_text)

In [17]:
# Spliting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['SMS'], df['Label'], test_size=0.2, random_state=42)

In [18]:
# Creating a pipeline with TF-IDF vectorizer and Naive Bayes classifier
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

In [19]:
X_test.head(10)

3245    funni fact nobodi teach volcano 2 erupt, tsuna...
944     sent score sopha secondari applic schools. thi...
1044    know someon know fanci you. call 09058097218 f...
2484     promis get soon can. text morn let know made ok.
812     congratul ur award either å£500 cd gift vouche...
2973                       i'll text carlo let know, hang
2991                     k.i did't see you.:)k:)wher now?
2942                  message..no responce..what happend?
230     get gandhipuram walk cross cut road. right sid...
1181                                    flippin shit yet?
Name: SMS, dtype: object

In [20]:
predictions

array(['Not spam', 'Not spam', 'Not spam', ..., 'Not spam', 'Not spam',
       'spam'], dtype='<U8')

In [21]:
y_test.head(10)

3245    Not spam
944     Not spam
1044        spam
2484    Not spam
812         spam
2973    Not spam
2991    Not spam
2942    Not spam
230     Not spam
1181    Not spam
Name: Label, dtype: object

In [22]:
#checking the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy_score(predictions,y_test)

0.9668161434977578

In [23]:
#displaying full data
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
np.set_printoptions(threshold=np.inf)

In [24]:
#confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_rep)

Confusion Matrix:
[[965   0]
 [ 37 113]]
Classification Report:
              precision    recall  f1-score   support

    Not spam       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [25]:
# Taking the user input
user_input = input("Enter an SMS message: ")

# Making predictions on the user input
user_prediction = model.predict([preprocess_text(user_input)])

# Displaying the prediction
print(f"Prediction: {user_prediction[0]}")

Enter an SMS message: congratul ur award either å£500 cd gift vouche...
Prediction: spam
