# **SPAM Message Detection**

# Import necessary libraries

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it
# later in this notebook when the stop-word set is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load the dataset

In [None]:
# Read the raw CSV (latin-1 encoded), keep only the 'v2' and 'v1' columns in
# that order, and rename them to 'messages' and 'Label'. Everything is done in
# one chained expression that returns a new frame, so no intermediate
# DataFrame is mutated in place.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v2', 'v1']]
    .rename(columns={'v2': 'messages', 'v1': 'Label'})
)

# Text cleaning function

In [None]:
# English stop words from NLTK, held in a set so that clean_text can test
# each token for membership quickly.
stopwords_set = set(
    stopwords.words('english')
)

def clean_text(text, stop_words=None):
    """Normalize a raw message into space-separated lowercase tokens.

    The text is lowercased, every character other than an ASCII digit or
    lowercase letter is treated as a token separator, and tokens present in
    the stop-word collection are dropped.

    Args:
        text: Message to clean. ``None`` and NaN floats (e.g. a missing value
            in a pandas column) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.
        stop_words: Optional collection of tokens to discard. When omitted,
            the module-level ``stopwords_set`` is used.

    Returns:
        str: The surviving tokens joined by single spaces, or ``""`` when
        nothing remains.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    if stop_words is None:
        stop_words = stopwords_set

    # Once lowercased, anything that is not 0-9 or a-z (punctuation,
    # whitespace, symbols) becomes a space; split() then absorbs runs of
    # spaces, so no separate whitespace-collapsing pass is needed.
    tokens = re.sub(r'[^0-9a-z]', ' ', text.lower()).split()
    return " ".join(token for token in tokens if token not in stop_words)

# Store the normalized form of every message in a new 'clean_text' column.
df['clean_text'] = df['messages'].map(clean_text)


# Split the dataset into features (X) and labels (y)

In [None]:
# Features are the cleaned message text; the target is the label column.
X, y = df['clean_text'], df['Label']

# Define the classification pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

# Three chained steps: token counts -> TF-IDF weighting -> logistic regression.
# All estimators are created with their default settings.
pipeline_model = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])


# Train-test split and model training



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the messages for evaluation. The split is shuffled,
# stratified on the labels, and seeded (random_state=42) so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Fit the whole pipeline on the training portion only.
pipeline_model.fit(x_train, y_train)


# Print classification report

In [None]:
from sklearn.metrics import classification_report


# Predict labels for the held-out messages and summarise per-class
# precision / recall / F1 against the true labels.
predictions = pipeline_model.predict(x_test)
report = classification_report(y_true=y_test, y_pred=predictions)
print("Test Set Classification Report:\n", report)


Test Set Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



# Providing a SPAM message

In [None]:
new_data = ["you just got 10000 get by clicking here"]

# Run the sample through the same cleaning step that was applied to the
# training messages before handing it to the fitted pipeline.
new_data_cleaned = list(map(clean_text, new_data))

# Predict one label per input message and show the result.
predictions = pipeline_model.predict(new_data_cleaned)
print("Model Prediction:", predictions)


Model Prediction: ['spam']


# Providing a HAM message

In [None]:
new_data = ["Hey, How are you ?"]

# Run the sample through the same cleaning step that was applied to the
# training messages before handing it to the fitted pipeline.
new_data_cleaned = list(map(clean_text, new_data))

# Predict one label per input message and show the result.
predictions = pipeline_model.predict(new_data_cleaned)
print("Model Prediction:", predictions)
