<a href="https://colab.research.google.com/github/Ritishpatro/email_spam-detector/blob/main/email_spam_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
df = pd.read_csv('mail_data.csv', encoding='latin-1')

In [8]:
df = df.rename(columns={'v1': 'label', 'v2': 'message'})
print(df.head())


  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [10]:
# Check for any missing values
print(df.isnull().sum())

# Get the count of spam vs. ham messages
print(df['Category'].value_counts())

Category    0
Message     0
dtype: int64
Category
ham     4825
spam     747
Name: count, dtype: int64


In [15]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re

ps = PorterStemmer()

def transform_text(text):
    # Convert to lowercase
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    y = []
    for word in text.split():
        if word not in stopwords.words('english'):
            y.append(ps.stem(word))

    return " ".join(y)
df['transformed_message'] = df['Message'].apply(transform_text)
print(df.head())

  Category                                    Message      transformed_message
0      ham  Hello, I'd like to get in touch with you.  hello id like get touch
1     spam             WINNER!! You have won a prize!             winner prize
2      ham                     Hi there, how are you?                       hi


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v = TfidfVectorizer()

X = tfidf_v.fit_transform(df['transformed_message']).toarray()
y = df['Category'].apply(lambda x: 1 if x == 'spam' else 0).values

print("Shape of X (features):", X.shape)
print("Shape of y (labels):", y.shape)


Shape of X (features): (3, 8)
Shape of y (labels): (3,)


In [14]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of training features (X_train):", X_train.shape)
print("Shape of testing features (X_test):", X_test.shape)
print("Shape of training labels (y_train):", y_train.shape)
print("Shape of testing labels (y_test):", y_test.shape)

Shape of training features (X_train): (2, 8)
Shape of testing features (X_test): (1, 8)
Shape of training labels (y_train): (2,)
Shape of testing labels (y_test): (1,)


In [18]:

model = MultinomialNB()

model.fit(X_train, y_train)


In [19]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model: {accuracy:.4f}")

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy of the model: 1.0000
Confusion Matrix:
[[1]]




In [20]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.read_csv('mail_data.csv')

ps = PorterStemmer()
def transform_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    y = []
    for word in text.split():
        if word not in stopwords.words('english'):
            y.append(ps.stem(word))
    return " ".join(y)

# Apply the preprocessing
df['transformed_message'] = df['Message'].apply(transform_text)

# Feature extraction from Step 3
tfidf_v = TfidfVectorizer()
X = tfidf_v.fit_transform(df['transformed_message']).toarray()
y = df['Category'].apply(lambda x: 1 if x == 'spam' else 0).values

# Model training from Step 4
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

# Now, we create the final prediction function
def predict_message(message):

    # 1. Preprocess the message
    transformed_message = transform_text(message)

    # 2. Vectorize the message using the same TF-IDF vectorizer
    vector_input = tfidf_v.transform([transformed_message])

    # 3. Make the prediction
    result = model.predict(vector_input)[0]

    # 4. Return the result as a human-readable string
    if result == 1:
        return "spam"
    else:
        return "ham"

# Example usage with a new message
test_message_1 = "WINNER!! You have won a free entry to our exclusive competition. Claim now!"
prediction_1 = predict_message(test_message_1)
print(f"The message '{test_message_1}' is predicted to be: {prediction_1}")

test_message_2 = "Hey, just wanted to check if you are free tomorrow for coffee."
prediction_2 = predict_message(test_message_2)
print(f"The message '{test_message_2}' is predicted to be: {prediction_2}")


The message 'WINNER!! You have won a free entry to our exclusive competition. Claim now!' is predicted to be: spam
The message 'Hey, just wanted to check if you are free tomorrow for coffee.' is predicted to be: ham


In [22]:
# Test your own message
my_new_message = "Your account has been suspended. Please click on the link to verify your identity."
my_prediction = predict_message(my_new_message)
print(f"The message '{my_new_message}' is predicted to be: {my_prediction}")

The message 'Your account has been suspended. Please click on the link to verify your identity.' is predicted to be: ham
