<a href="https://colab.research.google.com/github/Nsimaar99/Kaggle-Project/blob/master/Email.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the dependencies

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

Data collection and pre-processing

In [22]:
# Read the spam/ham email CSV into a DataFrame (the path is absolute, under /content)
DATASET_PATH = '/content/spam_ham_dataset.csv'
raw_email_data = pd.read_csv(DATASET_PATH)

In [23]:
# Preview the first five rows of the dataset
raw_email_data.head(n=5)

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [24]:
# Preview the last five rows of the dataset
raw_email_data.tail(n=5)

Unnamed: 0,label,text
5166,ham,Subject: put the 10 on the ft\r\nthe transport...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...
5169,ham,Subject: industrial worksheets for august 2000...
5170,spam,Subject: important online banking alert\r\ndea...


In [25]:
# Dimensions of the dataset as a (rows, columns) tuple
raw_email_data.shape

(5171, 2)

In [26]:
# Count the missing values in each column
raw_email_data.isna().sum()

Unnamed: 0,0
label,0
text,0


In [27]:
# Summary statistics per column; for this frame describe() reports
# count, unique, top (most frequent value) and freq (its number of occurrences)
raw_email_data.describe()

Unnamed: 0,label,text
count,5171,5171
unique,2,4993
top,ham,Subject: calpine daily gas nomination\r\n>\r\n...
freq,3672,20


In [28]:
# Label encoding, applied in place: spam -> 0, ham -> 1
# (rows carrying any other label value are left untouched)
spam_rows = raw_email_data['label'] == 'spam'
raw_email_data.loc[spam_rows, 'label'] = 0
ham_rows = raw_email_data['label'] == 'ham'
raw_email_data.loc[ham_rows, 'label'] = 1

In [29]:
# Show the frame after label encoding (spam rows now hold 0, ham rows hold 1)
print(raw_email_data)

     label                                               text
0        1  Subject: enron methanol ; meter # : 988291\r\n...
1        1  Subject: hpl nom for january 9 , 2001\r\n( see...
2        1  Subject: neon retreat\r\nho ho ho , we ' re ar...
3        0  Subject: photoshop , windows , office . cheap ...
4        1  Subject: re : indian springs\r\nthis deal is t...
...    ...                                                ...
5166     1  Subject: put the 10 on the ft\r\nthe transport...
5167     1  Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168     1  Subject: calpine daily gas nomination\r\n>\r\n...
5169     1  Subject: industrial worksheets for august 2000...
5170     0  Subject: important online banking alert\r\ndea...

[5171 rows x 2 columns]


In [30]:
# Separate the dataset into the feature (email text) and the target (encoded label)
X, Y = raw_email_data['text'], raw_email_data['label']

In [33]:
# Hold out 20% of the emails for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [34]:
# Sanity-check the split sizes: full feature set, training portion, test portion
print(X.shape, X_train.shape, X_test.shape)

(5171,) (4136,) (1035,)


Feature Extraction

In [51]:
# Cast the encoded labels to integers so the classifier receives a numeric target
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

# TF-IDF vectorizer: lower-cases the text and drops English stop words
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit the vectorizer on the training emails only, then reuse that fitted
# vectorizer to transform the test emails into the same feature space
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Model Training with Logistic Regression

In [52]:
# Train a logistic regression classifier (constructed with no arguments, i.e. library
# defaults) on the TF-IDF features of the training emails and their integer labels
model = LogisticRegression()
model.fit(X_train_features, Y_train)

Model Evaluation

In [53]:
# Accuracy on the same emails the model was fitted on
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9968568665377177


In [54]:
# Accuracy on the held-out test emails
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9806763285024155


In [59]:
# Define the prediction function for spam detection
def predict_spam(email_text, model, vectorizer):
    """
    Classify a single email and return a human-readable verdict.

    Args:
        email_text (str): The email content to classify.
        model (LogisticRegression): The trained classification model. Its
            ``predict`` output is interpreted with label 0 meaning spam and
            any other label meaning not spam.
        vectorizer (TfidfVectorizer): The fitted vectorizer used to transform
            the email content into the features the model was trained on.

    Returns:
        str: 'The email is spam' if the predicted label is 0,
        otherwise 'The email is not spam'.
    """
    # The vectorizer is given a one-element list, so exactly one prediction comes back
    email_transformed = vectorizer.transform([email_text])

    # Take the single predicted label out of the returned sequence
    predicted_label = model.predict(email_transformed)[0]

    # Label 0 is the spam class; everything else is reported as not spam
    if predicted_label == 0:
        return 'The email is spam'
    return 'The email is not spam'

In [60]:
# Sample email content to classify.
# NOTE(review): this looks like the same message as dataset row 2 (labelled ham),
# so it may have been part of the training split — confirm before treating the
# prediction as evidence of how the model handles unseen email.
input_email = """Subject: neon retreat
ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time !
i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute .
on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about .
i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a potential alternative for how we can get together on that weekend , and then you can let me know which you prefer .
the first option would be to have a retreat similar to what we ' ve done the past several years . this year we could go to the heartland country inn ( www . . com ) outside of brenham . it ' s a nice place , where we ' d have a 13 - bedroom and a 5 - bedroom house side by side . it ' s in the country , real relaxing , but also close to brenham and only about one hour and 15 minutes from here . we can golf , shop in the antique and craft stores in brenham , eat dinner together at the ranch , and spend time with each other . we ' d meet on saturday , and then return on sunday morning , just like what we ' ve done in the past .
the second option would be to stay here in houston , have dinner together at a nice restaurant , and then have dessert and a time for visiting and recharging at one of our homes on that saturday evening . this might be easier , but the trade off would be that we wouldn ' t have as much time together . i ' ll let you decide .
email me back with what would be your preference , and of course if you ' re available on that weekend . the democratic process will prevail - - majority vote will rule ! let me hear from you as soon as possible , preferably by the end of the weekend . and if the vote doesn ' t go your way , no complaining allowed ( like i tend to do ! )
have a great weekend , great golf , great fishing , great shopping , or whatever makes you happy !
bobby"""

# Classify the sample email with the trained model and the fitted vectorizer
result = predict_spam(input_email, model, feature_extraction)

# Output the result
print(result)  # Prints either 'The email is spam' or 'The email is not spam'



The email is not spam
