In [36]:
# Import dependencies

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data Collection and Preprocessing

In [2]:
# Location of the raw mail dataset, named once so it is easy to repoint.
MAIL_DATA_PATH = '/content/mail_data.csv'

# Read the CSV into a DataFrame; the later preprocessing cells derive from `mails`.
mails = pd.read_csv(MAIL_DATA_PATH)

In [3]:
# Preview the first five rows to confirm the file loaded with the expected columns.
mails.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the raw frame.
mails.shape

(5572, 2)

In [5]:
# Swap every missing value for an empty string so the text column never contains NaN.
mail_data = mails.fillna('')

In [6]:
# Preview the first five rows of the frame after null replacement.
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# (rows, columns) after null replacement.
mail_data.shape

(5572, 2)

In [8]:
# Label encoding: 'spam' -> 0 and 'ham' -> 1, written back into the Category column.
# Rows whose Category matches neither label are left untouched.
for label, code in (('spam', 0), ('ham', 1)):
    mail_data.loc[mail_data['Category'] == label, 'Category'] = code

Spam will be represented as 0 and ham will be 1

In [9]:
# Display the frame to check that Category now holds 0/1 instead of 'spam'/'ham'.
mail_data

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


# Separating the data into texts and labels

In [10]:
# Model input (X) is the message text; the target (Y) is the encoded category.
X, Y = mail_data['Message'], mail_data['Category']

In [11]:
# Show the texts followed by the labels, one Series after the other.
print(X, Y, sep='\n')

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Separating the data into train and test sets

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [13]:
# Sizes of the training and test partitions, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(4457,)
(1115,)


# Feature Extraction

In [19]:
# TF-IDF vectorizer: lowercases the text, drops English stop words, and keeps
# every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit on the training messages only, then reuse the fitted vectorizer on the
# test messages so both share the same feature columns.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [16]:
# Cast the Y_train and Y_test labels to integers
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

In [20]:
# Print the sparse training matrix: each line is a (row, column) index followed by the stored weight.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

# Training the Logistic Regression Model

In [21]:
# Logistic regression classifier with scikit-learn's default settings (no arguments passed).
model = LogisticRegression()

In [22]:
# Train the logistic regression model on the TF-IDF training features and their labels
model.fit(X_train_features,Y_train)

# Evaluating the trained model

In [24]:
# Prediction on training data

# Predict labels for the same samples the model was fitted on, then score them
# against the true training labels.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

print('accuracy_on_training_data : ', accuracy_on_training_data)

accuracy_on_training_data :  0.9670181736594121


In [25]:
# Prediction on test data

# Predict labels for the held-out test features and score them against the
# true test labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

print('accuracy_on_test_data : ', accuracy_on_test_data)

accuracy_on_training_data :  0.9659192825112107


We compare the accuracy scores on the training and test data to check that the model is not overfitting: the two scores are close, so there is no sign of overfitting.

In [27]:
# Building a predictive system

# A single message, wrapped in a list because it is passed to the vectorizer as a collection.
input_mail = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]

# Turn the text into a feature vector using the already-fitted vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Predict and show the raw label array
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 means ham; anything else is reported as spam
print('The given mail is a correct mail or HAM' if prediction[0] == 1 else 'This mail is spam')

[1]
The given mail is a correct mail or HAM


In [33]:
def predict_email_type(email, model, feature_extraction):
    """
    Classify a single email as ham or spam.

    Parameters:
    email (str): Text of the email to classify.
    model: Fitted classifier; its predict() is called on the feature vector.
    feature_extraction: Fitted vectorizer; its transform() is called on [email].

    Returns:
    str: 'HAM' when the first predicted label equals 1, 'SPAM' otherwise.
    """
    # The vectorizer is given a one-element list holding the email.
    features = feature_extraction.transform([email])

    # Only one document was passed in, so only the first predicted label matters.
    label = model.predict(features)[0]

    return 'HAM' if label == 1 else 'SPAM'

In [35]:
# Try the helper on a sample message; it returns the string 'HAM' or 'SPAM'.
prediction = predict_email_type("I have to take exam with march 3", model, feature_extraction)
print(prediction)

HAM
