Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Loading the dataset

In [3]:
# Location of the labelled message corpus, relative to the notebook's working directory.
MAIL_DATA_CSV = 'data/mail_data.csv'

# Read the whole CSV into a DataFrame; the later cells refer to it as `emailSpam_data`.
emailSpam_data = pd.read_csv(MAIL_DATA_CSV)

In [4]:
# Show the first five rows to confirm the file parsed into the
# `Category` (label) and `Message` (text) columns.
emailSpam_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Summary statistics for both columns (count / unique / top / freq).
# NOTE(review): in the recorded run there are fewer unique messages than rows,
# i.e. some messages repeat. Repeated texts can end up on both sides of a
# random train/test split - confirm whether de-duplication is wanted.
emailSpam_data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Count missing values per column; both counts should be zero before the
# labels are encoded and the text is vectorised.
emailSpam_data.isnull().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Print column dtypes, non-null counts and the memory footprint of the frame.
emailSpam_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# (number of rows, number of columns) of the loaded frame.
emailSpam_data.shape

(5572, 2)

In [9]:
# Encode the text labels as integers in place: spam -> 0, ham -> 1.
# Both masks are computed up front from the current column contents; on a
# re-run of this cell they are all False, so the cell stays idempotent.
is_spam = emailSpam_data['Category'] == 'spam'
is_ham = emailSpam_data['Category'] == 'ham'

# Assign scalars rather than one-element lists: a scalar is broadcast to every
# selected row, so the assignment does not depend on how pandas matches the
# length of a list-like value against the number of selected rows.
emailSpam_data.loc[is_spam, 'Category'] = 0
emailSpam_data.loc[is_ham, 'Category'] = 1

# Fail early with a clear message if the file contains any other label; the
# column keeps its object dtype here and is cast to int later, before fitting.
assert emailSpam_data['Category'].isin([0, 1]).all(), "unexpected label in 'Category'"

In [10]:
# Split the frame into model inputs (raw message text) and targets (encoded label).
X, y = emailSpam_data['Message'], emailSpam_data['Category']

In [12]:
# Preview the input series: raw, unprocessed message strings.
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [13]:
# Preview the target series: encoded labels (spam = 0, ham = 1).
# The series is still object-typed at this point; it is cast to int before training.
y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [14]:
# Hold out 20% of the messages for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Report (inputs, targets) shapes for the full data, the training split and the
# test split, one pair per line, to confirm the split sizes add up.
for _inputs, _targets in ((X, y), (X_train, y_train), (X_test, y_test)):
    print(_inputs.shape, _targets.shape)

(5572,) (5572,)
(4457,) (4457,)
(1115,) (1115,)


In [17]:
# TF-IDF vectoriser that turns raw message text into weighted term features.
#   lowercase=True        - fold case before tokenising
#   stop_words='english'  - drop the built-in English stop-word list
#   min_df=1              - keep every term that occurs in at least one document
feature_extractor = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [27]:
# The encoded labels are still object-typed; cast both splits to integers so the
# classifier receives a numeric target.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

# Fit the vectoriser on the training messages only, then apply that same fitted
# vocabulary to the test messages, so nothing about the held-out text influences
# the features learned during training.
X_train_features = feature_extractor.fit_transform(X_train)
X_test_features = feature_extractor.transform(X_test)

In [28]:
# Inspect the TF-IDF row at position 5 of the training features. Each printed
# entry is a (row, vocabulary-column) index pair followed by that term's weight.
print(X_train_features[5])

  (np.int32(0), np.int32(5976))	0.4603016868709847
  (np.int32(0), np.int32(4467))	0.44848134154502717
  (np.int32(0), np.int32(7079))	0.3613637839078621
  (np.int32(0), np.int32(6352))	0.5052400868057315
  (np.int32(0), np.int32(5911))	0.44848134154502717


In [29]:
# Logistic-regression classifier with library-default settings, trained on the
# TF-IDF features of the training split against the integer labels.
model = LogisticRegression()
model.fit(X=X_train_features, y=y_train)

In [32]:
# Predict labels for the same features the model was trained on. The resulting
# score measures how well the model fits the training data, not how it generalises.
pred = model.predict(X_train_features)

In [33]:
# Fraction of training messages whose predicted label matches the true label,
# reported as a percentage with two decimals.
acc = accuracy_score(y_true=y_train, y_pred=pred)
print(f"Accuracy: {acc * 100:.2f}%")

Accuracy: 96.70%


In [25]:
# Score the model on the held-out split: predict from the test features, then
# compare against the true test labels and report the accuracy as a percentage.
pred_test = model.predict(X_test_features)
acc_test = accuracy_score(y_true=y_test, y_pred=pred_test)
print(f"Test Accuracy: {acc_test * 100:.2f}%")

Test Accuracy: 96.77%


In [35]:
# Classify a single ad-hoc message with the fitted vectoriser and model.
# The vectoriser expects an iterable of documents, hence the one-element list.
userInput = ["This is a test email to check the spam classifier. Please do not ignore this message."]
input_features = feature_extractor.transform(userInput)
prediction = model.predict(input_features)

# `prediction` holds one label per input message; read the single element
# explicitly instead of truth-testing the whole array.
# Labels were encoded earlier as spam -> 0, ham -> 1.
if prediction[0] == 0:
    print("The email is classified as SPAM.")
else:
    # Previously this line ran unconditionally, so HAM was reported for every
    # message (and both lines were printed for spam).
    print("The email is classified as HAM.")

The email is classified as HAM.
