In [2]:
# Importing Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # Converting text data to numerical values
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Data Collection and Preprocessing
# Load the raw dataset from a CSV file located in the working directory.
raw_mail_data = pd.read_csv("mail_data.csv")
# Preview only the first rows rather than rendering the entire frame as cell output.
raw_mail_data.head()

In [10]:
# Replace every missing value with an empty string
mail_data = raw_mail_data.fillna('')

In [12]:
# Display the (rows, columns) dimensions of the cleaned frame
mail_data.shape

(5572, 2)

In [13]:
# Label encoding: spam mail becomes 0, ham (non-spam) mail becomes 1.
# Both row masks are computed before any label is overwritten.
spam_rows = mail_data['Category'] == 'spam'
ham_rows = mail_data['Category'] == 'ham'
mail_data.loc[spam_rows, 'Category'] = 0
mail_data.loc[ham_rows, 'Category'] = 1


In [15]:
# Separate the message text (X) from the encoded labels (Y)
X, Y = mail_data['Message'], mail_data['Category']

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)
# Report the size of the full set, the training part and the test part, in that order
for subset in (X, X_train, X_test):
    print(subset.shape)



(5572,)
(4457,)
(1115,)


In [21]:
# Feature Extraction
# Turn the raw message text into TF-IDF feature vectors that can be used as
# input to the logistic regression.
# `lowercase` is a boolean flag, so pass True rather than the string 'True'.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training messages only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and only transform the test messages with it, so nothing from the
# test set influences the fitted vocabulary or weights.
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [26]:
# Training the Model
# Fit a logistic regression classifier on the training feature vectors and their labels
model = LogisticRegression()
model.fit(X=X_train_features, y=Y_train)

LogisticRegression()

In [30]:
# Evaluating the trained model
# Accuracy on the same data the model was fitted on
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9670181736594121


In [38]:
# Evaluating the trained model
# Accuracy on the held-out test data
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9659192825112107


In [47]:
# Building a predicting system
input_mail = ["You gotfree tcikets to the beach. Check the family whatsapp!"]
# Vectorize the message with the already-fitted TF-IDF vectorizer
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
# Labels were cast to integers before training (0 = spam, 1 = ham), so the
# prediction must be compared against the integer 1. Comparing against the
# string '1' never matches and would report every message as 'Spam'.
if prediction[0] == 1:
    print('Not Spam')
else:
    print('Spam')

Spam
