# Email spam classifier

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled messages; the path is relative to the notebook's working directory.
df = pd.read_csv('mail_data.csv')

In [3]:
# Preview the first five rows to check the Category / Message layout.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [6]:
# Swap every missing cell for an empty string so later text processing never sees NaN.
not_missing = pd.notnull(df)
df = df.where(not_missing, '')

In [7]:
# Encode the labels numerically: ham -> 1, spam -> 0.
# The column keeps its object dtype here; the labels are cast to int after the split.
is_ham = df['Category'] == 'ham'
is_spam = df['Category'] == 'spam'
df.loc[is_ham, 'Category'] = 1
df.loc[is_spam, 'Category'] = 0

In [8]:
# Separate the raw message text (features) from the encoded label (target).
X, Y = df['Message'], df['Category']

In [9]:
# Peek at the first five message texts.
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [10]:
# Peek at the first five encoded labels (1 = ham, 0 = spam).
Y.head()

0    1
1    1
2    0
3    1
4    1
Name: Category, dtype: object

In [11]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# Sanity-check the split sizes, in the order: X_train, X_test, Y_train, Y_test.
for split in (X_train, X_test, Y_train, Y_test):
    print(split.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [15]:
# TF-IDF vectoriser: English stop words are dropped, and min_df=1 keeps every
# remaining term that occurs in at least one document.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english')

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vocabulary to encode the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are still object-typed at this point; cast them to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')


In [19]:
# Print the sparse TF-IDF matrix: each output row is (document index, vocabulary index)
# followed by that term's weight in the document.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.5092725360510079
  (2, 3156)	0.41072393183126976
  (2, 2404)	0.4528771107060674
  (2, 6601)	0.6056811524587516
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (

In [20]:
# Logistic-regression classifier created with the library's default settings.
model = LogisticRegression()

In [21]:
# Fit the classifier on the TF-IDF training features and the integer labels.
model.fit(X_train_features,Y_train)

In [23]:
# Score the model on the same messages it was fitted on (training accuracy).
prediction_on_train_set = model.predict(X_train_features)
accuracy_on_train_set = accuracy_score(Y_train, prediction_on_train_set)

In [25]:
# Report the fraction of training messages classified correctly.
print('accuracy on train data: ', accuracy_on_train_set)

accuracy on train data:  0.9670181736594121


In [26]:
prediction_on_test_set=model.predict(X_test_features)
accuracy_on_train_set=accuracy_score(Y_test,prediction_on_test_set)

In [29]:
print('accuracy on test data: ',accuracy_on_train_set)

accuracy on test data:  0.9659192825112107


# Email spam detection generic report

In [54]:
# Sample promotional email to classify, held in a one-element list that is
# passed to the fitted vectoriser below.
input_mail=['''

Eat Well Without Breaking the Bank! Claim Your 18 Free Meals, plus 3 Special Gifts from HelloFresh Now!
MAXIMUM FLAVOR. MINIMUM STRESS.
18 Free Meals
First Box Ships Free + 3 Surprise Gifts

GET COOKING


18 Free Meals
First Box Ships Free + 3 Surprise Gifts

REDEEM SAVINGS

Offer only valid for new customers with qualifying auto-renewing subscription purchase. '18 Free Meals + 3 Surprise Gifts + Free Shipping on First Box' offer is based on a total discount applied over a 9-week period for a 2-person, 4-recipe subscription. Shipping fee applies on all deliveries after the first box. Discount may vary for other meal plans and sizes. '3 surprise gift' offer available after the purchase of your 4th, 6th and 8th boxes (total approx. value of 3 surprise gifts while supplies last). Not valid on premiums, meal upgrades, add-ons, taxes or shipping fees. May not be combined with gift cards or any other promotion. No cash value. Void outside the U.S. and where prohibited. Offer cannot be sold or otherwise bartered. HelloFresh has the right to end or modify any offer at any time. Additional restrictions may apply. See terms and conditions for more details.

You are receiving this email from a third party marketing partner to HelloFresh, not from HelloFresh directly. If you'd prefer not to receive advertisements from Hellofresh Affiliate Partners, you can unsubscribe here or write to: Hellofresh Affiliate Department - 28 Liberty Street, New York, NY''']
# Encode the mail with the already-fitted TF-IDF vocabulary (transform only, no refit),
# then ask the trained model for its label.
input_data_features=feature_extraction.transform(input_mail)
prediction=model.predict(input_data_features)

In [55]:
# Labels were encoded as ham -> 1 and spam -> 0 before training.
# `prediction` is an array with one entry per input mail. Testing the whole
# array with `if prediction == 1` only works while it has exactly one element
# and raises ValueError otherwise, so report each entry individually.
for predicted_label in np.atleast_1d(prediction):
    if predicted_label == 1:
        print("It's not a spam mail.")
    else:
        print("It's a spam mail.")

It's a spam mail.
