In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices raised by pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [2]:
# Load the labelled messages. The path is relative, so the CSV must sit in
# the notebook's working directory.
raw_mail_data = pd.read_csv("mail_data.csv")

In [3]:
# Preview the first rows to confirm the Category / Message columns loaded.
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,Can we meet for coffee on Friday? Let me know....
1,ham,Order #295753 shipped. Arrives Friday. Total: ...
2,ham,Ok which your another number
3,spam,Student loan #463875 forgiveness $91412 approv...
4,ham,Order #470233 shipped. Arrives Friday. Total: ...


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [5]:
# Per-column count of missing values in the raw data.
raw_mail_data.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [6]:
# Working copy of the data: non-null cells are kept as-is and null cells are
# replaced by the empty string. `where` returns a new frame, so
# raw_mail_data itself is not modified by this step.
df = raw_mail_data.where(raw_mail_data.notnull(), other='')

In [7]:
# Re-check null counts on the working frame after the fill step.
df.isnull().sum()

Unnamed: 0,0
Category,0
Message,0


In [8]:
# (rows, columns) of the working frame.
df.shape

(13656, 2)

In [9]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# The source column is read from raw_mail_data (never modified) rather than
# from df, so re-running this cell is idempotent. Mapping df['Category'] onto
# itself a second time would turn the already-encoded 0/1 values into NaN.
label_codes = raw_mail_data['Category'].map({'spam': 0, 'ham': 1})

# Series.map yields NaN for any value that is not a key of the dict (including
# missing labels). Fail here with a clear message instead of letting NaN
# labels flow into the train/test split.
unmapped = label_codes.isna()
if unmapped.any():
    bad_values = sorted(raw_mail_data.loc[unmapped, 'Category'].astype(str).unique())
    raise ValueError(f"Unrecognised Category values: {bad_values}")

df['Category'] = label_codes.astype('int64')

In [10]:
# Preview the frame after label encoding (Category should now show 0/1).
df.head()

Unnamed: 0,Category,Message
0,1,Can we meet for coffee on Friday? Let me know....
1,1,Order #295753 shipped. Arrives Friday. Total: ...
2,1,Ok which your another number
3,0,Student loan #463875 forgiveness $91412 approv...
4,1,Order #470233 shipped. Arrives Friday. Total: ...


In [11]:
# In-place relabelling via boolean masks: spam -> 0, then ham -> 1.
# Category was already converted to 0/1 by the earlier `.map` step, so at this
# point neither mask selects any row and the frame is left unchanged.
df.loc[df['Category'].eq('spam'), 'Category'] = 0
df.loc[df['Category'].eq('ham'), 'Category'] = 1

In [12]:
# Preview again after the `.loc` relabelling step.
df.head()

Unnamed: 0,Category,Message
0,1,Can we meet for coffee on Friday? Let me know....
1,1,Order #295753 shipped. Arrives Friday. Total: ...
2,1,Ok which your another number
3,0,Student loan #463875 forgiveness $91412 approv...
4,1,Order #470233 shipped. Arrives Friday. Total: ...


In [13]:
# Split the frame into model input (message text) and target (0/1 label).
X, Y = df['Message'], df['Category']

In [14]:
# Display the message Series (pandas truncates the middle rows).
X

Unnamed: 0,Message
0,Can we meet for coffee on Friday? Let me know....
1,Order #295753 shipped. Arrives Friday. Total: ...
2,Ok which your another number
3,Student loan #463875 forgiveness $91412 approv...
4,Order #470233 shipped. Arrives Friday. Total: ...
...,...
13651,Why do you ask princess?
13652,"Bitcoin investment #182318: Deposit 0.5 BTC, w..."
13653,Student loan #791960 forgiveness $85578 approv...
13654,Just sent it. So what type of food do you like?


In [15]:
# Display the label Series (pandas truncates the middle rows).
Y

Unnamed: 0,Category
0,1
1,1
2,1
3,0
4,1
...,...
13651,1
13652,0
13653,0
13654,1


In [16]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Number of training messages.
X_train.shape

(10924,)

In [18]:
# Number of training labels; should match X_train.shape.
y_train.shape

(10924,)

In [19]:
# Number of held-out messages.
X_test.shape

(2732,)

In [20]:
# TF-IDF text features with English stop words removed. binary=True clamps
# each term's count to 0/1 before the IDF weighting is applied.
feature_extraction = TfidfVectorizer(
    stop_words="english",
    binary=True,
    min_df=1,
)

# The vocabulary and IDF weights are learned from the training split only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and applied unchanged to the held-out split (transform, not fit).
X_test_features = feature_extraction.transform(X_test)

In [21]:
# Column dtypes and non-null counts of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13656 entries, 0 to 13655
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  13656 non-null  int64 
 1   Message   13656 non-null  object
dtypes: int64(1), object(1)
memory usage: 213.5+ KB


In [22]:
# Cast both label vectors to an integer dtype before fitting the classifier.
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [23]:
# Display the training messages; the index shows the shuffled original row ids.
X_train

Unnamed: 0,Message
259,Order #753968 shipped. Arrives Tuesday. Total:...
443,Foreclosure #873172: Pay $79299 by Friday or p...
8384,Order #303844 shipped. Arrives Friday. Total: ...
10219,Student loan #830514 forgiveness $35102 approv...
8971,Did you stitch his trouser
...,...
5191,U r too much close to my heart. If u go away i...
13418,Nothin comes to my mind. Ü help me buy hanger ...
5390,:)
860,Your board is working fine. The issue of overh...


In [24]:
# Text dump of the sparse training matrix: (row, vocabulary index) -> TF-IDF weight.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 84830 stored elements and shape (10924, 17154)>
  Coords	Values
  (0, 14518)	0.21262052134312648
  (0, 7702)	0.6003209812822936
  (0, 15602)	0.2137041199346725
  (0, 10795)	0.21383284619364426
  (0, 16536)	0.18988250020565275
  (0, 16463)	0.21357565827251415
  (0, 6440)	0.5749808000803358
  (0, 16473)	0.21344746012164265
  (0, 14485)	0.21255736158369767
  (1, 12579)	0.26156109504257785
  (1, 9056)	0.5990600790731846
  (1, 14647)	0.21554235328537494
  (1, 8119)	0.5990600790731846
  (1, 12636)	0.17520891106085298
  (1, 14992)	0.26128329810186385
  (1, 10852)	0.26156109504257785
  (2, 14518)	0.2207100540900703
  (2, 15602)	0.22183488015220798
  (2, 10795)	0.22196850403479934
  (2, 16463)	0.22170153093349493
  (2, 16473)	0.22156845525183277
  (2, 14485)	0.22064449130322458
  (2, 12636)	0.18225786212628478
  (2, 2568)	0.6231612800741654
  (2, 3703)	0.5332263039029989
  :	:
  (10920, 14056)	0.29465481741404387
  (10920, 13024)	0.2

In [25]:
# Text dump of the sparse held-out matrix: (row, vocabulary index) -> TF-IDF weight.
print(X_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 17763 stored elements and shape (2732, 17154)>
  Coords	Values
  (0, 10974)	0.3261968235768648
  (0, 11233)	0.3261968235768648
  (0, 11309)	0.28259689452479786
  (0, 12088)	0.22402352863864525
  (0, 12411)	0.3537053160694184
  (0, 12577)	0.38121380856197207
  (0, 13082)	0.3173410669955137
  (0, 13471)	0.18581306921182214
  (0, 13551)	0.12684991422773248
  (0, 13670)	0.1314508695498171
  (0, 13808)	0.21346613878850118
  (0, 15486)	0.31355855234473895
  (0, 16306)	0.2724398290436114
  (1, 8412)	0.7366210797938619
  (1, 10795)	0.26238260359000865
  (1, 12636)	0.21544179250740095
  (1, 14485)	0.26081752610657233
  (1, 14518)	0.2608950259968653
  (1, 15602)	0.262224650630112
  (1, 16463)	0.2620670223425098
  (1, 16473)	0.26190971739521174
  (2, 10678)	0.4796934729279806
  (2, 11344)	0.4933306670690381
  (2, 13551)	0.23283123967304586
  (2, 13670)	0.2412762286810641
  :	:
  (2728, 16007)	0.31679898264675016
  (2728, 16432)	0.24243

In [26]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library-default hyperparameters, trained on the
# TF-IDF features of the training split.
model = LogisticRegression()
# Left as the final expression so the notebook displays the fitted estimator.
model.fit(X_train_features, y_train)

In [27]:
# Score the model on the data it was fitted on. This is an optimistic figure,
# useful mainly for comparison with the held-out accuracy.
prediction_train_data = model.predict(X_train_features)
accuracy_train_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_train_data,
)

In [28]:
# Report training-set accuracy (message spelling corrected from "Accuarcy").
print("Accuracy on train data: ", accuracy_train_data)

Accuarcy on train data:  0.9873672647381911


In [29]:
# Score the model on the held-out split, which was not used to fit the
# vectoriser or the classifier.
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_test_data,
)

In [30]:
# Report held-out accuracy (message spelling corrected from "Accuarcy").
print("Accuracy on test data: ", accuracy_test_data)

Accuarcy on test data:  0.986822840409956


In [34]:
input_user_mail = ["Dear User We are thrilled to inform you that your email ID has been randomly selected to receive a limited-time reward worth $5000. To claim your prize, please click on the secure link below and fill out the verification form immediately:👉 Claim Your Reward Now This offer is valid only for 24 hours, so dont miss out! Confirm your details today to avoid disqualification. Sincerely,Rewards Department Global Sweepstakes Organization support@globalsweepstakes-offers.com"]

# Vectorise the sample mail with the already-fitted TF-IDF transformer
# (transform only, so the training vocabulary is reused unchanged).
input_data_features = feature_extraction.transform(input_user_mail)

prediction = model.predict(input_data_features)

# Label encoding used throughout the notebook: 1 -> ham, otherwise spam.
print("This is a ham mail" if prediction[0] == 1 else "This is a spam mail")

This is a spam mail


In [32]:
import pickle

# Persist the fitted classifier and its matching vectoriser. Both are needed
# at inference time, because the model expects features produced by this
# exact vocabulary.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling raises; the bare open(...) calls left the handles dangling.
with open("logistic_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("feature_extraction.pkl", "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)