In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv('mail_data.csv')

In [3]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [5]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data=df.where((pd.notnull(df)),'')  
# df.where(condition, other):
#The where() function in Pandas allows you to replace values in a DataFrame where a condition is True.

#In this case, it's being used to replace the values in the original DataFrame
#  df with an empty string ('') wherever the condition (pd.notnull(df)) is False.
#  In other words, it replaces null values with empty strings.

In [7]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.info

<bound method DataFrame.info of      Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
5        spam  FreeMsg Hey there darling it's been 3 week's n...
6         ham  Even my brother is not like to speak with me. ...
7         ham  As per your request 'Melle Melle (Oru Minnamin...
8        spam  WINNER!! As a valued network customer you have...
9        spam  Had your mobile 11 months or more? U R entitle...
10        ham  I'm gonna be home soon and i don't want to tal...
11       spam  SIX chances to win CASH! From 100 to 20,000 po...
12       spam  URGENT! You have won a 1 week FREE membership ...
13        ham  I've been searching for the right words to 

In [9]:
data.shape

(5572, 2)

In [10]:
data.loc[data['Category']=='spam','Category',]=0 
data.loc[data['Category']=='ham','Category',]=1
#data.loc[]: This is used to access a group of rows and columns in a DataFrame.
#  In this case, it's accessing specific rows and the 'Category' column.
#  these two lines of code are converting categorical labels ('spam' and 'ham')
#  in the 'Category' column to numerical labels (0 and 1).
#  This kind of transformation is common in machine learning when you want to represent
#  categorical data in a format that can be used by algorithms.

In [11]:
x=data['Message']
y=data['Category']

In [12]:
print(x)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [13]:
print(y)

0       1
1       1
2       0
3       1
4       1
5       0
6       1
7       1
8       0
9       0
10      1
11      0
12      0
13      1
14      1
15      0
16      1
17      1
18      1
19      0
20      1
21      1
22      1
23      1
24      1
25      1
26      1
27      1
28      1
29      1
30      1
31      1
32      1
33      1
34      0
35      1
36      1
37      1
38      1
39      1
40      1
41      1
42      0
43      1
44      1
45      1
46      1
47      1
48      1
49      1
50      1
51      1
52      1
53      1
54      0
55      1
56      0
57      1
58      1
59      1
60      1
61      1
62      1
63      1
64      1
65      0
66      1
67      0
68      0
69      1
70      1
71      1
72      1
73      1
74      1
75      1
76      1
77      1
78      1
79      1
80      1
81      1
82      1
83      1
84      1
85      1
86      1
87      1
88      1
89      1
90      1
91      1
92      1
93      0
94      1
95      0
96      1
97      1
98      1
99      1


In [14]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=3)

In [15]:
print(x.shape)
print(x_train.shape)
print(x_test.shape)

(5572,)
(4457,)
(1115,)


In [16]:
print(y.shape)
print(y_train.shape)
print(y_test.shape)

(5572,)
(4457,)
(1115,)


In [17]:
feature_extraction=TfidfVectorizer(min_df = 1,stop_words = 'english',lowercase =True)

x_train_features=feature_extraction.fit_transform(x_train)
x_test_features=feature_extraction.transform(x_test)

y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [18]:
print(x_train)

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
2413    I don't know u and u don't know me. Send CHAT ...
4539     Dare i ask... Any luck with sorting out the car?
3000    Oh, then your phone phoned me but it disconnected
2433    Really dun bluff me leh... U sleep early too. ...
2403    Oh oh... Wasted... Den muz chiong on sat n sun...
5151    No problem with the renewal. I.ll do it right ...
4294    You best watch what you say cause I get drunk ...
4423              MMM ... Fuck .... Merry Christmas to me
4235    Now only i reached home. . . I am very tired n...
2577                 In sch but neva mind u eat 1st lor..
1361    Yo dude guess who just got arrested the other day
840     Last chance 2 claim ur £150 worth of discount ...
4977    You ar

In [19]:
print(x_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [20]:
model=LogisticRegression()

In [21]:
model.fit(x_train_features,y_train)

In [22]:
prediction_on_training_data=model.predict(x_train_features)
accuracy_on_training_data=accuracy_score(y_train,prediction_on_training_data)

In [23]:
print('Accuracy on training data:',accuracy_on_training_data)

Accuracy on training data: 0.9670181736594121


In [24]:
prediction_on_test_data=model.predict(x_test_features)
accuracy_on_test_data=accuracy_score(y_test,prediction_on_test_data)

In [25]:
print('Accuracy on test data:',accuracy_on_test_data)

Accuracy on test data: 0.9659192825112107


In [26]:
input_your_mail=["Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]
input_data_features=feature_extraction.transform(input_your_mail)
prediction=model.predict(input_data_features)
print(prediction)
if(prediction[0]==1):
    print("Ham Mail")
else:
    print("Spam Mail")    

[0]
Spam Mail
