In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the labelled messages from disk and preview the first five rows.
mail = pd.read_csv(filepath_or_buffer='mail_data.csv')
mail.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarize the frame: row count, column names, non-null counts and dtypes.
mail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Copy of `mail` in which every null cell is replaced by an empty string;
# cells that already hold a value are kept as they are.
maildata = mail.where(mail.notna(), other='')

In [6]:
# Preview the first rows of the null-filled copy.
maildata.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Encode the text labels as numbers in place: spam -> 0, ham -> 1.
# Both masks are taken from the original labels; a row is never both.
spam_rows = maildata['Category'] == 'spam'
ham_rows = maildata['Category'] == 'ham'
maildata.loc[spam_rows, 'Category'] = 0
maildata.loc[ham_rows, 'Category'] = 1

In [8]:
# Separate the message text (model input) from the encoded label (target).
x, y = maildata['Message'], maildata['Category']

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [12]:
# Feature extraction
# Turn the raw message text into TF-IDF feature vectors that can be fed to
# the logistic regression model.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
xtrain_features = feature_extraction.fit_transform(xtrain)
xtest_features = feature_extraction.transform(xtest)

# Cast the labels to integers before they are handed to the classifier.
ytrain, ytest = ytrain.astype('int'), ytest.astype('int')

In [13]:
# Print the sparse TF-IDF training matrix; each output line is a
# (row, column) position followed by the weight stored there.
print(xtrain_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [14]:
# Create the classifier; no arguments are passed, so scikit-learn's defaults apply.
model = LogisticRegression()

In [16]:
# Train the classifier on the TF-IDF training features and their integer labels.
model.fit(xtrain_features,ytrain)

In [17]:
# Score the fitted model on the same data it was trained on.
trainpred = model.predict(xtrain_features)
trainacc = accuracy_score(y_true=ytrain, y_pred=trainpred)

In [18]:
# Report the fraction of training messages the model labels correctly.
print('Accuracy on training data :',trainacc)

Accuracy on training data : 0.9670181736594121


In [21]:
# Score the fitted model on the held-out test messages and report the result.
testpred = model.predict(xtest_features)
testacc = accuracy_score(y_true=ytest, y_pred=testpred)
print('Accuracy on testing data :', testacc)

Accuracy on testing data : 0.9659192825112107


In [22]:
# Classify one hand-written message with the fitted vectorizer and model.
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Reuse the fitted vectorizer so the message maps onto the training vocabulary.
inputdatafeatures = feature_extraction.transform(input_mail)
prediction = model.predict(inputdatafeatures)

# Label 1 was assigned to ham and 0 to spam when the categories were encoded.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Ham mail
