In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score

In [35]:
# Load the mail dataset from "mail_data.csv" (path is relative to the working directory) into a DataFrame
ds = pd.read_csv("mail_data.csv")

In [36]:
# Preview the first three rows of the loaded data
ds.head(3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [37]:
# Count missing values in each column
ds.isnull().sum()

Category    0
Message     0
dtype: int64

In [38]:
# Label encoding: 0 --> spam mail, 1 --> ham mail
label_codes = {'spam': 0, 'ham': 1}

# Skip the work when the column is already integer-typed, so re-running this cell is harmless.
if not pd.api.types.is_integer_dtype(ds['Category']):
    encoded = ds['Category'].map(label_codes)

    # map() yields NaN for any value that is not a key of label_codes; fail loudly
    # here instead of letting unlabeled rows flow into training.
    unexpected = ds.loc[encoded.isna(), 'Category'].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected Category values: {list(unexpected)}")

    # Replace the whole column at once so it gets a real integer dtype. Writing ints
    # element-wise into the text column leaves a mixed object column, and pandas
    # versions that infer a dedicated string dtype are expected to reject such writes.
    ds['Category'] = encoded.astype('int64')

In [39]:
# Look at the first three rows again, now that Category has been label-encoded
ds.head(3)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...


In [40]:
# Split the frame into the input text (X) and the encoded label (Y)
X, Y = ds['Message'], ds['Category']

In [41]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

Feature extraction: convert the message text into TF-IDF vectors


In [42]:
# Turn the raw message text into TF-IDF feature vectors that the classifier can consume.
# The vectorizer is fitted on the training messages only and then reused, unchanged,
# to transform the test messages.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

# Make sure both label series are integer-typed before fitting
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [43]:
# Create the logistic regression classifier with its default settings (it is not fitted here)
model = LogisticRegression()


In [44]:
# Train the logistic regression model on the TF-IDF features of the training messages
model.fit(X_train_feature,Y_train)

In [45]:
# Accuracy on the training data: predict labels for the same rows the model was fitted on
# (X_train_prediction holds predicted labels, despite the "X" in its name)
X_train_prediction = model.predict(X_train_feature)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)

In [46]:
# Report the fraction of training messages the model labels correctly
print("Accuracy of trained data :" , training_data_accuracy)

Accuracy of trained data : 0.9676912721561588


In [47]:
# Accuracy on the testing data: predict labels for the held-out rows
# (X_test_prediction holds predicted labels, despite the "X" in its name)
X_test_prediction = model.predict(X_test_feature)
testing_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)

In [48]:
# Report the fraction of held-out test messages the model labels correctly
print("Accuracy of testing data :" , testing_data_accuracy)

Accuracy of testing data : 0.9668161434977578


In [49]:

# Sample message to classify, wrapped in a one-element list
input_mail = ["Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]

# Vectorize it with the already-fitted TF-IDF vectorizer (transform only, no refit)
input_data_features = feature_extraction.transform(input_mail)

# Predict the label and show the raw prediction array
prediction = model.predict(input_data_features)
print(prediction)

# Translate the numeric label into a readable verdict: 1 is ham, anything else is spam
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

[0]
Spam mail
