Importing the Dependencies

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Read the mail corpus from the CSV file into a DataFrame.
Data = pd.read_csv(filepath_or_buffer="/content/mail_data.csv")
# Preview the first five rows to check the columns that were loaded.
Data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Count the missing values in each column.
Data.isna().sum()

Unnamed: 0,0
Category,0
Message,0


Labeling the data

In [8]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# A fresh column is built with `map` instead of writing integers into the
# existing text column through `.loc`: that avoids leaving an object-dtype
# column of mixed content, and it keeps working when pandas stores text in a
# strict string dtype that rejects non-string assignments.
label_codes = {'spam': 0, 'ham': 1}
# Values without an entry in the mapping (including the already-encoded 0/1
# when this cell is re-run) are passed through unchanged, so re-running the
# cell does not corrupt the column.
Data['Category'] = Data['Category'].map(lambda label: label_codes.get(label, label))

In [9]:
# Preview the first five rows again to confirm the label encoding.
Data.head(n=5)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Separate the model input (message text) from the target (encoded label).
X, Y = Data['Message'], Data['Category']

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [12]:
# Report the sizes of the full set, the training split and the test split.
print(*(subset.shape for subset in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


Feature Extraction

In [13]:
# Turn the raw message text into TF-IDF feature vectors that the logistic
# regression can take as input. English stop words are removed and the text
# is lowercased before tokenizing.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then apply the same
# fitted transformation to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the label values to integers.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [14]:
# Train a logistic-regression classifier on the TF-IDF training features.
# `fit` returns the estimator itself, so `model` is the fitted classifier.
model = LogisticRegression().fit(X_train_features, Y_train)

# Accuracy on the data the model was trained on.
Y_pred = model.predict(X_train_features)
accuracy = accuracy_score(y_true=Y_train, y_pred=Y_pred)
print(accuracy)

0.9685887368184878


In [15]:
# Accuracy on the held-out test features.
Y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(accuracy)

0.9533632286995516


In [17]:
# A single example message to classify.
Input_data = ["Sorry to be a pain. Is it ok if we meet another night? I spent late afternoon in casualty and that means i haven't done any of y stuff42moro and that includes all my time sheets and that. Sorry."]

# Vectorize the message with the already-fitted TF-IDF vectorizer.
Input_data_feature = feature_extraction.transform(Input_data)

# Predict the label and report it: 1 is reported as ham, anything else as spam.
prediction = model.predict(Input_data_feature)
print(prediction)
print("Ham Mail" if prediction[0] == 1 else "Spam Mail")

[1]
Ham Mail
