In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled mail corpus from the project-relative data directory.
dataset = pd.read_csv(filepath_or_buffer='data/mail_data.csv')

In [3]:
# Peek at the first three rows to confirm the file parsed as expected.
dataset.head(n=3)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# (rows, columns) of the loaded frame.
dataset.shape

(5572, 2)

In [6]:
# Replace any missing values with empty strings so the text vectorizer never
# receives NaN. `DataFrame.where` returns a new frame rather than modifying
# the caller, so the result has to be assigned back to take effect.
dataset = dataset.where(pd.notnull(dataset), '')
dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Re-check dtypes and non-null counts after the missing-value handling step.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [8]:
# Encode the text labels numerically: spam -> 0, ham -> 1.
# Both masks are built up front; they are disjoint, so assignment order
# does not matter.
is_spam = dataset['Category'] == 'spam'
is_ham = dataset['Category'] == 'ham'
dataset.loc[is_spam, 'Category'] = 0
dataset.loc[is_ham, 'Category'] = 1

In [23]:
# Inspect the first rows to confirm the labels are now 0 (spam) / 1 (ham).
dataset.head(n=7)

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
5,0,FreeMsg Hey there darling it's been 3 week's n...
6,1,Even my brother is not like to speak with me. ...


In [10]:
# Show the full text of the message at index label 1.
dataset.loc[1, 'Message']

'Ok lar... Joking wif u oni...'

In [11]:
# Separate the raw message text (model input) from the encoded label (target).
x, y = dataset['Message'], dataset['Category']

In [12]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# TF-IDF vectorizer: lowercases text, drops English stop words, and keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

In [14]:
# Cast the 0/1 labels to an integer dtype for the classifier.
y_train_feature = y_train.astype('int')
y_test_feature = y_test.astype('int')

# Fit the vocabulary / IDF weights on the training text only, then reuse the
# fitted vectorizer to transform the held-out text.
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

In [15]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
lr = LogisticRegression()

In [16]:
# Train the classifier on the TF-IDF features of the training messages.
lr.fit(X=x_train_feature, y=y_train_feature)

In [17]:
# Mean accuracy as a percentage, reported as (test, train).
test_accuracy_pct = lr.score(x_test_feature, y_test_feature) * 100
train_accuracy_pct = lr.score(x_train_feature, y_train_feature) * 100
test_accuracy_pct, train_accuracy_pct

(96.7713004484305, 96.70181736594121)

In [18]:
# Accuracy on the training split, computed from explicit predictions.
train_predictions = lr.predict(x_train_feature)
accuracy_score(y_train_feature, train_predictions)

0.9670181736594121

In [19]:
# Accuracy on the held-out split, computed from explicit predictions.
test_predictions = lr.predict(x_test_feature)
accuracy_score(y_test_feature, test_predictions)

0.967713004484305

In [40]:
# Classify ad-hoc message(s) with the model trained in this notebook.
input_mail = ['ur debit card expired please click on the given link below to renew it']
print(input_mail)

# `predict` returns one label per input message. Iterating over the labels
# avoids relying on the truthiness of an array comparison, which raises
# ValueError as soon as `input_mail` contains more than one message.
input_features = feature_extraction.transform(input_mail)
for predicted_label in lr.predict(input_features):
    # Label encoding used in this notebook: 0 = spam, 1 = ham.
    if predicted_label == 0:
        print('spam mail')
    else:
        print('not spam')

['ur debit card expired please click on the given link below to renew it']
not spam


In [39]:
# Show the full text of the message at index label 2 (a spam example).
dataset.loc[2, 'Message']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [31]:
# Load a previously saved model bundle and pull out its two components.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
# NOTE(review): this notebook never writes './all_spam_mail_model.pkl' --
# presumably it is produced by a separate training run; confirm provenance.
loaded_package = joblib.load('./all_spam_mail_model.pkl')
spamModel = loaded_package['model']
spamvectorizer = loaded_package['vectorizer']

In [37]:
# Classify the same `input_mail` messages with the pre-trained model bundle.
# Iterate over the per-message predictions instead of testing an array
# comparison for truthiness, which raises ValueError when `input_mail`
# holds more than one message.
for predicted_label in spamModel.predict(spamvectorizer.transform(input_mail)):
    # NOTE(review): assumes the saved model uses the same 0 = spam encoding
    # as this notebook -- confirm against the training code for the bundle.
    if predicted_label == 0:
        print('spam mail')
    else:
        print('not spam')

spam mail
