# Step 01 - Import Statements

In [1]:
import pandas as pd
import numpy as np
import nltk
import joblib

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Step 02 - Data Preprocessing

In [2]:
df = pd.read_csv('./mail_data.csv')

In [3]:
# checking null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
mail_data = df.where((pd.notnull(df)), '')

In [5]:
mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# checking row and columns
mail_data.shape

(5572, 2)

In [7]:
mail_data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [8]:
# label encoder
mail_data.replace({'Category':{'ham':1, 'spam':0}}, inplace=True)

In [9]:
mail_data['Category'].value_counts()

1    4825
0     747
Name: Category, dtype: int64

In [10]:
x = mail_data["Message"]
y = mail_data['Category'].values

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [12]:
def clean_mail(x):
    stopword = set(stopwords.words('english'))
    stem = PorterStemmer()
    
    mail = word_tokenize(x)
    word = [stem.stem(text.lower()) for text in mail if text.isalpha() and text.lower() not in set(stopwords.words('english'))]
    return ' '.join(word)

In [13]:
x = x.apply(clean_mail)

In [14]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [15]:
feature_extraction.fit(x)

In [16]:
x_train= feature_extraction.transform(x_train)
x_test = feature_extraction.transform(x_test)

# Step 03 - Creating Model

In [17]:
model = LogisticRegression()

In [18]:
model.fit(x_train, y_train)

## Evaluate the Model

In [19]:
pred = model.predict(x_test)

In [20]:
print(accuracy_score(y_test, pred))

0.9515695067264573


## Saving the model

In [21]:
joblib.dump(model, './model.pkl')
joblib.dump(feature_extraction, 'feature_extraction.pkl')

['feature_extraction.pkl']

**Building the Predictive System**

In [22]:
def mail_filter(email):
    feature_extraction = joblib.load('./feature_extraction.pkl')
    model = joblib.load('./model.pkl')
    
    def clean_mail(x):
        stopword = set(stopwords.words('english'))
        stem = PorterStemmer()

        mail = word_tokenize(x)
        word = [stem.stem(text.lower()) for text in mail if text.isalpha() and text.lower() not in set(stopwords.words('english'))]
        return ' '.join(word)
    
    mail = clean_mail(email)
    words_list = feature_extraction.transform([mail])

    
    result = model.predict(words_list)[0]
    
    if result == 1:
        print('Not Spam')
    else:
        print("Spam")
    