### IMPORTING LIBRARIES

In [1]:
import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Reached only if every import above succeeded.
print("Libraries Imported")

Libraries Imported


### READING DATA

In [2]:
# Load the raw e-mail dataset and preview its first row.
data = pd.read_csv(filepath_or_buffer='emails.csv')
data.head(n=1)

Unnamed: 0,text,spam,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109
0,Subject: naturally irresistible your corporate...,1,,,,,,,,,...,,,,,,,,,,


### CLEANING DATA

In [3]:
# Keep only the first two columns (text, spam); the "Unnamed: N" columns are not used.
# .copy() makes the result an independent frame, so later cells that modify `data`
# do not trigger SettingWithCopyWarning or depend on view-vs-copy behaviour of the
# slice taken from the full frame.
data = data.iloc[:, 0:2].copy()

In [4]:
# Preview the first five rows (the default row count for head()).
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
# Count the missing values in each column.
data.isnull().sum()

text    0
spam    2
dtype: int64

In [6]:
# Discard every row that contains a missing value.
data = data.dropna()

In [7]:
# Count the rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

33

In [8]:
# Keep the first occurrence of each row and drop the repeats.
data = data.drop_duplicates()

### DATA PREPROCESSING
#### 1. Convert text to lowercase

In [9]:
# Lower-case every e-mail body.
data['text'] = data['text'].map(str.lower)

In [10]:
# Preview the first five rows after lower-casing.
data.head(n=5)

Unnamed: 0,text,spam
0,subject: naturally irresistible your corporate...,1
1,subject: the stock trading gunslinger fanny i...,1
2,subject: unbelievable new homes made easy im ...,1
3,subject: 4 color printing special request add...,1
4,"subject: do not have money , get software cds ...",1


#### 2. Remove special characters

In [11]:
def remove_special_chars(text):
    """Strip every character that is neither alphanumeric nor a space.

    Args:
        text: String to clean.

    Returns:
        The kept characters in their original order, with leading and
        trailing whitespace trimmed. Interior runs of spaces are left as-is.
    """
    kept = [ch for ch in text if ch == ' ' or ch.isalnum()]
    return ''.join(kept).strip()

In [12]:
# Remove punctuation and other special characters from every e-mail body.
data['text'] = data['text'].map(remove_special_chars)

#### 3. Remove stop words
#### 4. Convert to vectors (both steps are handled by `CountVectorizer`)

In [13]:
# Bag-of-words vectorizer: drops English stop words and caps the vocabulary at 10000 terms.
cv = CountVectorizer(max_features=10000, stop_words='english')

#### CHECKING FOR LABELS' TYPE

In [14]:
# Inspect the distinct label values and how often each occurs.
data.loc[:, 'spam'].value_counts()

0                                                                                               4326
1                                                                                               1368
 mr suresh prabhu                                                                                  1
 its termination would not  have such a phenomenal impact on the power situation .  however        1
Name: spam, dtype: int64

In [15]:
def keep_numeric(value):
    """Return ``value`` unchanged if it is a purely numeric string, else ``None``.

    Used to blank out labels that are not numbers (for example, fragments of
    e-mail text that ended up in the label column) so they can be dropped as
    missing values afterwards.

    The check is made on the whole string at once. Accumulating characters and
    switching the accumulator to ``None`` on the first non-numeric character
    fails with ``TypeError`` as soon as a numeric character follows
    (``"a1"``, ``"1 2"``), because ``None + str`` is not defined.

    Args:
        value: Label as a string.

    Returns:
        ``value`` itself when every character is numeric and the string is
        non-empty; otherwise ``None``. An empty string is treated as
        "not a label" and also yields ``None``.

    Raises:
        TypeError: If ``value`` is not a string.
    """
    if not isinstance(value, str):
        raise TypeError(
            f"keep_numeric expects a string label, got {type(value).__name__}"
        )
    # str.isnumeric() is True only for a non-empty, all-numeric string.
    return value if value.isnumeric() else None
# Replace every non-numeric label with None so it can be dropped as missing.
data['spam'] = data['spam'].map(keep_numeric)

In [16]:
# Re-check the label distribution after filtering out non-numeric values.
data.loc[:, 'spam'].value_counts()

0    4326
1    1368
Name: spam, dtype: int64

In [17]:
# Count the labels that were blanked out by the numeric filter.
data.isnull().sum()

text    0
spam    2
dtype: int64

In [18]:
# Drop the rows whose label is missing.
data = data.dropna()

### EXTRACTING X and y

In [19]:
# Learn the vocabulary and build the document-term count matrix.
# The result stays in the sparse format returned by CountVectorizer: densifying a
# (documents x 10000) count matrix with .toarray() costs hundreds of MB, while
# train_test_split and MultinomialNB both accept sparse input directly.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split,
# so the held-out rows influence which terms are kept.
X = cv.fit_transform(data['text'])

In [20]:
# Label vector, taken from the same rows of `data` that were vectorized into X.
y = data['spam'].to_numpy()

In [21]:
# Feature matrix dimensions: (documents, vocabulary terms).
X.shape

(5694, 10000)

In [22]:
# Label vector length; should match the number of rows in X.
y.shape

(5694,)

### CREATING TRAIN AND TEST SET

In [23]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and the accuracy computed from it,
# is the same on every Restart & Run All. stratify=y keeps the spam / non-spam
# ratio equal in both partitions, which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### CREATING MODEL

In [24]:
# Fit a multinomial Naive Bayes classifier on the training term counts.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB()

### EVALUATING THE MODEL

In [25]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9912203687445127

### SAVING THE MODEL

In [29]:
# Persist the fitted classifier and the fitted vectorizer.
# The output directory is created first so the cell also works when models/ does not
# exist yet, and each file is opened in a `with` block so its handle is flushed and
# closed as soon as the dump finishes instead of being left open.
os.makedirs('models', exist_ok=True)
with open('models/clf.sav', 'wb') as clf_file:
    pickle.dump(clf, clf_file)
with open('models/cv.sav', 'wb') as cv_file:
    pickle.dump(cv, cv_file)