In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Read the e-mail dataset from the CSV file in the working directory.
DATASET_PATH = 'spam_ham_dataset.csv'
data = pd.read_csv(DATASET_PATH)

# Preview the first rows to check how the file was parsed.
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
# List the column labels of the loaded frame.
data.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [7]:
# Remove the 'Unnamed: 0' and 'label_num' columns so that only 'label' and
# 'text' remain.
columns_to_remove = ['Unnamed: 0', 'label_num']
data = data.drop(columns=columns_to_remove)
data.head()

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


In [8]:
# Count the missing values in each column.
data.isna().sum()

label    0
text     0
dtype: int64

In [9]:
# Encode the textual labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data = data.assign(label=data['label'].map(label_codes))
data.head()
                            

Unnamed: 0,label,text
0,0,Subject: enron methanol ; meter # : 988291\r\n...
1,0,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,0,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,1,"Subject: photoshop , windows , office . cheap ..."
4,0,Subject: re : indian springs\r\nthis deal is t...


In [10]:
# Word-level bag-of-words counter using scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
)

In [11]:
# Features are the raw e-mail texts; targets are the encoded labels.
X, y = data['text'], data['label']

In [12]:
# Learn the vocabulary from every document in X and replace X with the
# resulting document-term matrix. Because X is rebound here, re-running this
# cell on its own would feed that matrix back into fit_transform.
# NOTE(review): the vocabulary is fitted before the train/test split below, so
# it also contains tokens that only occur in test documents — confirm this is intended.
X = vectorizer.fit_transform(X)

In [13]:
# Hold out 20 % of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Convert both feature matrices to dense arrays.
# NOTE(review): the dense training matrix has shape (4136, 50140); check
# whether the downstream estimator could consume the sparse matrices directly
# before keeping these copies.
X_train, X_test = X_train.toarray(), X_test.toarray()

for features in (X_train, X_test):
    print(features.shape)

(4136, 50140)
(1035, 50140)


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [15]:
# Candidate hyper-parameters: three values of C and two solvers, all with the
# L2 penalty.
param_grid = dict(
    C=[1.0, 2.0, 3.0],
    penalty=['l2'],
    solver=['lbfgs', 'liblinear'],
)

# Exhaustive search over every combination in param_grid with cv=4.
grid = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=4)

In [16]:
# Run the cross-validated parameter search on the training data.
grid.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 2.0, 3.0], 'penalty': ['l2'],
                         'solver': ['lbfgs', 'liblinear']})

In [17]:
# Parameter combination selected by the search.
grid.best_params_

{'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}

In [18]:
# Mean cross-validated score obtained with the selected parameter combination.
grid.best_score_

0.9784816247582204

In [19]:
# `grid` was built without a `refit` argument, so GridSearchCV uses its default
# (refit=True) and has already re-fitted the best parameter combination on the
# whole of (X_train, y_train). best_estimator_ is therefore ready to use;
# calling model.fit(X_train, y_train) again would only repeat that training.
model = grid.best_estimator_

# Display the selected estimator.
model

LogisticRegression(solver='liblinear')

In [20]:
# Score of the fitted model on the data it was trained on.
train_message = "Le score d'entrainement est de "
score_01 = model.score(X_train, y_train)
print(train_message, score_01)

Le score d'entrainement est de  0.999274661508704


In [21]:
# Score of the fitted model on the held-out test split. The message names the
# test score explicitly so it cannot be mistaken for the training score.
score_02 = model.score(X_test, y_test)
print("Le score de test est de ", score_02)

Le score d'entrainement est de  0.9797101449275363


In [22]:
# Check the container type of the test features (X_test was converted with toarray() earlier).
type(X_test)

numpy.ndarray

In [27]:
def spam_classifier(vect, mod, mail):
    """Classify a single e-mail with a fitted vectorizer and model.

    Parameters
    ----------
    vect : object
        Fitted vectorizer exposing ``transform``. It is called with a
        one-element list containing ``mail`` and its result must provide
        ``toarray()``.
    mod : object
        Fitted model exposing ``predict``.
    mail : str
        Raw text of the e-mail to classify.

    Returns
    -------
    object
        Whatever ``mod.predict`` returns for the single e-mail. The same
        value is also printed, as before.
    """
    # transform() works on a collection of documents, so wrap the single mail.
    features = vect.transform([mail]).toarray()
    prediction = mod.predict(features)

    # Printing is kept for backward compatibility; the prediction is also
    # returned so that callers can use it programmatically.
    print(prediction)
    return prediction
    

In [30]:
# Try the classifier on a short sample; the same text appears in the first
# row of the dataset preview, which is labelled ham.
mail = "enron methanol ; meter # : 988291\r\n"
spam_classifier(vect=vectorizer, mod=model, mail=mail)

[0]


In [25]:
import pickle

# Persist the fitted model and vectorizer. The `with` blocks close each file
# handle as soon as the dump finishes, so the data is flushed to disk and no
# descriptor is left open.
with open("Model.sav", "wb") as model_file:
    pickle.dump(model, model_file)

with open("Vectorizer.sav", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)