In [None]:
import string
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from nltk import tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Алексей\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [None]:
data = pd.read_csv('sms_spam.csv', usecols=[0, 1])
x_train, x_test, y_train, y_test = train_test_split(data.text, data.type , train_size = 0.7)

In [None]:
class spamClassifier:
  def __init__(self,
               lower_case = True,
               stop_words = {},
               ngram_range = (1,5),
               analyzer = 'char',
               max_df = 1,
               min_df = 1,
               max_features = 1000,
               Vectorizer = TfidfVectorizer):

     self.Vectorizer = Vectorizer(lowercase=lower_case,
                                       stop_words=stop_words,
                                       ngram_range=ngram_range,
                                       analyzer=analyzer,
                                       tokenizer=self.tokenize,
                                       max_df=max_df,
                                       min_df=min_df,
                                       max_features=max_features)
  def tokenize(self, text):
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

  def fit(self, x_train, y_train):
    vect_x_train = self.Vectorizer.fit_transform(raw_documents = x_train)
    clf = MultinomialNB()
    clf.fit(vect_x_train, y_train)
    self.classifier = clf

  def predict(self, x_test):
    vect_x_test = self.Vectorizer.transform(x_test)
    return self.classifier.predict(vect_x_test)

In [None]:
class spamClassifierGrid:
  def __init__(self,
               low_case_set = [True],
               stop_words_set = [ stopwords.words('english')],
               analyzer_set = ['word', 'char' ],
               ngram_range_set = [[1, 5], [3, 10]],
               vectorizer_set = [TfidfVectorizer],
               min_df_set = np.linspace(0, 1, 5),
               max_df_set = np.linspace(0, 1, 5),
               max_features_set = np.linspace(1000, 20000, 5)
               ):
    self.classifier_set = []
    for case in low_case_set:
        for stWord in stop_words_set:
          for analyzer in analyzer_set:
            for n1 in ngram_range_set[0]:
              for n2 in ngram_range_set[1]:
                if (n1 < n2):
                  for vectorizer in vectorizer_set:
                    for minDf in min_df_set:
                      for maxDf in max_df_set:
                        if (minDf <= maxDf):
                          for max_feature in max_features_set:
                            self.classifier_set.append(spamClassifier(
                                  lower_case=case,
                                  stop_words=stWord,
                                  ngram_range=(n1, n2),
                                  analyzer=analyzer,
                                  max_df=maxDf,
                                  min_df=minDf,
                                  max_features=int(max_feature),
                                  Vectorizer=vectorizer))


  def fit(self, x_train, y_train):
    fitted_list = []
    for classifier in self.classifier_set:
      try:
        classifier.fit(x_train, y_train)
        fitted_list.append(classifier)
      except:
        pass
    self.classifier_set = fitted_list

  def predict(self, x_test):
    self.predicted = []
    for classifier in  self.classifier_set:
      self.predicted.append(classifier.predict(x_test))
    return self.predicted

  def getReport(self, x_test, y_test):
    report = pd.DataFrame(columns = ['Low word case',
                                     'Stop words',
                                     'n-gramm type',
                                     'n-gramm range',
                                     'max_df',
                                     'min_df',
                                     'max features',
                                     'Vectorizer',
                                     'precision',
                                     'recall',
                                     'f1-score',
                                     'accuracy'])
    for i in range(len(self.classifier_set)):
      params = self.classifier_set[i].Vectorizer.get_params()
      pred = self.classifier_set[i].predict(x_test)
      report.loc[i] = {'Low word case' : params.get('lowercase'),
                       'Stop words': (params.get('stop_words') != {}),
                       'n-gramm type' : params.get('analyzer'),
                       'n-gramm range' : params.get('ngram_range'),
                       'max_df' : params.get('max_df'),
                       'min_df' : params.get('min_df'),
                       'max features' : params.get('max_features'),
                       'Vectorizer' : str(type(self.classifier_set[0].
                                               Vectorizer)).split('.')[-1].
                                               split('\'')[0],
                       'precision' : precision_score(y_test,
                                                     pred,
                                                     average='macro'),
                       'recall' : recall_score(y_test,
                                               pred,
                                               average='macro'),
                       'f1-score' : f1_score(y_test,
                                             pred,
                                             average='macro'),
                       'accuracy' : accuracy_score(y_test,
                                                   pred)}
    return report

In [None]:
example = spamClassifierGrid()
example.fit(x_train, y_train)
result = example.getReport(x_test, y_test)

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
result.sample(20)

Unnamed: 0,Low word case,Stop words,n-gramm type,n-gramm range,max_df,min_df,max features,Vectorizer,precision,recall,f1-score,accuracy
32,True,True,char,"(1, 3)",1.0,0.25,10500,TfidfVectorizer,0.43425,0.5,0.464811,0.8685
3,True,True,char,"(1, 3)",0.25,0.0,15250,TfidfVectorizer,0.975407,0.889877,0.926802,0.969516
112,True,True,char,"(5, 10)",0.75,0.0,10500,TfidfVectorizer,0.948705,0.925968,0.936914,0.971907
30,True,True,char,"(1, 3)",1.0,0.25,1000,TfidfVectorizer,0.43425,0.5,0.464811,0.8685
48,True,True,char,"(1, 3)",1.0,0.75,15250,TfidfVectorizer,0.43425,0.5,0.464811,0.8685
11,True,True,char,"(1, 3)",0.75,0.0,5750,TfidfVectorizer,0.97544,0.921007,0.945911,0.976689
25,True,True,char,"(1, 3)",0.75,0.25,1000,TfidfVectorizer,0.43425,0.5,0.464811,0.8685
57,True,True,char,"(1, 10)",0.5,0.0,10500,TfidfVectorizer,0.966632,0.948487,0.957304,0.980873
96,True,True,char,"(1, 10)",1.0,0.75,5750,TfidfVectorizer,0.43425,0.5,0.464811,0.8685
23,True,True,char,"(1, 3)",0.5,0.25,15250,TfidfVectorizer,0.43425,0.5,0.464811,0.8685


In [None]:
result.iloc[result['precision'].argmax()]

Low word case               True
Stop words                  True
n-gramm type                char
n-gramm range            (1, 10)
max_df                      0.25
min_df                         0
max features               20000
Vectorizer       TfidfVectorizer
precision               0.986096
recall                  0.940221
f1-score                0.961568
accuracy                0.983264
Name: 54, dtype: object

In [None]:
result.iloc[result['recall'].argmax()]

Low word case               True
Stop words                  True
n-gramm type                char
n-gramm range            (1, 10)
max_df                      0.25
min_df                         0
max features               10500
Vectorizer       TfidfVectorizer
precision               0.967554
recall                  0.953033
f1-score                 0.96013
accuracy                0.982068
Name: 52, dtype: object

In [None]:
result.iloc[result['f1-score'].argmax()]


Low word case               True
Stop words                  True
n-gramm type                char
n-gramm range             (1, 3)
max_df                      0.25
min_df                         0
max features                1000
Vectorizer       TfidfVectorizer
precision               0.982621
recall                  0.948624
f1-score                0.964748
accuracy                0.984459
Name: 0, dtype: object

In [None]:
result.iloc[result['accuracy'].argmax()]

Low word case               True
Stop words                  True
n-gramm type                char
n-gramm range             (1, 3)
max_df                      0.25
min_df                         0
max features                1000
Vectorizer       TfidfVectorizer
precision               0.982621
recall                  0.948624
f1-score                0.964748
accuracy                0.984459
Name: 0, dtype: object