# Ejercicio de Naive Bayes Multinomial

En la clase anterior, para medir la cantidad de artículos clasificados correctamente se utilizó el mismo subconjunto del dataset que se utilizó para entrenar.

Esta medida no es una medida del todo válida, ya que lo que interesa de un clasificador es su capacidad de clasificación de datos que no fueron utilizados para entrenar. Es decir, el valor de un clasificador viene de su capacidad de generalizar y predecir resultados correctos para datos nuevos.

Es por eso que se busca, para el clasificador entrenado con el subconjunto de training:
- cuál es el porcentaje de artículos del subconjunto de testing clasificados correctamente;
- comparar con el porcentaje anterior;
- explicar las diferencias.

## En este ejercicio se le pide:

### 1. Evaluar distintos modelos N.B.:
- para el dataset que se encuentra en 'data/emails.csv'
- dados los hiperparámetros detallados a continuación
- utilizando la repartición de datos entre training y testing detallada a continuación
- separando los datos después de mezclarlos con numpy.random.shuffle usando la semilla detallada a continuación

### 2. Reportar el cross-validation score para cada modelo.

### 3. Evaluar el mejor modelo N.B.:
- utilizando la misma repartición de datos entre training y testing de más arriba

### 4. Reportar el test score para el mejor modelo.

# Solución

In [0]:
# These two commands avoid having to reload a package by hand every time it
# is modified.
%load_ext autoreload
%autoreload 2

In [0]:
import numpy as np

import nltk
from   nltk.tokenize import TreebankWordTokenizer
from   nltk.stem     import PorterStemmer, WordNetLemmatizer
from   nltk.corpus   import stopwords

# Fetch the NLTK corpora needed below; the 'stopwords' corpus is read by the
# preprocessor's stop-word filter.
nltk.download('wordnet')
nltk.download('stopwords')

# Shared NLTK tool instances used by the preprocessing code further down.
tokenizer  = TreebankWordTokenizer()
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed given to np.random.seed right before the dataset rows are shuffled.
random_seed = 0
# Passed as `test_size` to train_test_split (share of rows held out for testing).
test_size   = 0.3
# Passed as `cv` to cross_val_score.
cross_sets  = 5

# Candidate values for each hyperparameter; the grid is their full product.
#   isalpha - whether tokens failing str.isalpha() are dropped after stemming
#   alpha   - passed as `alpha` to MultinomialNB
#   min_df  - passed as `min_df` to the vectorizer
#   max_df  - passed as `max_df` to the vectorizer
#   tf_idf  - selects TfidfVectorizer (True) or CountVectorizer (False)
hyperparameters_specs = {
    'isalpha': [True, False],
    'alpha':   [0.01, 0.1, 1.0, 10.0],
    'min_df':  [0.01, 0.05, 0.1, 0.49],
    'max_df':  [0.5, 0.75, 0.99],
    'tf_idf':  [True, False],
}

import time
import os
import pickle

import pandas         as pd
import dask.dataframe as dd

from sklearn.model_selection         import train_test_split
from sklearn.model_selection         import cross_val_score
from sklearn.naive_bayes             import MultinomialNB

[nltk_data] Downloading package wordnet to /home/jvega/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jvega/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# DATA AND CACHE LOCATIONS

# Directory holding the input CSV, and the CSV itself.
data_path    = 'data'
dataset_path = os.path.join(data_path, 'emails.csv')

# When True, the pipeline stages hand results to each other through pickle
# files on disk instead of through return values.
caching      = True

def get_nltk_cache_path(hp):
    """Return the cache file path for the NLTK-preprocessed dataset.

    Only the 'isalpha' entry of `hp` takes part in the file name.
    """
    return 'data/cache-{}'.format(hp['isalpha'])

def get_nltk_sklearn_cache_path(hp):
    """Return the cache file path for the vectorized, shuffled dataset.

    The file name is built from the 'isalpha', 'tf_idf', 'min_df' and
    'max_df' entries of `hp`, in that order; other entries are ignored.
    """
    name_fields = ('isalpha', 'tf_idf', 'min_df', 'max_df')
    suffix = '-'.join(format(hp[field]) for field in name_fields)

    return 'data/cache-' + suffix

In [0]:
# DEFINING HYPERPARAMETERS

def get_hyperparameters(specs=None):
    """Yield every combination of hyperparameter values as a dict.

    Args:
        specs: Mapping with the keys 'isalpha', 'alpha', 'min_df', 'max_df'
            and 'tf_idf', each holding an iterable of candidate values.
            Defaults to the module-level ``hyperparameters_specs``.

    Yields:
        dict: One combination, with keys in the order 'isalpha', 'alpha',
        'min_df', 'max_df', 'tf_idf'. 'isalpha' varies slowest, followed by
        'tf_idf', 'min_df' and 'max_df'; 'alpha' varies fastest.
    """
    from itertools import product

    if specs is None:
        specs = hyperparameters_specs

    # The argument order fixes the iteration order (leftmost varies slowest).
    grid = product(
        specs['isalpha'],
        specs['tf_idf'],
        specs['min_df'],
        specs['max_df'],
        specs['alpha'],
    )

    for isalpha, tf_idf, min_df, max_df, alpha in grid:
        yield {
            'isalpha': isalpha,
            'alpha':   alpha,
            'min_df':  min_df,
            'max_df':  max_df,
            'tf_idf':  tf_idf,
        }
                        
# Table with one row per hyperparameter combination (default integer index,
# columns in the key order of the yielded dicts). Built with a single
# constructor call: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every iteration.
hyperparameters = pd.DataFrame(list(get_hyperparameters()))

In [0]:
# DEFINING PREPROCESSORS

def make_nltk_preprocessor(**kwargs):
    """Build a function that turns one raw text into a normalized token string.

    Keyword Args:
        isalpha: When truthy (the default), tokens for which
            ``str.isalpha()`` is false are dropped after stemming.

    Returns:
        A callable taking one raw text and returning the processed tokens
        joined by single spaces. The steps are: tokenize, lemmatize as verbs,
        drop English stop words, stem, and optionally keep alphabetic tokens
        only.
    """
    only_alpha = kwargs.get('isalpha', True)

    # Read the stop-word list once per preprocessor instead of once per token;
    # a frozenset also makes every membership test constant-time.
    english_stopwords = frozenset(stopwords.words('english'))

    def preprocessor(datapoint):
        tokens = tokenizer.tokenize(datapoint)
        lemmas = (lemmatizer.lemmatize(token, pos='v') for token in tokens)
        # Stop words are matched against the lemma exactly as produced
        # (no case folding is applied here).
        stems = [
            stemmer.stem(lemma)
            for lemma in lemmas
            if lemma not in english_stopwords
        ]

        if only_alpha:
            stems = [stem for stem in stems if stem.isalpha()]

        return ' '.join(stems)

    return preprocessor

def run_nltk_preprocessor(hp, dataset=None):
    """Run the NLTK text preprocessing stage for one hyperparameter set.

    With the module-level ``caching`` flag set to True the CSV at
    ``dataset_path`` is read, its 'text' column is preprocessed and the
    resulting DataFrame is pickled to ``get_nltk_cache_path(hp)``; nothing is
    done when that file already exists. Otherwise a copy of `dataset` is
    preprocessed and returned, and cache files are neither read nor written.

    Args:
        hp: Mapping providing at least the 'isalpha' entry.
        dataset: DataFrame with a 'text' column. Only used (and then
            required) when caching is disabled.

    Returns:
        None when caching is enabled, otherwise the preprocessed DataFrame.

    Raises:
        ValueError: If caching is disabled and `dataset` is None.
    """
    print('nltk preprocessing...')

    cache_path = get_nltk_cache_path(hp)

    if caching is True:
        if os.path.isfile(cache_path):
            return
        print('cache miss: ', cache_path)
        dataset = pd.read_csv(dataset_path)
    else:
        # A leftover cache file must not short-circuit this branch: the caller
        # expects the preprocessed data back, not its own untouched input.
        if dataset is None:
            raise ValueError('dataset is required when caching is disabled')
        dataset = dataset.copy()

    preprocessor = make_nltk_preprocessor(isalpha=hp['isalpha'])
    # os.cpu_count() may return None; fall back to a single partition.
    ddataset = dd.from_pandas(dataset, npartitions=os.cpu_count() or 1)
    dataset['text'] = (
        ddataset['text']
        .map_partitions(lambda df: df.apply(preprocessor))
        .compute(scheduler='multiprocessing')
    )

    if caching is True:
        with open(cache_path, 'wb') as fp:
            pickle.dump(dataset, fp)
        return

    return dataset
    
def run_sklearn_preprocessor(hp, dataset=None):
    """Vectorize the preprocessed texts and shuffle the resulting matrix.

    The 'text' column is turned into a dense term matrix, the 'spam' column
    is appended as the last column, and the rows are shuffled in place after
    seeding NumPy's global generator with ``random_seed``.

    With the module-level ``caching`` flag set to True the input is the
    DataFrame pickled at ``get_nltk_cache_path(hp)`` and the matrix is pickled
    to ``get_nltk_sklearn_cache_path(hp)``; nothing is done when that file
    already exists. Otherwise a copy of `dataset` is used and the matrix is
    returned, and cache files are neither read nor written.

    Args:
        hp: Mapping providing 'isalpha', 'tf_idf', 'min_df' and 'max_df'.
        dataset: DataFrame with 'text' and 'spam' columns. Only used (and
            then required) when caching is disabled.

    Returns:
        None when caching is enabled, otherwise the shuffled matrix.

    Raises:
        ValueError: If caching is disabled and `dataset` is None.
    """
    print('sklearn preprocessing...')

    cache_path = get_nltk_sklearn_cache_path(hp)

    if caching is True:
        if os.path.isfile(cache_path):
            return
        print('cache miss: ', cache_path)
        # NOTE: pickle must only be used on trusted files; this one is expected
        # to be the cache written by run_nltk_preprocessor.
        with open(get_nltk_cache_path(hp), 'rb') as fp:
            dataset = pickle.load(fp)
    else:
        # Without this branch being independent of the cache file, `D` would
        # be unbound whenever a stale cache file exists.
        if dataset is None:
            raise ValueError('dataset is required when caching is disabled')
        dataset = dataset.copy()

    # Truthiness instead of `is True`, so a numpy bool selects TF-IDF as well.
    vectorizer_class = TfidfVectorizer if hp['tf_idf'] else CountVectorizer
    vectorizer = vectorizer_class(min_df=hp['min_df'], max_df=hp['max_df'])

    # NOTE(review): the vectorizer is fitted on every row, including the rows
    # that later become the test split - confirm this is intended.
    X = vectorizer.fit_transform(dataset['text']).toarray()
    Y = np.array([dataset['spam'].values]).T  # labels as a single column
    D = np.hstack((X, Y))

    np.random.seed(seed=random_seed)
    np.random.shuffle(D)

    if caching is True:
        with open(cache_path, 'wb') as fp:
            pickle.dump(D, fp)
        return

    return D

In [0]:
# DEFINING CLASSIFIER TRAINING

def make_score_classifier(dataset=None):
    """Build the row-wise scoring function applied to the hyperparameter table.

    Args:
        dataset: Matrix with the labels in its last column. Only used when
            the module-level ``caching`` flag is not True; otherwise the
            matrix is unpickled from ``get_nltk_sklearn_cache_path(hp)``.

    Returns:
        A function ``score_classifier(hp)`` that takes one hyperparameter row
        (a pandas Series) and returns a copy of it with an added 'score'
        entry: the mean ``cross_val_score`` of a MultinomialNB over the
        training part of the data.
    """
    def score_classifier(hp):
        print(hp.to_dict())

        if caching is True:
            # NOTE: pickle must only be used on trusted files; this one is
            # expected to be the cache written by run_sklearn_preprocessor.
            with open(get_nltk_sklearn_cache_path(hp), 'rb') as fp:
                D = pickle.load(fp)
        else:
            D = dataset.copy()

        # Features are every column but the last; labels are the last column.
        X = D[:, :-1]
        Y = D[:, -1]

        # shuffle=False: the rows are split in their stored order, so the
        # test part is never seen by the cross-validation below.
        X_train, _, Y_train, _ = train_test_split(
            X, Y, test_size=test_size, shuffle=False)

        clf = MultinomialNB(alpha=hp['alpha'], class_prior=None, fit_prior=False)
        scores = cross_val_score(clf, X_train, Y_train, cv=cross_sets)

        # Work on a copy: mutating the row handed over by DataFrame.apply is
        # not supported by pandas.
        scored = hp.copy()
        scored['score'] = scores.mean()

        return scored

    return score_classifier

def evaluate_hyperparameters(hyperparameters, dataset=None):
    """Score every row of the hyperparameter table.

    Args:
        hyperparameters: DataFrame with one hyperparameter set per row.
        dataset: Forwarded to ``make_score_classifier``.

    Returns:
        The result of applying the scoring function to each row of
        `hyperparameters`. The elapsed whole seconds are printed.
    """
    start = time.time()

    score_classifier = make_score_classifier(dataset)
    # Rows are scored sequentially; the dask frame that used to be built here
    # was never used and has been dropped.
    scores = hyperparameters.apply(score_classifier, axis=1)

    elapsed = int(time.time() - start)
    print('finished in', elapsed, 'seconds.')

    return scores

def train_classifier(hp, dataset=None):
    """Fit a MultinomialNB on the training part of the data.

    Args:
        hp: Hyperparameter row (a pandas Series) providing 'alpha' and, when
            caching is enabled, the entries that name the cache file.
        dataset: Matrix with the labels in its last column. Only used when
            the module-level ``caching`` flag is not True. Previously this
            branch read an undefined name and raised NameError.

    Returns:
        The fitted MultinomialNB instance.
    """
    print(hp.to_dict())

    if caching is True:
        # NOTE: pickle must only be used on trusted files; this one is
        # expected to be the cache written by run_sklearn_preprocessor.
        with open(get_nltk_sklearn_cache_path(hp), 'rb') as fp:
            D = pickle.load(fp)
    else:
        D = dataset.copy()

    # Features are every column but the last; labels are the last column.
    X = D[:, :-1]
    Y = D[:, -1]

    # shuffle=False: the rows are split in their stored order.
    X_train, _, Y_train, _ = train_test_split(
        X, Y, test_size=test_size, shuffle=False)

    clf = MultinomialNB(alpha=hp['alpha'], class_prior=None, fit_prior=False)
    clf.fit(X_train, Y_train)

    return clf

def test_classifier(hp, clf, dataset=None):
    """Score a fitted classifier on the test part of the data.

    Args:
        hp: Hyperparameter row (a pandas Series); when caching is enabled its
            entries name the cache file to load.
        clf: Fitted classifier exposing ``score(X, y)``.
        dataset: Matrix with the labels in its last column. Only used when
            the module-level ``caching`` flag is not True. Previously this
            branch read an undefined name and raised NameError.

    Returns:
        The value returned by ``clf.score`` on the test split.
    """
    print(hp.to_dict())

    if caching is True:
        # NOTE: pickle must only be used on trusted files; this one is
        # expected to be the cache written by run_sklearn_preprocessor.
        with open(get_nltk_sklearn_cache_path(hp), 'rb') as fp:
            D = pickle.load(fp)
    else:
        D = dataset.copy()

    # Features are every column but the last; labels are the last column.
    X = D[:, :-1]
    Y = D[:, -1]

    # Same unshuffled split as in training, so these rows were held out.
    _, X_test, _, Y_test = train_test_split(
        X, Y, test_size=test_size, shuffle=False)

    return clf.score(X_test, Y_test)

In [0]:
# PREPROCESSING DATASET
# Runs both preprocessing stages for every hyperparameter row. Neither cache
# file name depends on 'alpha', so rows that differ only in 'alpha' find the
# files written for an earlier row and skip the work.
print('preprocessing dataset...')
for index, hp in hyperparameters.iterrows():
    print(hp.to_dict())
    run_nltk_preprocessor(hp)
    run_sklearn_preprocessor(hp)
    print('done.')

preprocessing dataset...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
cache miss:  data/cache-True-True-0.01-0.5
done.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
cache miss:  data/cache-True-True-0.01-0.75
done.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
nltk prepro

done.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
cache miss:  data/cache-True-False-0.05-0.5
done.
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': T

done.
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.05, 'max_df': 0.99, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.05, 'max_df': 0.99, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.05, 'max_df': 0.99, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 0.01, 'min_df': 0.1, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
cache miss:  data/cache-False-True-0.1-0.5
done.
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.1, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.1, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.1, 'max_df': 0.5, 'tf_idf': True}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False,

done.
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.1, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.1, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.1, 'max_df': 0.99, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 0.01, 'min_df': 0.49, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
cache miss:  data/cache-False-False-0.49-0.5
done.
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.49, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.49, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.49, 'max_df': 0.5, 'tf_idf': False}
nltk preprocessing...
sklearn preprocessing...
done.
{'isalph

In [0]:
# EVALUATING HYPERPARAMETERS
# Scores every hyperparameter row; each returned row carries a 'score' entry.
print('evaluating hyperparameters...')
scores = evaluate_hyperparameters(hyperparameters)
print('done.')

evaluating hyperparameters...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': True, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': True, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': True, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha

{'isalpha': False, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': False, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.01, 'max_df': 0.99, 'tf_idf': True}
{'isalpha': False, 'alpha': 0.01, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': False, 'alpha': 0.1, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': False, 'alpha': 1.0, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': False, 'alpha': 10.0, 'min_df': 0.05, 'max_df': 0.5, 'tf_idf': True}
{'isalpha': False, 'alpha': 0.01, 'min_df': 0.05, 'max_df': 0.75, 'tf_idf': True}
{'isalpha': False, 'alpha'

In [0]:
# REPORTING SCORES
# Bare expression: the notebook renders the scored table as the cell output.
scores

Unnamed: 0,isalpha,alpha,min_df,max_df,tf_idf,score
0,True,0.01,0.01,0.50,True,0.980794
1,True,0.10,0.01,0.50,True,0.978801
2,True,1.00,0.01,0.50,True,0.963333
3,True,10.00,0.01,0.50,True,0.967075
4,True,0.01,0.01,0.75,True,0.980794
5,True,0.10,0.01,0.75,True,0.978801
6,True,1.00,0.01,0.75,True,0.963333
7,True,10.00,0.01,0.75,True,0.967075
8,True,0.01,0.01,0.99,True,0.980794
9,True,0.10,0.01,0.99,True,0.978801


In [0]:
# TRAINING MODEL WITH BEST HYPERPARAMETERS
print('training model with best hyperparameters...')
# Pick the row with the highest cross-validation score and drop its 'score'
# entry so that only the hyperparameters remain.
best_hp = scores.loc[scores['score'].idxmax()].drop(['score'])
clf     = train_classifier(best_hp)
print('done.')

training model with best hyperparameters...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
done.


In [0]:
# EVALUATING BEST MODEL
print('evaluating best model...')
# Scale the test score to a percentage.
score = test_classifier(best_hp, clf)*100
# '{:.4}' prints the value with four significant digits.
print("accuracy: {:.4}%".format(score))

evaluating best model...
{'isalpha': True, 'alpha': 0.01, 'min_df': 0.01, 'max_df': 0.5, 'tf_idf': True}
accuracy: 97.85%
