In [1]:
import nltk
from sklearn.metrics import f1_score

# Make sure the Reuters corpus is available in the local NLTK data directory.
# The call reports "already up-to-date" when the package was fetched earlier.
nltk.download('reuters')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/aurelien/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [4]:
from nltk.corpus import reuters
# Count the corpus documents whose file id carries the 'train' / 'test' prefix.
print('training files : ', sum(1 for fid in reuters.fileids() if fid.startswith('train')))
print('testing files : ', sum(1 for fid in reuters.fileids() if fid.startswith('test')))

training files :  7769
testing files :  3019


In [5]:
def get_text(fileid):
    """Return the raw text of a Reuters document on a single line.

    Args:
        fileid: Corpus file id, passed unchanged to ``reuters.raw``.

    Returns:
        The document text with every newline character replaced by one space.
    """
    raw_document = reuters.raw(fileid)
    # Splitting on newlines and re-joining with spaces swaps each newline
    # for exactly one space, leaving all other characters untouched.
    return ' '.join(raw_document.split('\n'))

def get_label(fileid):
    """Return the binary target for a document: 1 for 'grain', 0 otherwise.

    Args:
        fileid: Corpus file id, passed unchanged to ``reuters.categories``.

    Returns:
        ``1`` if 'grain' is among the document's categories, else ``0``.
    """
    document_categories = reuters.categories(fileid)
    return int('grain' in document_categories)

In [9]:
import pandas as pd
from nltk.corpus import reuters

# Collect one [text, label] row per document, routing each file id to the
# training or testing list according to its prefix.
train_data = [
    [get_text(fileid), get_label(fileid)]
    for fileid in filter(lambda name: name.startswith('train'), reuters.fileids())
]
test_data = [
    [get_text(fileid), get_label(fileid)]
    for fileid in filter(lambda name: name.startswith('test'), reuters.fileids())
]

# Wrap both row lists in pandas DataFrames with named columns.
train = pd.DataFrame(data=train_data, columns=['text', 'label'])
test = pd.DataFrame(data=test_data, columns=['text', 'label'])

In [None]:
from time import time
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
import numpy as np
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Baseline pipeline: TF-IDF features ("vect") feeding a Complement Naive Bayes
# classifier ("clf"). The step names are the prefixes used by the parameter grids.
default_pipeline = Pipeline(
    steps=[("vect", TfidfVectorizer()), ("clf", ComplementNB())]
)

# Experimental pipeline: raw token counts ("vect") feeding a linear SVM trained
# by stochastic gradient descent ("clf", hinge loss).
# SGDClassifier shuffles the training data between epochs, so the seed is
# pinned to make the grid-search results reproducible from one run to the next.
experimental_pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("clf", SGDClassifier(loss='hinge', random_state=42)),
    ]
)

# Search space for the default pipeline (TfidfVectorizer + ComplementNB).
# Keys follow the "<step>__<parameter>" convention of scikit-learn pipelines.
parameter_grid_default = dict(
    vect__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    vect__min_df=(1, 3, 5, 10),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    vect__norm=("l1", "l2"),  # row normalisation applied by TF-IDF
    clf__alpha=np.logspace(start=-6, stop=6, num=13),  # 1e-6 ... 1e6, one value per decade
)

# Search space for the experimental pipeline (CountVectorizer + SGDClassifier).
# Same vectorizer options as the default grid, minus the TF-IDF-only "norm".
parameter_grid_experimental = dict(
    vect__max_df=(0.2, 0.4, 0.6, 0.8, 1.0),
    vect__min_df=(1, 3, 5, 10),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams only, or unigrams plus bigrams
    clf__alpha=np.logspace(start=-6, stop=6, num=13),  # 1e-6 ... 1e6, one value per decade
)

def set_search(experimental=False, n_jobs=-1):
    """Build the (unfitted) grid search for one of the two pipelines.

    Args:
        experimental: When true, pair ``experimental_pipeline`` with
            ``parameter_grid_experimental``; otherwise pair
            ``default_pipeline`` with ``parameter_grid_default``.
        n_jobs: Number of parallel workers handed to ``GridSearchCV``.
            ``-1`` (the default) uses every available core; pass ``None``
            to fit the candidates sequentially.

    Returns:
        A two-item list ``[search, parameter_grid]`` holding the configured
        ``GridSearchCV`` and the grid it will explore.
    """
    pipeline = experimental_pipeline if experimental else default_pipeline
    parameter_grid = parameter_grid_experimental if experimental else parameter_grid_default

    print("Selecting pipeline...")
    print(pipeline)

    # The grids hold 1040 (default) and 520 (experimental) candidates, each of
    # which is refitted for every cross-validation fold, so the fits are spread
    # over several workers instead of running one after another.
    # NOTE(review): with single-label targets, micro-averaged F1 coincides with
    # accuracy; consider scoring='f1' if the F1 of the 'grain' class is the
    # quantity to optimise -- confirm the intent before changing it.
    search = GridSearchCV(pipeline, parameter_grid, scoring='f1_micro', n_jobs=n_jobs)
    return [search, parameter_grid]

def test_data(search):
    """Fit a search object on the training frame and report its test score.

    Fits ``search`` on ``train['text']`` / ``train['label']``, prints the
    wall-clock fitting time, then prints the value returned by
    ``search.score`` on ``test['text']`` / ``test['label']``.

    Args:
        search: Object exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns:
        None. Results are only printed.
    """
    print("Evaluating pipeline...")

    started = time()
    search.fit(train['text'], train['label'])
    finished = time()
    print(f"Training time: {finished - started:.3f}s")

    # The value printed here is whatever metric the search object scores with.
    test_accuracy = search.score(test['text'], test['label'])
    print(f"Accuracy on test set: {test_accuracy:.3f}")

def display_params(search, parameter_grid):
    """Print the best estimator's value for every parameter in the grid.

    Args:
        search: Fitted search object exposing ``best_estimator_.get_params()``.
        parameter_grid: Mapping whose keys name the parameters to report.

    Returns:
        None. One ``name: value`` line is printed per key, in sorted key order.
    """
    print("Best parameters combination found:")
    tuned_values = search.best_estimator_.get_params()
    ordered_names = sorted(parameter_grid)
    for name in ordered_names:
        print(f"{name}: {tuned_values[name]}")

def search_best_parameters(experimental=False):
    """Run the full grid-search workflow for one pipeline.

    Builds the search with ``set_search``, fits and scores it with
    ``test_data``, then prints the selected parameters with ``display_params``.

    Args:
        experimental: Forwarded to ``set_search`` to choose the pipeline.

    Returns:
        None. Results are only printed.
    """
    search, parameter_grid = set_search(experimental)
    test_data(search)
    display_params(search, parameter_grid)

# Run the default pipeline first, then the experimental one.
for experimental in (False, True):
    search_best_parameters(experimental=experimental)

# TODO:
# 1) Report the F1 score on the test set for the best configuration found
# 2) Report the loss and the accuracy
# 3) Consider adding a confusion matrix for the results on the "grain" class

Selecting pipeline...
Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', ComplementNB())])
Evaluating pipeline...
