# Machine learning for text classification - CUSSON/MARMORET Project
with scikit-learn and nltk


## Agenda

- Model building in scikit-learn (refresher)
- Representing text as numerical data
- Reading a text-based dataset into pandas
- Vectorizing our dataset
- Building and evaluating a model
- Comparing models
- Examining a model for further insight
- Tuning the vectorizer (discussion)
- Some NLP tools to preprocess text

In [2]:
# for Python 2: use print only as a function
from __future__ import print_function

## 1ère étape : Chargement des données

In [3]:
# accéder à l'ensemble des revues
from os import listdir
from os.path import isfile, join
from sklearn.feature_extraction.text import CountVectorizer
import random
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

#Les algos
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

  from numpy.core.umath_tests import inner1d


In [4]:
# Charge toutes les revues du dossier pos (sous forme de tableau de revues, chaque revue étant ensuite un tableau de lignes)
reviewList = []
mydir = "../Datasets/petit/txt_sentoken/" #Pense bien à modifier ton path ;)

for txt in listdir(mydir + "pos/"):
    if isfile(join(mydir + "pos/", txt)):
        review = open(join(mydir + "pos/", txt), "r")
        reviews = " ".join(review.readlines()).replace("\n", " ")
        reviewList.append((reviews, 0))
        
for txt in listdir(mydir + "neg/"):
    if isfile(join(mydir + "neg/", txt)):
        review = open(join(mydir + "neg/", txt), "r")
        reviews = " ".join(review.readlines()).replace("\n", " ")
        reviewList.append((reviews, 1))
        
random.shuffle(reviewList)
# Use first 70% for training
train_size =int(0.7 * len(reviewList))
train_set, test_set = reviewList[:train_size], reviewList[train_size:]
        
vect_sw = CountVectorizer(stop_words='english', min_df=25, analyzer = "word", ngram_range = (1, 1)) # Jouer avec les valeurs de min et max df

# Séparation des datasets en tableaux utilisables
train_set_unlabeled,train_labels,test_set_unlabeled,test_labels = [], [], [], []

for i in range(len(train_set)) :
    train_set_unlabeled.append(train_set[i][0])
    train_labels.append(train_set[i][1])
for i in range(len(test_set)) :
    test_set_unlabeled.append(test_set[i][0])
    test_labels.append(test_set[i][1])

## Start ML

In [5]:
#Declarative way to do Machine Learning (obsolete since Pipelines)

# import and instantiate a Multinomial Naive Bayes model
#from sklearn.naive_bayes import MultinomialNB
#nb = MultinomialNB()
#%time nb.fit(train_set_transformed, train_labels)
#test_predictions = nb.predict(test_set_transformed)
#from sklearn import metrics
#metrics.accuracy_score(test_labels, test_predictions)

## Naive Bayes

In [17]:
#On créé un pipeline avec les vectorisations de nos mots ainsi que des traitements spécifiques
pipeline_nb = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('nb', MultinomialNB()),
                    ])

#Le grid search permet de comparer directement plusieurs paramètres.
# Instanciation des paramètres
parameters_nb = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'tfidf__use_idf': (True, False),
                 'nb__alpha': (1e-1, 1e-2, 1e-3),
}

GridSearchNaiveBayse = GridSearchCV(pipeline_nb, parameters_nb, n_jobs=-1)

GridSearchNaiveBayse = GridSearchNaiveBayse.fit(train_set_unlabeled, train_labels)

GridSearchNaiveBayse.predict(test_set_unlabeled)

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,

In [7]:
# Get the best parameters for Naive Bayse

print(GridSearchNaiveBayse.best_score_)

for param_name in sorted(parameters_nb.keys()):
    print("%s: %r" % (param_name, GridSearchNaiveBayse.best_params_[param_name]))

GridSearchNaiveBayse.best_estimator_.get_params()["nb"]

0.8142857142857143
nb__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

## SVM

In [22]:
pipeline_svm = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('svc', SVC()),
                    ])

In [23]:
#https://wikimedia.org/api/rest_v1/media/math/render/svg/513a31a936b91e04dae78cdf630d1d8c7ab5186b <- explication du gamma du RBF
# Instanciation des paramètres relatifs au SVM

parameters_svm = {'tfidf__use_idf': (True, False),
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'svc__kernel': ['linear','rbf'],
                  'svc__gamma': [0.1, 0.01],
                  'svc__C': [1, 10, 100],
}

GridSearchSVM = GridSearchCV(pipeline_svm, parameters_svm, n_jobs=-1)

GridSearchSVM.fit(train_set_unlabeled, train_labels)

GridSearchSVM.predict(test_set_unlabeled)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,

In [10]:
# Get the best parameters for the SVM

print(GridSearchSVM.best_score_)

for param_name in sorted(parameters_svm.keys()):
    print("%s: %r" % (param_name, GridSearchSVM.best_params_[param_name]))

GridSearchSVM.best_estimator_.get_params()["svc"]

0.8092857142857143
svc__C: 1
svc__gamma: 0.1
svc__kernel: 'linear'
tfidf__use_idf: True
vect__ngram_range: (1, 1)


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Random Forest

In [None]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
pipeline_rf = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('rf', RandomForestClassifier()),
                    ])

In [24]:
# Instanciation des paramètres relatifs à la randomForest

parameters_rf = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'rf__n_estimators': [100, 1000, 2000],
}

GridSearchRF = GridSearchCV(pipeline_rf, parameters_rf, n_jobs=-1)

GridSearchRF.fit(train_set_unlabeled, train_labels)

GridSearchRF.predict(test_set_unlabeled)

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,

In [13]:
# Get the best parameters for Random Forest

print(GridSearchRF.best_score_)

for param_name in sorted(parameters_rf.keys()):
    print("%s: %r" % (param_name, GridSearchRF.best_params_[param_name]))

GridSearchRF.best_estimator_.get_params()["rf"]

0.8278571428571428
rf__n_estimators: 10000
tfidf__use_idf: False
vect__ngram_range: (1, 2)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Gradient Boosting

In [26]:
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
pipeline_gb = Pipeline([('vect', CountVectorizer(stop_words='english', min_df=25, analyzer = "word")),
                     ('tfidf', TfidfTransformer()),
                     ('gb', GradientBoostingClassifier),
                    ])

pipeline_gb.fit(train_set_unlabeled, train_labels)

TypeError: fit() missing 1 required positional argument: 'y'

In [None]:
# Instanciation des paramètres relatifs à la randomForest

parameters_gb = {'tfidf__use_idf': (True, False),
                 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                 'gb__learning_rate': [1e-1, 1e-2, 1e-3],
                 'gb__n_estimators': [100, 1000, 10000],
}

GridSearchGB = GridSearchCV(pipeline_gb, parameters_gb, n_jobs=-1)

GridSearchGB.fit(train_set_unlabeled, train_labels)

GridSearchGB.predict(test_set_unlabeled)

In [16]:
# Get the best parameters for Random Forest

print(GridSearchRF.best_score_)

for param_name in sorted(parameters_rf.keys()):
    print("%s: %r" % (param_name, GridSearchRF.best_params_[param_name]))

GridSearchRF.best_estimator_.get_params()["rf"]

0.825
rf__n_estimators: 1000
tfidf__use_idf: False
vect__ngram_range: (1, 2)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Ce qui suit vient du doc original, donc pas intéréssant

In [None]:
# check the type of the document-term matrix
type(simple_train_dtm)

In [None]:
# examine the sparse matrix contents
print(simple_train_dtm)

From the [scikit-learn documentation](http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction):

> As most documents will typically use a very small subset of the words used in the corpus, the resulting matrix will have **many feature values that are zeros** (typically more than 99% of them).

> For instance, a collection of 10,000 short text documents (such as emails) will use a vocabulary with a size in the order of 100,000 unique words in total while each document will use 100 to 1000 unique words individually.

> In order to be able to **store such a matrix in memory** but also to **speed up operations**, implementations will typically use a **sparse representation** such as the implementations available in the `scipy.sparse` package.

In [None]:
# example text for model testing
simple_test = ["please don't call me"]

In order to **make a prediction**, the new observation must have the **same features as the training observations**, both in number and meaning.

In [None]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

In [None]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names())

**Summary:**

- `vect.fit(train)` **learns the vocabulary** of the training data
- `vect.transform(train)` uses the **fitted vocabulary** to build a document-term matrix from the training data
- `vect.transform(test)` uses the **fitted vocabulary** to build a document-term matrix from the testing data (and **ignores tokens** it hasn't seen before)

## Reading a text-based dataset into pandas

In [None]:
# read file into pandas using a relative path
dath = "data/sms.tsv"
sms = pd.read_table(path, header=None, names=["label", "message"])

In [None]:
# alternative: read file into pandas from a URL
#url = "http://www.irisa.fr/dyliss/public/fcoste/data/pub/sms.tsv"
#sms = pd.read_table(url, header=None, names=['label', "message"])

In [None]:
# examine the shape
sms.shape

In [None]:
# examine the first 10 rows
sms.head(10)

In [None]:
# examine the class distribution
sms.label.value_counts()

In [None]:
# convert label to a numerical variable
sms["label_num"] = sms.label.map({"ham":0, "spam":1})

In [None]:
# check that the conversion worked
sms.head(10)

In [None]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

In [None]:
# split X and y into training and testing sets
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

## Vectorizing our dataset

In [None]:
# instantiate the vectorizer
vect = CountVectorizer()

In [None]:
# learn training data vocabulary, then use it to create a document-term matrix
vect.fit(X_train)
X_train_dtm = vect.transform(X_train)

In [None]:
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)

In [None]:
# examine the document-term matrix
X_train_dtm

In [None]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

## Building and evaluating a model

We will use [multinomial Naive Bayes](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html):

> The multinomial Naive Bayes classifier is suitable for classification with **discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [None]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [None]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

In [None]:
# print message text for the false positives (ham incorrectly classified as spam)
X_test[(y_pred_class==1) & (y_test==0)] # using pandas' indexing by boolean array

In [None]:
# print message text for the false negatives (spam incorrectly classified as ham)
X_test[(y_pred_class < y_test)]

In [None]:
# example false negative
X_test[3132]

**predict_proba(X)**

    Return probability estimates for the test vector X.
    Parameters:	
    X : array-like, shape = [n_samples, n_features]
    Returns: 
    C : array-like, shape = [n_samples, n_classes]

        Returns the probability of the samples for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute classes.


In [None]:
X_test_dtm

In [None]:
nb.predict_proba(X_test_dtm)

In [None]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

## Comparing models

We will compare multinomial Naive Bayes with [logistic regression](http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression):

> Logistic regression, despite its name, is a **linear model for classification** rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.

In [None]:
# import and instantiate a logistic regression model 
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [None]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [None]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [None]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

In [None]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

## Examining a model for further insight 
We will examine the our **trained Naive Bayes model** to calculate the approximate **"spamminess" of each token**.

In [None]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

In [None]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

In [None]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

In [None]:
# Naive Bayes counts the number of times each token appears in each class
# trailing underscore is scikit convention for attributes that are learned during model fitting
nb.feature_count_

In [None]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

In [None]:
# number of times each token appears across all HAM messages
ham_token_count = nb.feature_count_[0, :]
ham_token_count

In [None]:
# number of times each token appears across all SPAM messages
spam_token_count = nb.feature_count_[1, :]
spam_token_count

In [None]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({"token":X_train_tokens, "ham":ham_token_count, "spam":spam_token_count}).set_index("token")
tokens.head()

In [None]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=6)

In [None]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

Before we can calculate the _"spamminess"_ of each token, we need to avoid **multiplying by zero** and account for the **class imbalance**.

In [None]:
# add 1 to ham and spam counts to avoid 0 probabilities
tokens['ham'] = tokens['ham'] + 1
tokens['spam'] = tokens['spam'] + 1
tokens.sample(5, random_state=6)

In [None]:
# convert the ham and spam counts into frequencies
tokens['ham'] = tokens['ham'] / nb.class_count_[0]
tokens['spam'] = tokens['spam'] / nb.class_count_[1]
tokens.sample(5, random_state=6)

In [None]:
# calculate the ratio of spam-to-ham for each token
tokens['spam_ratio'] = tokens['spam'] / tokens['ham']
tokens.sample(5, random_state=6)

In [None]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('spam_ratio', ascending=False)

In [None]:
# look up the spam_ratio for a given token
tokens.loc["dating", "spam_ratio"]

## Tuning the vectorizer (discussion)

Thus far, we have been using the default parameters of [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html):

In [None]:
# show default parameters for CountVectorizer
vect

However, the vectorizer is worth tuning, just like a model is worth tuning! Here are a few parameters that you might want to tune:

- **stop_words:** string {'english'}, list, or None (default)
    - If 'english', a built-in stop word list for English is used.
    - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
    - If None, no stop words will be used.

In [None]:
# remove English stop words
vect_sw = CountVectorizer(stop_words='english')

In [None]:
vect_sw.fit(X_train)
X_train_dtm_sw = vect_sw.transform(X_train)
print(X_train_dtm_sw.shape) 
print(X_train_dtm.shape)

- **ngram_range:** tuple (min_n, max_n), default=(1, 1)
    - The lower and upper boundary of the range of n-values for different n-grams to be extracted.
    - All values of n such that min_n <= n <= max_n will be used.

In [None]:
# include 1-grams and 2-grams
vect_ngram = CountVectorizer(ngram_range=(1, 2))

- **max_df:** float in range [0.0, 1.0] or int, default=1.0
    - When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [None]:
# ignore terms that appear in more than 50% of the documents
vect_maxdf = CountVectorizer(max_df=0.5)

- **min_df:** float in range [0.0, 1.0] or int, default=1
    - When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. (This value is also called _"cut-off"_ in the literature.)
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [None]:
# only keep terms that appear in at least 2 documents
vect_mindf = CountVectorizer(min_df=2)

**Guidelines for tuning CountVectorizer:**

- Use your knowledge of the **problem** and the **text**, and your understanding of the **tuning parameters**, to help you decide what parameters to tune and how to tune them.
- **Experiment**, and let the data tell you the best approach!

### TF-IDF weighting

From the [scikit-learn documentation](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html):

> Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.

> To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

> Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

> This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

In [None]:
# Computing TF only
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_dtm)
X_train_tf.shape

In [None]:
### Computing TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_dtm)
X_train_tfidf.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
X_test_tfidf = tfidf_transformer.transform(X_test_dtm)
print(X_test_tfidf.shape)
y_pred_class_tfidf = nb_tfidf.predict(X_test_tfidf)

In [None]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class_tfidf)

In [None]:
metrics.confusion_matrix(y_test, y_pred_class_tfidf)

### Building a pipeline

> In order to make the vectorizer => transformer => classifier easier to work with, scikit-learn provides a Pipeline class that behaves like a compound classifier:

In [None]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('nb', MultinomialNB()),
                    ])

> The names vect, tfidf and clf (classifier) are arbitrary. We shall see their use in the section on grid search, below. We can now train the model with a single command:

In [None]:
text_clf.fit(X_train, y_train)  

In [None]:
pred = text_clf.predict(X_test)

In [None]:
metrics.confusion_matrix(y_test, pred)

In [None]:
print(metrics.classification_report(y_test, pred,
    target_names=["ham","spam"]))

__How does pipeline chain the estimators?__   
By successive 'fit' and 'transform' (except the last estimator that is only fitted)   
From [scikit-learn documentation](http://scikit-learn.org/stable/modules/pipeline.html):
> Calling fit on the pipeline is the same as calling fit on each estimator in turn, transform the input and pass it on to the next step. The pipeline has all the methods that the last estimator in the pipeline has, i.e. if the last estimator is a classifier, the Pipeline can be used as a classifier. If the last estimator is a transformer, again, so is the pipeline.

### Parameter tuning using grid search

Parameters of the estimators in the pipeline can be accessed using the < estimator \> \_\_ < parameter \> syntax:

In [None]:
text_clf.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'nb__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [None]:
gs_clf = gs_clf.fit(X_train, y_train)

The result of calling fit on a GridSearchCV object is a classifier that we can use to predict:

In [None]:
gs_clf.predict(X_test)

In [None]:
gs_clf.best_score_  

In [None]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))


In [None]:
gs_clf.best_estimator_

In [None]:
gs_clf.best_estimator_.get_params()["nb"]

In [None]:
gs_clf.cv_results_

In [None]:
import pandas as pd
df = pd.DataFrame(gs_clf.cv_results_)
print(df)

## Some NLP tools to preprocess text

In [None]:
# download ressources
import nltk

# download all popular ressources
#nltk.download('popular')

# download required ressources for this notebook
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

### Stop words

In [None]:
from nltk.corpus import stopwords
set(stopwords.words('english'))

### Tokenizing words and sentences

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

example_text = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

print("Token sentences:\n",sent_tokenize(example_text))
print("Token words:\n",word_tokenize(example_text))

```
# Unsupervised learning of tokenizer: PunktSentenceTokenizer
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
nltk.download("state_union")
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)
```

### Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

example_words = ["car", "cars", "churches", "going", "gone", "goes", "went", "geese"]
print("words:", example_words)

ps = PorterStemmer()
print("stems:",[ps.stem(w) for w in example_words])

In [None]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [None]:
words = word_tokenize(new_text)
print([ps.stem(w) for w in words])

#### how to add stemming support to CountVectorizer (sklearn)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk.stem

stemmer = nltk.stem.SnowballStemmer('english')
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

vectorizer_s = StemmedCountVectorizer(min_df=3, analyzer="word", stop_words='french')

### Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer
#nltk.download("wordnet")
print("Words:")
print([w for w in example_words])

lemmatizer = WordNetLemmatizer()
print("Lemmatized as noun (default):")
print([lemmatizer.lemmatize(w) for w in example_words])
    
print("Lemmatized as verb:")
print([lemmatizer.lemmatize(w,'v') for w in example_words])

 

### POS (Part of speech) tagging

In [None]:
# We need to tokenize first the text
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
print(text)
# Then we can perform POS
print(nltk.pos_tag(text))





[Penn Treebank POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html):

CC:	coordinating conjunction   
CD:	cardinal digit   
DT:	determiner   
EX:	existential there (like: _"there is"_ ... think of it like _"there exists"_)   
FW:	foreign word   
IN:	preposition/subordinating conjunction   
JJ:	adjective	_"big"_   
JJR:	adjective, comparative _"bigger"_   
JJS:	adjective, superlative _"biggest"_   
LS:	list marker	1)   
MD:	modal	could, will   
NN:	noun, singular _"desk"_   
NNS:	noun plural	_"desks"_   
NNP:	proper noun, singular _"Harrison"_   
NNPS:	proper noun, plural	_"Americans"_   
PDT:	predeterminer _"all the kids"_   
POS:	possessive ending _"parent's"_   
PRP:	personal pronoun	_"I, he, she"_   
PRP\$:	possessive pronoun	_"my, his, hers"_   
RB:	adverb	_"very, silently,"_   
RBR:	adverb, comparative	_"better"_   
RBS:	adverb, superlative	_"best"_   
RP:	particle	_"give up"_   
TO:	to	_"go 'to' the store."_   
UH:	interjection	_"errrrrrrrm"_   
VB:	verb, base form	_"take"_   
VBD:	verb, past tense	_"took"_   
VBG:	verb, gerund/present participle	_"taking"_   
VBN:	verb, past participle	_"taken"_   
VBP:	verb, sing. present, non-3d	_"take"_   
VBZ:	verb, 3rd person sing. present	_"takes"_   
WDT:	wh-determiner	_"which"_   
WP:	wh-pronoun	_"who, what"_   
WP\$:	possessive wh-pronoun	_"whose"_   
WRB:	wh-abverb	_"where, when"_   

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download("state_union")
su_text = state_union.raw("2006-GWBush.txt")
sentence_tokens = sent_tokenize(su_text)

In [None]:
su_text[:500]

In [None]:
sentence_tokens[1:5]

In [None]:
# Look at POS for 5 first sentences of State of the Union address from 2006 from  past President George W. Bush.
try:
    for i in sentence_tokens[1:5]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        print(tagged)
except Exception as e:
    print(str(e))


#### Penn TreeBank tags to WordNet pos

In [None]:
from nltk.corpus import wordnet
print(wordnet._FILEMAP)

In [None]:
# a simple converter 
def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDNET lemmatization (a,n,r,v) 
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN

### Lemmatization with context

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
try:
    for i in sentence_tokens[1:2]:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)        
        lemma_pos_token = [(lemmatizer.lemmatize(w, pos=get_wordnet_pos(pos_tag)),pos_tag) for (w, pos_tag) in tagged] 
        print(lemma_pos_token)
except Exception as e:
    print(str(e))

## Deepenings

- Sequences can be handled by SVM without transformation into fixed-sized vectors features by using the kernel trick and "string kernels"...

- __New trend__: vector representation of words (word2vec, ...)   
Not yet in scikit-learn, but you can look at http://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost for an example using word2vec with scikit-learn...