# Text Classification
 - [Introductory tutorial](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a)
 - [Summary of traditional to modern approaches](https://towardsdatascience.com/beyond-word-embeddings-part-2-word-vectors-nlp-modeling-from-bow-to-bert-4ebd4711d0ec)

### Multi-Label Classification
 - [Sklearn package](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html)
 - [Kaggle examples of common methods](https://www.kaggle.com/reiinakano/basic-nlp-bag-of-words-tf-idf-word2vec-lstm)

### Text-Classification using word2vec
 - [word2vec tutorial](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/)<br>
 - [deep learning tutorial](https://datawarrior.wordpress.com/2016/10/12/short-text-categorization-using-deep-neural-networks-and-word-embedding-models/)

In [None]:
import os
import sys
import re
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Working directory at notebook start-up (not referenced by the other cells shown here).
path = os.getcwd()

# Data Sources
 - [NLTK Reuters data](https://miguelmalvarez.com/2015/03/20/classifying-reuters-21578-collection-with-python-representing-the-data/)


In [None]:
## Download Reuters Data
## https://archive.ics.uci.edu/ml/datasets/Reuters-21578+Text+Categorization+Collection
# !curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
# !tar xzvf reuters21578.tar.gz

 - [20 Newsgroups data](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html)

In [None]:
# Load the predefined train / test splits of the 20 Newsgroups corpus.
# shuffle=False keeps the documents in the order the loader returns them.
from sklearn.datasets import fetch_20newsgroups

twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=False)
    for split in ('train', 'test')
)

# Show which fields the returned bunch exposes.
twenty_train.keys()

In [None]:
# Map each integer target id to its newsgroup name.
names = dict(enumerate(twenty_train['target_names']))

In [None]:
# Sanity check: number of training documents and number of training labels.
tuple(len(twenty_train[field]) for field in ('data', 'target'))

In [None]:
def printArticle(n):
    """Print the category name and the raw text of the n-th training article.

    Reads the notebook-level ``twenty_train`` bunch and the ``names``
    mapping (target id -> category name); neither is modified.
    """
    label_id = twenty_train['target'][n]
    print('{}\n{}'.format(names[label_id], twenty_train['data'][n]))


n = 0
printArticle(n)

In [None]:
def clean_text(docs):
    """Return the documents with punctuation removed, lowercased and trimmed.

    The original cell was a dangling ``.replace(...)`` expression with no
    object in front of it, which is a syntax error. The same chain is now
    applied to a pandas Series built from ``docs``.

    Parameters
    ----------
    docs : iterable of str
        Raw documents, e.g. ``twenty_train['data']``.

    Returns
    -------
    pandas.Series
        One cleaned string per input document, in the same order.
    """
    return (pd.Series(list(docs), dtype=object)
            # Each run of characters other than letters, digits and spaces
            # becomes a single space (existing spaces are left untouched).
            .replace('[^a-zA-Z0-9 ]+', ' ', regex=True)
            .str.lower()
            .str.strip())

# Classification
[Classification reports](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# def sample(data):
#     X_train, X_test,\
#     y_train, y_test = train_test_split(data['data'],
#                                        data['target'],
#                                        test_size=0.5,
#                                        stratify=data['target'])
#     return y_train

In [None]:
# Stand-alone bag-of-words vectorizer: accent stripping, lowercasing and
# English stop-word removal.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
CV = CountVectorizer(
    lowercase=True,
    stop_words='english',
    strip_accents='unicode',
)

# Stand-alone TF-IDF re-weighting step with default settings.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
TF = TfidfTransformer()

# Naive Bayes

In [None]:
# model param
NB_params = {'vect__ngram_range':[(1,1),(1,2)],
             'vect__analyzer':['word','char'],
             'tfidf__use_idf':('True'),
             'model__alpha':(1e-1, 1e-3)}

NB_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', MultinomialNB())])

gs_NB = GridSearchCV(estimator=NB_pipe,
                          param_grid=NB_params,
                          n_jobs=-1,
                          cv=4)

%time gs_NB = gs_NB.fit(twenty_train['data'][:5000], twenty_train['target'][:5000])

In [None]:
# Best cross-validation score and the parameters that achieved it.
print(gs_NB.best_score_, gs_NB.best_params_, '\n')

# Score the refitted best pipeline on the held-out test split.
NBcv_predict = gs_NB.predict(twenty_test['data'])
_nb_hits = NBcv_predict == twenty_test['target']
print('NB-CV accuracy: %.3f\n' % np.mean(_nb_hits))
print(metrics.classification_report(twenty_test['target'], NBcv_predict))

# SGD for SVM & Logit

In [None]:
# model param
SVM_params = {'vect__ngram_range':[(1,1),(1,2)],
              'vect__analyzer':['word','char'],
              'tfidf__use_idf':('True'),
              'model__loss':('hinge','log'),
              'model__alpha':(1e-2, 1e-3),
              'model__penalty': ('l2', 'elasticnet'),
              'model__max_iter': (100),
              'model__tol': (0.21, 1e-3)}

SVM_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model',SGDClassifier()),])

gs_SVM = GridSearchCV(SVM_pipe,
                      param_grid=SVM_params,
                      n_jobs=-1,
                      cv=4)

%time gs_SVM = gs_SVM.fit(twenty_train['data'][:5000], twenty_train['target'][:5000])

In [None]:
# Best cross-validation score and the parameters that achieved it.
print(gs_SVM.best_score_, gs_SVM.best_params_)

# Score the refitted best pipeline on the held-out test split.
SVMcv_predict = gs_SVM.predict(twenty_test['data'])
_svm_hits = SVMcv_predict == twenty_test['target']

print('SVM-CV accuracy: %.3f' % np.mean(_svm_hits))
print(metrics.classification_report(twenty_test['target'], SVMcv_predict))

In [None]:
# Confusion matrix of the SGD model's test predictions (true labels first).
# NOTE(review): scikitplot.plotters looks like the legacy entry point; newer
# scikit-plot releases presumably expose this as
# scikitplot.metrics.plot_confusion_matrix - confirm against the installed version.
import scikitplot.plotters as skplt
skplt.plot_confusion_matrix(twenty_test['target'], SVMcv_predict)

# Logit

In [None]:
# LOG_params = {'vect__ngram_range':[(1,1),(1,2)],
#               'tfidf__use_idf':('True','False'),
#               'model__alpha':(1e-1, 1e-3)}

# LOG_pipe = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('model',SGDClassifier(loss='log', penalty='l2',alpha=1e-3, max_iter=5, random_state=42)),])

# gs_LOG = GridSearchCV(LOG_pipe, param_grid=LOG_params, n_jobs=-1)
# gs_LOG = gs_LOG.fit(twenty_train['data'][:5000], twenty_train['target'][:5000])

In [None]:
# print(gs_LOG.best_score_, gs_LOG.best_params_)
# LOGcv_predict = gs_LOG.predict(twenty_test['data'])
# print('LOG-CV accuracy: %.3f' %np.mean(LOGcv_predict, twenty_test['target']))
# print(metrics.classification_report(twenty_test['target'], SVMcv_predict))

# ANN

In [None]:
from sklearn.neural_network import MLPClassifier
ANN_params = {'vect__ngram_range':[(1,1),(1,2)],
              'vect__analyzer':['word','char'],
              'tfidf__use_idf':['True'],
              'model__hidden_layer_sizes':[(5,5),(9,5,9)],
              'model__alpha':(1e-1, 1e-3),
              'model__max_iter': [200],
              'model__tol': (1e-1, 1e-3)}

ANN_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model',MLPClassifier()),])

gs_ANN = GridSearchCV(ANN_pipe,
                      param_grid=ANN_params,
                      n_jobs=-1,
                      cv=4)

%time gs_ANN = gs_ANN.fit(twenty_train['data'][:5000], twenty_train['target'][:5000])

In [None]:
# Best cross-validation score and the parameters that achieved it.
print(gs_ANN.best_score_, gs_ANN.best_params_)

# Score the refitted best pipeline on the held-out test split.
ANNcv_predict = gs_ANN.predict(twenty_test['data'])

# Accuracy is the share of matching labels. The original passed the targets as
# np.mean's second positional argument (axis) instead of comparing with them.
print('ANN-CV accuracy: %.3f' % np.mean(ANNcv_predict == twenty_test['target']))

# Report the ANN predictions against the test labels. The original referenced
# the undefined y_test and the SVM predictions.
print(metrics.classification_report(twenty_test['target'], ANNcv_predict))

In [None]:
# http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

# Save Trained Models

In [None]:
2*2

In [None]:
gs_ANN

In [None]:
# joblib is a stand-alone package; the copy vendored as sklearn.externals.joblib
# has been removed from scikit-learn. Fall back to it only when the stand-alone
# package is missing (old environments).
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Target directory for the pickled grid-search objects; create it up front so
# joblib.dump does not fail on a fresh checkout.
model = r'trainedModels'
os.makedirs(model, exist_ok=True)

joblib.dump(gs_NB, os.path.join(model, 'gs_NB_1.0.pkl'))
# joblib.dump(gs_LOG, os.path.join(model, 'gs_LOG_1.0.pkl'))  # enable once the Logit cells are run
joblib.dump(gs_SVM, os.path.join(model, 'gs_SVM_1.0.pkl'))
joblib.dump(gs_ANN, os.path.join(model, 'gs_ANN_1.0.pkl'))

# Reload later with (pickle-based: only load files you trust):
# trained_model = joblib.load('filename.pkl')

# Results

In [None]:
# Side-by-side table of test documents, true labels and each model's prediction.
# The original referenced X_test / y_test, which no active cell defines; the
# test split actually used for prediction is twenty_test.
predictions = {'NB': NBcv_predict,
               'SVM': SVMcv_predict}

# The Logit cells are commented out above, so LOGcv_predict only exists when
# they have been re-enabled and run.
if 'LOGcv_predict' in globals():
    predictions['LOG'] = LOGcv_predict

predictions['ANN'] = ANNcv_predict

# Dict insertion order gives the column order: doc, target, NB, SVM, [LOG], ANN.
results = pd.DataFrame({'doc': twenty_test['data'],
                        'target': twenty_test['target'],
                        **predictions})
results.head()

# Score

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

In [None]:
# Distinct class ids present in the test labels (sorted by np.unique).
# The original called .unique() on y_test, which no active cell defines.
classes = list(np.unique(twenty_test['target']))
classes

In [None]:
# One-vs-rest indicator matrix of the true test labels, one column per entry
# of `classes`. The original used y_test, which no active cell defines.
Test = label_binarize(twenty_test['target'], classes=classes)
Test.shape

In [None]:
# One-vs-rest indicator matrix of the SGD model's predicted labels, using the
# same `classes` ordering as the true-label matrix.
SVM = label_binarize(SVMcv_predict, classes=classes)
SVM.shape

In [None]:
# Per-class ROC curves for the SGD model, one-vs-rest.
# NOTE(review): SVM holds binarized hard predictions rather than scores, so each
# curve presumably has a single operating point - consider decision scores.
fig, axs = plt.subplots(1, 1, figsize=(7, 7))
fpr = dict()
tpr = dict()
roc_auc = dict()

for i, c in enumerate(classes):
    # Column i of both indicator matrices corresponds to classes[i].
    fpr[i], tpr[i], _ = roc_curve(Test[:, i], SVM[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    # Class ids may be integers, which have no .upper(); show the newsgroup
    # name when `names` has one, otherwise the id itself.
    label = str(names.get(c, c)).upper()
    axs.plot(fpr[i], tpr[i], label='%s AUC:%.2f' % (label, roc_auc[i]))

# Legend and axis limits only need to be set once, after all curves are drawn.
axs.legend()
axs.set_xlim(0, 1)
axs.set_ylim(0, 1)