In [2]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to N consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.

The script saves the trained model to disk for later use
"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# The training data folder must be passed as first argument
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, did you unzip the wikidata.zip folder?")
    exit(-1)




In [3]:
type(dataset)

sklearn.utils.Bunch

In [4]:
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [10]:
dataset.target_names

['ar', 'de', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru']

In [11]:
docs = dataset.data
y = dataset.target

In [12]:
# TASK: Split the dataset in training and test set
# (use 20% of the data for test):

from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=.2, random_state=0)

In [53]:
# TASK: Build a vectorizer that splits
# strings into sequence of 1 to 3
# characters instead of word tokens
# using the class TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(ngram_range=(1, 5), analyzer='char_wb')
#vec = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
#vec = TfidfVectorizer(ngram_range=(1, 20), analyzer='word')

In [59]:
# TASK: Use the function make_pipeline to build a
#       vectorizer / classifier pipeline
#       using the previous analyzer
#       and a classifier of choice.
#       The pipeline instance should be
#       stored in a variable named model

from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

#est = DecisionTreeClassifier()
#est = LogisticRegression()
est = LogisticRegression(C=10)
model = make_pipeline(vec, est)

In [60]:
# TASK: Fit the pipeline on the training set

model.fit(docs_train, y_train)



Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 5), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_interc

In [61]:
# TASK: Fit the pipeline on the training set

model.fit(docs_train, y_train)



Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 5), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_interc

In [62]:
# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted

y_predicted = model.predict(docs_test)

In [63]:
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# TASK: Print the classification report
print(classification_report(y_test, y_predicted))

# TASK: Print the confusion matrix. Bonus points if you make it pretty
cr = classification_report(y_test, y_predicted)
cm = confusion_matrix(y_test, y_predicted)
cmdf = pd.DataFrame(cm, index=dataset.target_names, columns=dataset.target_names)
print(cmdf)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        44
           1       0.99      0.99      0.99       206
           2       0.95      0.99      0.97       223
           3       0.97      0.97      0.97       228
           4       0.98      0.99      0.99       189
           5       1.00      0.98      0.99       208
           6       0.99      0.95      0.97        83
           7       0.97      0.98      0.97       127
           8       0.94      0.95      0.95       195
           9       1.00      0.98      0.99       189

    accuracy                           0.98      1692
   macro avg       0.98      0.98      0.98      1692
weighted avg       0.98      0.98      0.98      1692

    ar   de   en   es   fr   it  nl   pl   pt   ru
ar  43    0    1    0    0    0   0    0    0    0
de   0  203    0    0    0    0   0    3    0    0
en   0    1  221    1    0    0   0    0    0    0
es   0    0    0  221    0    1   0  

In [None]:
vec.fit_transform()

In [None]:
# TASK: Split the dataset in training and test set
# (use 20% of the data for test):
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=.2, random_state=0)

# TASK: Build a an vectorizer that splits
# strings into sequence of 1 to 3
# characters instead of word tokens
# using the class TfidfVectorizer
vec = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')

# TASK: Use the function make_pipeline to build a
#       vectorizer / classifier pipeline
#       using the previous analyzer
#       and a classifier of choice.
#       The pipeline instance should be
#       stored in a variable named model
est = DecisionTreeClassifier()
model = make_pipeline(vec, est)

# TASK: Fit the pipeline on the training set


# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted


# TASK: Print the classification report


# TASK: Print the confusion matrix. Bonus points if you make it pretty.


# TASK: Is the score good? Can you improve it changing
#       the parameters or the classifier?
#       Try using cross validation and grid search

# TASK: Use dill and gzip to persist the trained model in memory.
#       1) gzip.open a file called my_model.dill.gz
#       2) dump to the file both your trained classifier
#          and the target_names of the dataset (for later use)
#    They should be passed as a list [model, dataset.target_names]
