In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB

import joblib

In [2]:
class VotingClassifier(object):
    """Stripped-down version of VotingClassifier that uses prefit estimators"""
    def __init__(self, estimators, voting='hard', weights=None):
        self.estimators = [e[1] for e in estimators]
        self.named_estimators = dict(estimators)
        self.voting = voting
        self.weights = weights

    def fit(self, X, y, sample_weight=None):
        raise NotImplementedError
        
    def predict(self, X):
        """ Predict class labels for X.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        ----------
        maj : array-like, shape = [n_samples]
            Predicted class labels.
        """

        # check_is_fitted(self, 'estimators')
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)
            maj = np.apply_along_axis(lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions.astype('int'))
        return maj

    def _collect_probas(self, X):
        """Collect results from clf.predict calls. """
        return np.asarray([clf.predict_proba(X) for clf in self.estimators])

    def _predict_proba(self, X):
        """Predict class probabilities for X in 'soft' voting """
        if self.voting == 'hard':
            raise AttributeError("predict_proba is not available when"
                                 " voting=%r" % self.voting)
        #check_is_fitted(self, 'estimators')
        avg = np.average(self._collect_probas(X), axis=0, weights=self.weights)
        return avg

    @property
    def predict_proba(self):
        """Compute probabilities of possible outcomes for samples in X.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        ----------
        avg : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        return self._predict_proba

    def transform(self, X):
        """Return class labels or probabilities for X for each estimator.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        Returns
        -------
        If `voting='soft'`:
          array-like = [n_classifiers, n_samples, n_classes]
            Class probabilities calculated by each classifier.
        If `voting='hard'`:
          array-like = [n_samples, n_classifiers]
            Class labels predicted by each classifier.
        """
        # check_is_fitted(self, 'estimators')
        if self.voting == 'soft':
            return self._collect_probas(X)
        else:
            return self._predict(X)

    def _predict(self, X):
        """Collect results from clf.predict calls. """
        return np.asarray([clf.predict(X) for clf in self.estimators]).T

In [3]:
df = pd.read_csv('spanish_dialects_dataset.csv')
df.head()

Unnamed: 0,Sentences,Dialect
0,al igual que en el yoga artístico se compite i...,AR
1,sufrió un penal de jacquet por agarrón que el ...,AR
2,nosotros le damos la confianza que necesita cu...,AR
3,dt fernando batista hoy estuvo en el banco su ...,AR
4,los jueces son informados si el competidor est...,AR


In [4]:
vectorizer = TfidfVectorizer(max_features=2000, min_df=3, max_df=0.7)
X = vectorizer.fit_transform(df['Sentences'].values).toarray()

labels_map = {'AR':0, 'CO':1, 'CU':2, 'MX':3, 'PE':4, 'SV':5}
Y = df['Dialect'].map(labels_map).tolist()

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True, random_state=555)

In [5]:
# Training RandomForest
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, Y_train)

# Prediction on test set
predictions = text_classifier.predict(X_test)

print('Random Forest Classifier')
print('Confusion Matrix.\n',confusion_matrix(Y_test,predictions))
print('\nClassification Report.\n',classification_report(Y_test,predictions))
print('\nAccuracy Score.\n',accuracy_score(Y_test, predictions))

fname = 'random_forest_classifier.sav'
joblib.dump(text_classifier, fname)

Random Forest Classifier
Confusion Matrix.
 [[303  75  24  56 104  63]
 [ 47 363  47  15  64  65]
 [ 41  75 195  16  66  50]
 [ 94  56  31 155  63  52]
 [ 52  38  18  14 457  35]
 [ 40  89  30  16  44 360]]

Classification Report.
               precision    recall  f1-score   support

           0       0.53      0.48      0.50       625
           1       0.52      0.60      0.56       601
           2       0.57      0.44      0.49       443
           3       0.57      0.34      0.43       451
           4       0.57      0.74      0.65       614
           5       0.58      0.62      0.60       579

    accuracy                           0.55      3313
   macro avg       0.56      0.54      0.54      3313
weighted avg       0.55      0.55      0.55      3313


Accuracy Score.
 0.5532749773619077


['random_forest_classifier.sav']

In [6]:
# Training MultinomialNB
text_classifier = MultinomialNB()
text_classifier.fit(X_train, Y_train)

# Prediction on test set
predictions = text_classifier.predict(X_test)

print('Multinomial NB')
print('Confusion Matrix.\n',confusion_matrix(Y_test,predictions))
print('\nClassification Report.\n',classification_report(Y_test,predictions))
print('\nAccuracy Score.\n',accuracy_score(Y_test, predictions))

fname = 'MultinomialNB_classifier.sav'
joblib.dump(text_classifier, fname)

Multinomial NB
Confusion Matrix.
 [[338  53  31  26 105  72]
 [ 44 386  50  18  62  41]
 [ 33  82 214  15  65  34]
 [ 84  63  29 160  78  37]
 [ 48  27  26   6 484  23]
 [ 41  72  33  13  39 381]]

Classification Report.
               precision    recall  f1-score   support

           0       0.57      0.54      0.56       625
           1       0.57      0.64      0.60       601
           2       0.56      0.48      0.52       443
           3       0.67      0.35      0.46       451
           4       0.58      0.79      0.67       614
           5       0.65      0.66      0.65       579

    accuracy                           0.59      3313
   macro avg       0.60      0.58      0.58      3313
weighted avg       0.60      0.59      0.58      3313


Accuracy Score.
 0.5925143374584968


['MultinomialNB_classifier.sav']

In [7]:
# Training MultinomialNB
text_classifier = GaussianNB()
text_classifier.fit(X_train, Y_train)

# Prediction on test set
predictions = text_classifier.predict(X_test)

print('Gaussian NB')
print('Confusion Matrix.\n',confusion_matrix(Y_test,predictions))
print('\nClassification Report.\n',classification_report(Y_test,predictions))
print('\nAccuracy Score.\n',accuracy_score(Y_test, predictions))

fname = 'GaussianNB_classifier.sav'
joblib.dump(text_classifier, fname)

Gaussian NB
Confusion Matrix.
 [[177  26 171  34 143  74]
 [ 30 227 179  31  68  66]
 [ 13  11 329  20  41  29]
 [ 34  20 132 152  81  32]
 [ 19  13  97   7 457  21]
 [ 23  35 123  17  47 334]]

Classification Report.
               precision    recall  f1-score   support

           0       0.60      0.28      0.38       625
           1       0.68      0.38      0.49       601
           2       0.32      0.74      0.45       443
           3       0.58      0.34      0.43       451
           4       0.55      0.74      0.63       614
           5       0.60      0.58      0.59       579

    accuracy                           0.51      3313
   macro avg       0.55      0.51      0.49      3313
weighted avg       0.56      0.51      0.50      3313


Accuracy Score.
 0.5058859040144884


['GaussianNB_classifier.sav']

In [8]:
# Training SVM
text_classifier = SVC(probability=True)
text_classifier.fit(X_train, Y_train)

# Prediction on test set
predictions = text_classifier.predict(X_test)

print('SVM')
print('Confusion Matrix.\n',confusion_matrix(Y_test,predictions))
print('\nClassification Report.\n',classification_report(Y_test,predictions))
print('\nAccuracy Score.\n',accuracy_score(Y_test, predictions))

fname = 'svm_classifier.sav'
joblib.dump(text_classifier, fname)

SVM
Confusion Matrix.
 [[394  51  30  41  49  60]
 [ 48 411  36  19  40  47]
 [ 44  76 232  15  43  33]
 [100  60  24 208  36  23]
 [ 58  33  19  11 475  18]
 [ 56  79  31  12  32 369]]

Classification Report.
               precision    recall  f1-score   support

           0       0.56      0.63      0.59       625
           1       0.58      0.68      0.63       601
           2       0.62      0.52      0.57       443
           3       0.68      0.46      0.55       451
           4       0.70      0.77      0.74       614
           5       0.67      0.64      0.65       579

    accuracy                           0.63      3313
   macro avg       0.64      0.62      0.62      3313
weighted avg       0.63      0.63      0.63      3313


Accuracy Score.
 0.6305463326290371


['svm_classifier.sav']

In [12]:
# Voting Classifier
svm = joblib.load('svm_classifier.sav')
multinomialNB = joblib.load('MultinomialNB_classifier.sav')
gaussianNB = joblib.load('GaussianNB_classifier.sav')
rfc = joblib.load('random_forest_classifier.sav')


text_classifier = VotingClassifier(estimators=[('svm', svm), ('mNB', multinomialNB), ('gNB', gaussianNB), ('rfc', rfc)], voting='hard')
print('Created Voting Classifier')
# Prediction on test set
predictions = text_classifier.predict(X_test)

print('Voting Classifier')
print('Confusion Matrix.\n',confusion_matrix(Y_test,predictions))
print('\nClassification Report.\n',classification_report(Y_test,predictions))
print('\nAccuracy Score.\n',accuracy_score(Y_test, predictions))

fname = 'voting_classifier.sav'
joblib.dump(text_classifier, fname)

Created Voting Classifier
Voting Classifier
Confusion Matrix.
 [[385  49  33  33  71  54]
 [ 42 400  52  16  50  41]
 [ 37  74 241  12  52  27]
 [ 92  61  35 177  57  29]
 [ 48  24  27   8 487  20]
 [ 50  81  35   9  37 367]]

Classification Report.
               precision    recall  f1-score   support

           0       0.59      0.62      0.60       625
           1       0.58      0.67      0.62       601
           2       0.57      0.54      0.56       443
           3       0.69      0.39      0.50       451
           4       0.65      0.79      0.71       614
           5       0.68      0.63      0.66       579

    accuracy                           0.62      3313
   macro avg       0.63      0.61      0.61      3313
weighted avg       0.63      0.62      0.62      3313


Accuracy Score.
 0.6208874132206459


['voting_classifier.sav']

In [13]:
import sklearn
sklearn.__version__

'0.22.2.post1'

In [14]:
!zip saved_models *.sav

  adding: GaussianNB_classifier.sav (deflated 15%)
  adding: MultinomialNB_classifier.sav (deflated 16%)
  adding: random_forest_classifier.sav (deflated 86%)
  adding: svm_classifier.sav (deflated 99%)
  adding: voting_classifier.sav (deflated 92%)
