## Language AutoML

In [None]:
!pip3 install auto-sklearn
!pip install scikit-learn==0.24.0
!python -m pip install dask distributed --upgrade
!pip install stringify

### Pre-Processing

In [None]:
!wget https://github.com/Tiagoblima/indigenous-corpus/raw/main/lang_cls_large.csv

--2021-04-18 23:07:24--  https://github.com/Tiagoblima/indigenous-corpus/raw/main/lang_cls_large.csv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Tiagoblima/indigenous-corpus/main/lang_cls_large.csv [following]
--2021-04-18 23:07:24--  https://raw.githubusercontent.com/Tiagoblima/indigenous-corpus/main/lang_cls_large.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18838202 (18M) [text/plain]
Saving to: ‘lang_cls_large.csv.3’


2021-04-18 23:07:25 (39.8 MB/s) - ‘lang_cls_large.csv.3’ saved [18838202/18838202]



In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [None]:
# Strips accents (combining marks) from a unicode string
def unicode_to_ascii(s):
    """Return ``s`` with all combining marks removed.

    The string is decomposed with Unicode NFD normalization, then every
    character whose Unicode category is ``Mn`` (nonspacing mark) is dropped.
    Characters that have no decomposition are kept unchanged, so the result
    is not guaranteed to be pure ASCII.

    Args:
        s: The text to process.

    Returns:
        The text with accents/diacritics stripped.
    """
    # Imported here because the notebook's import cell does not bring
    # `unicodedata` into scope; without it this function raises NameError.
    import unicodedata

    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


def preprocess_sentence(w):
    """Normalize a sentence for language classification.

    Leading/trailing whitespace is trimmed and the text is lower-cased. Every
    occurrence of the characters ``? . ! , ¿ # @`` and of the digits 0-9 is
    then deleted (they are removed outright, not padded with spaces). All
    other characters, including inner whitespace, are left as they are.

    Args:
        w: The raw sentence.

    Returns:
        The cleaned sentence.
    """
    normalized = w.strip().lower()
    return re.sub(r"([?.!,¿#@0-9])", r"", normalized)

In [None]:
import pandas as pd

# Load the corpus and discard rows with missing values. reset_index() keeps the
# previous row labels in an "index" column.
lang_df = pd.read_csv('lang_cls_large.csv')
lang_df = lang_df.dropna().reset_index()

# Work on a 10% random subset. The fixed seed makes the subset, and every
# result derived from it, identical across runs; .copy() makes the subset an
# independent frame before a column is reassigned below.
samples = lang_df.sample(frac=0.1, random_state=42).copy()

samples['TEXT'] = samples['TEXT'].apply(preprocess_sentence)
#samples = pd.read_csv('samples.csv')


# Sorted array of the distinct language labels present in the subset.
classes = np.unique(samples['LANG'].tolist())
classes

array(['Achuar', 'Afrikaans', 'Aguaruna', 'Akawaio', 'Albanian',
       'Amharic', 'Amuzgo', 'Apalaí', 'Apinayé', 'Apurinã', 'Arabic',
       'Armenian', 'Aukan', 'Bakairi', 'Barasana', 'Basque', 'Bulgarian',
       'Cabecar', 'Cakchiquel', 'Campa', 'Camsa', 'Cebuano', 'Chamorro',
       'Cherokee', 'Chinantec', 'Chinese', 'Coptic', 'Creole', 'Croatian',
       'Czech', 'Danish', 'Dinka', 'English', 'Esperanto', 'Estonian',
       'Ewe', 'Farsi', 'Finnish', 'French', 'Gaelic', 'Galela', 'German',
       'Greek', 'Guajajara', 'Guarani', 'Gujarati', 'Hebrew', 'Hindi',
       'Hungarian', 'Icelandic', 'Indonesian', 'Italian', 'Jakalteko',
       'Japanese', "K'iche'", 'Kabyle', 'Kadiwéu', 'Kagwahiva', 'Kaigáng',
       'Kaiwá', 'Kannada', 'Karajá', 'Kayabí', 'Korean', 'Latin',
       'Latvian', 'Lithuanian', 'Lukpa', 'Macushi', 'Malagasy',
       'Malayalam', 'Mam', 'Manx', 'Maori', 'Marathi', 'Maxakalí',
       'Mundurukú', 'Myanmar', 'Nadëb', 'Nahuatl', 'Nambikuára', 'Nepali',
       'N

## AutoML Training

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import autosklearn.classification

X, y = samples['TEXT'].to_numpy(), samples['LANG'].to_numpy()

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting the vectorizer
# on all of X leaks test-set statistics into the features. The fixed seed
# makes the split reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, random_state=42)

# Character 2-gram and 3-gram TF-IDF features.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Kept so the name defined by the original cell still exists; it is now
# produced by the train-fitted vectorizer rather than one fitted on all rows.
X_transformed = tfidf.transform(X)


  self.re = re.compile(self.reString)


In [None]:
import autosklearn.classification

# Restrict the AutoML search to these three estimator names.
candidate_estimators = ["multinomial_nb", "liblinear_svc", "sgd"]
automl = autosklearn.classification.AutoSklearnClassifier(
    include_estimators=candidate_estimators,
    exclude_estimators=None,
)

# Run the search on the training split, then predict labels for the test split.
automl.fit(X_train, y_train)
predictions = automl.predict(X_test)



In [None]:
# Print the per-configuration search results, then the run statistics summary,
# each on its own line.
print(automl.cv_results_, automl.sprint_statistics(), sep="\n")
# Last expression of the cell: the notebook displays what show_models() returns.
automl.show_models()

{'mean_test_score': array([0.        , 0.        , 0.97630332, 0.        , 0.95450237,
       0.        , 0.        , 0.96919431, 0.04123223, 0.        ,
       0.        , 0.        , 0.        , 0.00900474, 0.        ,
       0.        , 0.90189573, 0.        ]), 'mean_fit_time': array([360.13551998, 360.1222775 ,  39.4647274 , 360.03834033,
        66.66854978, 290.79554462,  64.4710505 , 158.56999373,
        16.61797523, 276.57715416, 360.03500986,  36.04357743,
       268.19521356,  26.3282187 , 360.05038667, 360.12069416,
        27.66403317,  84.03683877]), 'params': [{'balancing:strategy': 'none', 'classifier:__choice__': 'liblinear_svc', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standard

"[(0.460000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'sgd', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'nystroem_sampler', 'classifier:sgd:alpha': 0.043782453619910996, 'classifier:sgd:average': 'True', 'classifier:sgd:fit_intercept': 'True', 'classifier:sgd:learning_rate': 'constant', 'classifier:sgd:loss': 'perceptron', 'classifier:sgd:penalty': 'l2', 'classifier:sgd:tol': 0.003506214936809261, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.010949712691945572, 'feature_preprocessor:nystroem_sampler:kernel': 'cosine', 'feature_preprocessor

In [None]:
# Pull just the 'mean_test_score' array out of the search results and print it.
mean_test_scores = automl.cv_results_['mean_test_score']
print(mean_test_scores)

[0.         0.         0.97630332 0.         0.95450237 0.
 0.         0.96919431 0.04123223 0.         0.         0.
 0.         0.00900474 0.         0.         0.90189573 0.        ]


In [None]:
from sklearn.metrics import classification_report

# classification_report matches target_names to labels by position. The
# previous `list(set(y_test))` had arbitrary order, so class names were
# attached to the wrong rows. Build the label list explicitly (sorted, and
# including any label that only appears in the predictions) and pass it as
# `labels` so rows and names are guaranteed to line up.
report_labels = sorted(set(y_test) | set(predictions))
target_names = [str(label) for label in report_labels]
print(classification_report(y_test, predictions,
                            labels=report_labels, target_names=target_names))

              precision    recall  f1-score   support

    Aguaruna       0.92      1.00      0.96        22
       Dinka       1.00      1.00      1.00        10
  Indonesian       1.00      1.00      1.00        19
     Cabecar       1.00      1.00      1.00        10
   Uspanteco       1.00      1.00      1.00        13
  Cakchiquel       1.00      1.00      1.00        19
  Potawatomi       1.00      1.00      1.00        17
    Wolaytta       1.00      1.00      1.00        17
   Mundurukú       1.00      1.00      1.00        11
       Maori       1.00      1.00      1.00        22
     Cebuano       1.00      1.00      1.00        19
     Latvian       1.00      0.94      0.97        18
        Manx       1.00      1.00      1.00        12
     Myanmar       1.00      1.00      1.00        20
       Paite       1.00      1.00      1.00        11
   Kagwahiva       1.00      1.00      1.00        14
   Rikbaktsa       1.00      1.00      1.00        15
    Estonian       1.00    

In [None]:
import pandas as pd

# target_names is intentionally not passed: without it the report rows are
# named after the class labels themselves, so names cannot be misaligned.
report = classification_report(y_test, predictions, output_dict=True)

# The report dict mixes per-class dicts with a scalar 'accuracy' entry.
# DataFrame.from_dict(..., orient='index') expects every value to be a dict,
# which is the likely cause of the AttributeError recorded for this cell.
# Building the frame column-wise broadcasts the scalar; transposing then gives
# one row per class/average.
report_df = pd.DataFrame(report).transpose()
report_df.to_csv('report.csv')

AttributeError: ignored

In [None]:
from google.colab import files

# Send the saved report file to the browser as a download (Colab helper).
report_filename = 'report.csv'
files.download(report_filename)

FileNotFoundError: ignored