## Google Vertex AI (Auto ML)
This notebook shows the necessary steps to prepare the data for Google Auto ML.
At the end of the notebook, the evaluation is shown.



In [None]:
!pip uninstall -y fhnw-nlp-utils==0.1.3
!pip install git+https://github.com/TheoHelfenberger/fhnw-nlp-utils@fix_plot_feature_importance
!pip install pyarrow fastparquet

from fhnw.nlp.utils.processing import parallelize_dataframe
from fhnw.nlp.utils.processing import is_iterable
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.storage import save_dataframe
from fhnw.nlp.utils.storage import load_dataframe

import pandas as pd
import numpy as np

In [19]:
# Path of the parquet file holding the news articles (train and test rows, with token columns).
file = "data/german_news_articles_original_train_and_test_tokenized.parq"
# Load the file into a DataFrame using the fhnw-nlp-utils storage helper.
data_all = load_dataframe(file)

In [20]:
# Preview the first three rows to check which columns are available.
data_all.head(3)

Unnamed: 0,text_original,label,split,text_clean,token_clean,token_lemma,token_stem,token_clean_stopwords
0,21-Jähriger fällt wohl bis Saisonende aus. Wie...,Sport,train,Jähriger fällt wohl bis Saisonende aus Wien R...,"[jähriger, fällt, wohl, saisonende, wien, rapi...","[jähriger, fällen, wohl, saisonende, wien, rap...","[jahrig, fallt, wohl, saison, wien, rapid, woh...","[jähriger, fällt, wohl, saisonende, wien, rapi..."
1,"Erfundene Bilder zu Filmen, die als verloren g...",Kultur,train,Erfundene Bilder zu Filmen die als verloren ge...,"[erfundene, bilder, filmen, verloren, gelten, ...","[erfundene, bild, film, verlieren, gelten, the...","[erfund, bild, film, verlor, gelt, the, forbid...","[erfundene, bilder, filmen, verloren, gelten, ..."
2,Der frischgekürte CEO Sundar Pichai setzt auf ...,Web,train,Der frischgekürte CEO Sundar Pichai setzt auf ...,"[frischgekürte, ceo, sundar, pichai, setzt, um...","[frischgekürte, ceo, sundar, pichai, setzen, u...","[frischgekurt, ceo, sundar, pichai, setzt, umg...","[frischgekürte, ceo, sundar, pichai, setzt, um..."


In [21]:
# Keep only the cleaned text, the class label and the train/test split marker for the export.
df_forexport = data_all[['text_clean', 'label', 'split']]

In [22]:
# Rename the selected columns to the field names used in the export file.
export_column_names = {
    'text_clean': 'textContent',
    'label': 'classificationAnnotation',
    'split': 'dataItemResourceLabels',
}
df_forexport = df_forexport.rename(columns=export_column_names)

In [24]:
# The intention was to reuse the same split as in all the other examples, but a
# custom train/validation/test split could not be achieved on Vertex AI.
# So the model was computed with an automatic 80:20 stratified split created by Google.
#
# The labels are derived from the untouched `split` column of `data_all` rather than
# from the column that is being overwritten. This keeps the cell idempotent: mapping
# the already-converted column a second time would turn every 'unassigned' into 'test'.
# Rows marked 'train' become 'unassigned'; every other value becomes 'test'.
is_train = data_all['split'] == 'train'
df_forexport['dataItemResourceLabels'] = np.where(is_train, 'unassigned', 'test')

In [25]:
# Write the export file without index or header row, with the columns in the
# order: split label, text, class label.
df_forexport.to_csv(
    'data/for_automl3.csv',
    columns=['dataItemResourceLabels', 'textContent', 'classificationAnnotation'],
    header=False,
    index=False,
    sep=',',
    encoding='UTF-8',
)

In [None]:
# Copy the exported CSV to the gs://thelf-data Cloud Storage bucket.
!gsutil cp data/for_automl3.csv gs://thelf-data

### Vertex Auto ML calculation took about 3h 48 min.




In [None]:
# Download the Auto ML model evaluation as JSON through the Vertex AI REST API.
#
# Reference: https://cloud.google.com/vertex-ai/docs/training/evaluating-automl-models
# The request is authorized with the bearer token printed by
# `gcloud auth application-default print-access-token`. The project id and the
# model id are hard-coded in the URL. The response body is written to
# auto_ml_evaluation.json in the current working directory.
!curl -X GET \
-H "Authorization: Bearer "$(gcloud auth application-default print-access-token) \
"https://us-central1-aiplatform.googleapis.com/v1/projects/theos-234311/locations/us-central1/models/7468098480133636096/evaluations" \
--output  auto_ml_evaluation.json

In [4]:
import json

# Read the downloaded evaluation response and parse it into Python objects.
# The encoding is given explicitly because JSON is UTF-8 encoded, while `open`
# would otherwise fall back to the platform's locale encoding.
with open('auto_ml_evaluation.json', 'r', encoding='utf-8') as myfile:
    data = myfile.read()

# Parse after the file has been closed; `data` keeps the raw text, `obj` the parsed result.
obj = json.loads(data)

In [33]:
# The downloaded evaluation holds one metrics entry per confidence threshold
# (the values behind the curve). All the other models were evaluated at
# confidenceThreshold = 0.5 (50%), so pick the entries with exactly that threshold
# from the first model evaluation in the response.
first_evaluation = obj['modelEvaluations'][0]
metrics = first_evaluation['metrics']['confidenceMetrics']
thres50 = list(filter(lambda entry: entry.get('confidenceThreshold') == 0.5, metrics))
thres50

[{'confidenceThreshold': 0.5,
  'recall': 0.87328094,
  'precision': 0.8775913,
  'f1Score': 0.8754308,
  'recallAt1': 0.87328094,
  'precisionAt1': 0.8775913,
  'f1ScoreAt1': 0.8754308}]

![auto_ml_evaluation.png](auto_ml_evaluation.png)