In [None]:
# NOTE: be sure to make a new kernel using the ip

# in the virtualenv
# pip install --user ipykernel
# python -m ipykernel install --user --name=myenv 
# this will create a kernel that uses the virtualenv that you created + will name the kernel myenv

# downloads farm if not available
!pip install farm==0.4.3



In [1]:
# imports
from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.optimization import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import MultiLabelTextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
import logging
import pandas as pd

05/07/2021 08:03:29 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.


In [2]:
#############################
#Initial Experiment Settings#
#############################

# fix the random seeds so that repeated runs are comparable
set_all_seeds(seed=42)
# ask for a CUDA device; the log recorded for this cell shows which device and n_gpu were actually selected
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 2 # handed to initialize_optimizer (n_epochs) and to the Trainer (epochs)
batch_size = 8 # handed to the DataSilo
evaluate_every = 100 # handed to the Trainer; evaluation on the dev set runs every this many batches

05/07/2021 08:03:35 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None


In [11]:
# the name of the transformer to be used/downloaded from huggingface
lang_model = "bert-base-german-cased" 
do_lower_case = False # determines whether the tokenizer should use the .lower() method; off because the chosen checkpoint is a cased model

# load the tokenizer that belongs to the chosen checkpoint
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

05/07/2021 08:05:10 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
05/07/2021 08:05:10 - INFO - transformers.tokenization_utils -   loading file https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt from cache at /Users/mhdredwanalkurdi/.cache/torch/transformers/da299cdd121a3d71e1626f2908dda0d02658f42e925a3d6abd8273ec08cf41a6.31ccc255fc2bad3578089a3997f16b286498ba78c0adc43b5bb2a3f9a0d2c85c


In [12]:
# the nine genre labels that occur in our data set
label_list = [
    'Sport', 'Kultur', 'Web', 'Wirtschaft', 'Inland',
    "Etat", "International", "Panorama", "Wissenschaft",
]
metric = "acc" # desired metric for evaluation

# the processor describes how the CSV files are read and turned into model input
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=512, # BERT can only handle sequence lengths of up to 512
    # --- where the data lives ---
    data_dir='Train_Test_Data',
    train_filename="Train_10K_final.csv",
    test_filename="Test_10K_final.csv",
    dev_filename=None, # no separate dev file ...
    dev_split=0.1, # ... so 10% of the train set is extracted to create a dev set
    # --- CSV layout ---
    delimiter=';',
    quote_char='"',
    text_column_name='text',
    label_column_name="genre", # our labels are located in the "genre" column
    # --- task definition ---
    label_list=label_list,
    metric=metric,
    # multi-label setup, matching the MultiLabelTextClassificationHead used for the model
    # NOTE(review): multi-label means a sample may carry several labels at once, which is not the
    # same as multiclass; the articles appear to have one genre each, so confirm this is intended
    multilabel=True,
)

In [13]:
# helper class that loads the train/dev/test data through the processor and converts them to PyTorch datasets
# batch_size is the value set in the experiment settings cell
data_silo = DataSilo(processor=processor,batch_size=batch_size)

05/07/2021 08:05:37 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
05/07/2021 08:05:37 - INFO - farm.data_handler.data_silo -   Loading train set from: Train_Test_Data/Train_10K_final.csv 
05/07/2021 08:05:38 - INFO - farm.data_handler.data_silo -   Got ya 11 parallel workers to convert 9245 dictionaries to pytorch datasets (chunksize = 169)...
05/07/2021 08:05:38 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0    0    0    0    0 
05/07/2021 08:05:38 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\  /w\  /w\  /w\  /w\  /|\  /w\  /w\  /w\
05/07/2021 08:05:38 - INFO - farm.data_handler.data_silo -   /'\  / \  /'\  /'\  / \  / \  /'\  /'\  /'\  /'\  /'\
05/07/2021 08:05:38 - INFO - farm.data_handler.data_silo -                       
Preprocessing Dataset Train_Test_Data/Train_10K_fin

05/07/2021 08:05:44 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-108-0
Clear Text: 
 	text: Fünf Schüler der Islamwissenschafterin Lamya Kaddor sind in den Jihad gezogen. STANDARD: Fünf Ihrer ehemaligen Schüler, die Sie seit 2003 im Rahmen des Schulversuchs Islamkunde in deutscher Sprache in einer Hauptschule im Stadtteil Dinslaken-Lohberg unterrichtet haben, sind als heilige Krieger in den Jihad nach Syrien gezogen. Was hat das für Sie bedeutet? Kaddor: Das war für mich ein Schock, weil man ja so gar nicht damit rechnet. Allein die

Preprocessing Dataset Train_Test_Data/Train_10K_final.csv: 100%|██████████| 9245/9245 [00:34<00:00, 268.90 Dicts/s]
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   Loading dev set as a slice of train set
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   Took 964 samples out of train set to create dev set (dev split is roughly 0.1)
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   Loading test set from: Train_Test_Data/Test_10K_final.csv
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   Got ya 11 parallel workers to convert 1028 dictionaries to pytorch datasets (chunksize = 19)...
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -    0    0    0    0    0    0    0    0    0    0    0 
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   /w\  /w\  /w\  /|\  /w\  /|\  /|\  /|\  /|\  /w\  /w\
05/07/2021 08:06:12 - INFO - farm.data_handler.data_silo -   /'\  / \  /'\  /'\  / \  /'\  /'\  /'\  /'\  / \  / \
05/07/2021 08:06:12 - 

05/07/2021 08:06:13 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-3-0
Clear Text: 
 	text: 23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden – Chile und Bolivien im Viertelfinale, Mexiko out. Santiago de Chile – Wegen seiner Provokation nach dem Spiel gegen Kolumbien hat der südamerikanische Fußballverband (CONMEBOL) Brasiliens Top-Stürmer Neymar für vier Spiele gesperrt. Zudem verhängte der Disziplinarausschuss eine Geldstrafe in Höhe von 10.000 US-Dollar (8.850,34 Euro) gegen den 23-Jähr

Preprocessing Dataset Train_Test_Data/Test_10K_final.csv: 100%|██████████| 1028/1028 [00:04<00:00, 238.33 Dicts/s]
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Examples in train: 8281
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Examples in dev  : 964
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Examples in test : 1028
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Longest sequence length observed after clipping:     512
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Average sequence length after clipping: 388.61635068228475
05/07/2021 08:06:17 - INFO - farm.data_handler.data_silo -   Proportion clipped:      0.429658253834078


In [14]:
# loading the pretrained BERT base cased model (same checkpoint name as used for the tokenizer)
language_model = LanguageModel.load(lang_model)
# prediction head for our model that is suited for classifying news article genres
# one output per entry of label_list
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

# building the model: language model + a single prediction head on top
model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        # one entry per prediction head; "per_sequence" presumably selects the document-level output
        # of the language model (as opposed to per-token output) — TODO confirm against the FARM docs
        lm_output_types=["per_sequence"],
        device=device)

05/07/2021 08:06:18 - INFO - transformers.modeling_utils -   loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin from cache at /Users/mhdredwanalkurdi/.cache/torch/transformers/e32f648561b03f77a129832928b7f16decdc5e0870f1e6558857e046169d4133.4e5eda3a0f09b32a0b7d1a9185034da1b3506d5c5b0c6880a7ca0122ab5eef2e
05/07/2021 08:06:20 - INFO - farm.modeling.language_model -   Automatically detected language from language model name: german
05/07/2021 08:06:20 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 9]


In [15]:
# initializing the optimizer -> with weight decay 0.01 and a LR of 3e-5
# (the weight decay is not set here; 0.01 is the value reported in the optimizer log of this cell)
# n_batches and n_epochs tell the learning-rate schedule how many training steps there will be in total
model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

05/07/2021 08:06:20 - INFO - farm.modeling.optimization -   Loading optimizer `TransformersAdamW`: '{'correct_bias': False, 'weight_decay': 0.01, 'lr': 3e-05}'
05/07/2021 08:06:20 - INFO - farm.modeling.optimization -   Using scheduler 'get_linear_schedule_with_warmup'
05/07/2021 08:06:20 - INFO - farm.modeling.optimization -   Loading schedule `get_linear_schedule_with_warmup`: '{'num_warmup_steps': 207.20000000000002, 'num_training_steps': 2072}'


In [16]:
# a helper class that takes care of training the model and evaluating it every `evaluate_every` batches using the dev/holdout set
# it also prints out a nice summary of what is happening during training
# model, optimizer and lr_schedule are the objects returned by initialize_optimizer
trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

In [None]:
# the actual training
# P.S.: GPU required.
# NOTE(review): the device log recorded for the settings cell shows "device: cpu n_gpu: 0" —
# make sure a GPU runtime is active before running this cell
trainer.train()

05/06/2021 17:04:18 - INFO - farm.train -   
 

          &&& &&  & &&             _____                   _             
      && &\/&\|& ()|/ @, &&       / ____|                 (_)            
      &\/(/&/&||/& /_/)_&/_&     | |  __ _ __ _____      ___ _ __   __ _ 
   &() &\/&|()|/&\/ '%" & ()     | | |_ | '__/ _ \ \ /\ / / | '_ \ / _` |
  &_\_&&_\ |& |&&/&__%_/_& &&    | |__| | | | (_) \ V  V /| | | | | (_| |
&&   && & &| &| /& & % ()& /&&    \_____|_|  \___/ \_/\_/ |_|_| |_|\__, |
 ()&_---()&\&\|&&-&&--%---()~                                       __/ |
     &&     \|||                                                   |___/
             |||
             |||
             |||
       , -=-~  .-^- _
              `


  0%|          | 0/925 [00:00<?, ?it/s][A
Train epoch 0/2 (Cur. train loss: 0.0000):   0%|          | 0/925 [00:00<?, ?it/s][A
Train epoch 0/2 (Cur. train loss: 0.0000):   0%|          | 1/925 [00:01<24:38,  1.60s/it][A
Train epoch 0/2 (Cur. train loss: 0.7234):   0%

AdaptiveModel(
  (language_model): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [19]:
# saving the trained model -> !! Be Careful, Don't Overwrite the existing model !!
# save_dir is also the directory the Inferencer loads the model from further down
save_dir = "model/bert_10kGNAD"
# the two save calls are commented out on purpose (see the warning above);
# uncomment them only to persist a newly trained model and its processor
#model.save(save_dir)
#processor.save(save_dir)
# NOTE(review): the load log recorded in this notebook reads from sample_data/bert_10kGNAD,
# not from save_dir — confirm which directory holds the trained model

In [17]:
# to download the model from Google Colab -> Google Colab only allows you to download zip files for some reason
#!zip -r sample_data/model.zip sample_data/bert_10kGNAD

  adding: sample_data/bert_10kGNAD/ (stored 0%)
  adding: sample_data/bert_10kGNAD/language_model.bin (deflated 7%)
  adding: sample_data/bert_10kGNAD/tokenizer_config.json (stored 0%)
  adding: sample_data/bert_10kGNAD/special_tokens_map.json (deflated 40%)
  adding: sample_data/bert_10kGNAD/prediction_head_0.bin (deflated 8%)
  adding: sample_data/bert_10kGNAD/language_model_config.json (deflated 57%)
  adding: sample_data/bert_10kGNAD/processor_config.json (deflated 53%)
  adding: sample_data/bert_10kGNAD/vocab.txt (deflated 57%)
  adding: sample_data/bert_10kGNAD/prediction_head_0_config.json (deflated 78%)


In [20]:
# loading the trained model from save_dir for inference
# save_dir must already contain a saved model, since the save calls in the save cell are commented out
inferenced_model = Inferencer.load(save_dir)

05/07/2021 08:07:59 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
05/07/2021 08:07:59 - INFO - transformers.modeling_utils -   loading weights file sample_data/bert_10kGNAD/language_model.bin from cache at sample_data/bert_10kGNAD/language_model.bin
05/07/2021 08:08:01 - INFO - farm.modeling.adaptive_model -   Found files for loading 1 prediction heads
05/07/2021 08:08:01 - INFO - farm.modeling.prediction_head -   Prediction head initialized with size [768, 9]
05/07/2021 08:08:01 - INFO - farm.modeling.prediction_head -   Loading prediction head from sample_data/bert_10kGNAD/prediction_head_0.bin
05/07/2021 08:08:01 - INFO - transformers.tokenization_utils -   Model name 'sample_data/bert_10kGNAD' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-

05/07/2021 08:42:30 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-0-0
Clear Text: 
 	text: Schauspieler Steven Seagal (69) traf am Mittwoch Venezuelas Diktator Nicolás Maduro (58) in dessen Amtspalast „Miraflores“ in Caracas. Seagal war im Auftrag von Russlands Diktator Wladimir Putin (68) unterwegs, sollte die Beziehung zwischen beiden Unrechtsstaaten verbessern.  Zum Auftakt der PR-Show gab es erst mal ein Gastgeschenk: Steven Seagal zieht aus einem schwarzen Tuch ein Samurai-Schwert hervor und überreicht es Diktator Maduro. Wie Au

05/07/2021 08:42:39 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
05/07/2021 08:42:39 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-0-0
Clear Text: 
 	text: Schauspieler Steven Seagal (69) traf am Mittwoch Venezuelas Diktator Nicolás Maduro (58) in dessen Amtspalast „Miraflores“ in Caracas. Seagal war im Auftrag von Russlands Diktator Wladimir Putin (68) unterwegs, sollte die Beziehung zwischen beiden Unrechtsstaaten verbessern.  Zum Auftakt der PR-Show gab es erst mal ein Gastgeschenk: Steven Seagal zieht a

05/07/2021 08:42:39 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-0-0
Clear Text: 
 	text: Schauspieler Steven Seagal (69) traf am Mittwoch Venezuelas Diktator Nicolás Maduro (58) in dessen Amtspalast „Miraflores“ in Caracas. Seagal war im Auftrag von Russlands Diktator Wladimir Putin (68) unterwegs, sollte die Beziehung zwischen beiden Unrechtsstaaten verbessern.  Zum Auftakt der PR-Show gab es erst mal ein Gastgeschenk: Steven Seagal zieht aus einem schwarzen Tuch ein Samurai-Schwert hervor und überreicht es Diktator Maduro. Wie Au

05/07/2021 08:47:18 - INFO - farm.data_handler.processor -   *** Show 2 random examples ***
05/07/2021 08:47:18 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-1-0
Clear Text: 
 	text: Wohnraum-Offensive? Von wegen! Diese Zahlen sind ein Armutszeugnis für die Bundesregierung.  Deutschlandweit fehlen laut der Arbeitnehmer-Gewerkschaft IG-Bau 630 000 Wohnungen. Besonders bitter: der „Neubau-Stau“ betrifft demnach fast ausschließlich bezahlbare und Sozialwohnungen.  Seit 2007 hat sich der Sozialwohnungsbestand (derzeit rund 1 Mio.) fast h

05/07/2021 08:47:18 - INFO - farm.data_handler.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: train-1-0
Clear Text: 
 	text: Wohnraum-Offensive? Von wegen! Diese Zahlen sind ein Armutszeugnis für die Bundesregierung.  Deutschlandweit fehlen laut der Arbeitnehmer-Gewerkschaft IG-Bau 630 000 Wohnungen. Besonders bitter: der „Neubau-Stau“ betrifft demnach fast ausschließlich bezahlbare und Sozialwohnungen.  Seit 2007 hat sich der Sozialwohnungsbestand (derzeit rund 1 Mio.) fast halbiert. Inzwischen ist nur noch jede 20. Wohnung gefördert. Allein in München warten zurzei

In [22]:
# read a text file on disk
def read_file(file_name: str, encoding: str = 'utf-8') -> dict:
  """Read a text file and wrap its content in the dict format used as model input.

  Args:
    file_name: path of the text file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8 so that
      German umlauts are decoded the same way on every platform instead of
      depending on the platform's default encoding.

  Returns:
    A dict ``{'text': content}`` in which every newline of the file has been
    replaced by a single space.
  """
  # the context manager closes the file handle even if reading fails
  with open(file_name, 'r', encoding=encoding) as text_file:
    content = text_file.read()
  return {'text': content.replace('\n', ' ')}

In [23]:
# creates a list from multiple text files on disk
def create_input(text_files:list) -> list:
  """Build the model input for several text files.

  Every entry of ``text_files`` is a dict whose 'file' key holds the path of a
  text file. The returned list contains the ``read_file`` result for each
  entry, in the same order.
  """
  return [read_file(entry['file']) for entry in text_files]

In [24]:
# handles the output from the model and saves it in a DF
def create_result_overview (articles:list, result:list) -> pd.DataFrame:
  """Put the expected genres and the model predictions side by side in a DataFrame.

  Args:
    articles: list of dicts, each with a 'file' entry (path of the article)
      and a 'genre' entry (expected label).
    result: output of the inferencer: a list of dicts that each hold a list of
      predictions under the 'predictions' key; every prediction is a dict with
      a 'label' entry. The inferencer is assumed to return one such dict per
      inference batch, with predictions in input order — confirm against the
      FARM Inferencer documentation.

  Returns:
    DataFrame with the columns 'file', 'actual' and 'prediction', one row per
    article, in the order of ``articles``.

  Raises:
    IndexError: if ``result`` contains fewer predictions than there are articles.
  """
  # collect the predictions of ALL batches; reading only result[0] breaks as soon as
  # the articles no longer fit into a single inference batch
  all_predictions = [pred for batch in result for pred in batch['predictions']]
  if len(all_predictions) < len(articles):
    raise IndexError(
      f"got {len(all_predictions)} predictions for {len(articles)} articles")

  data = {
    'file': [article['file'] for article in articles],
    'actual': [article['genre'] for article in articles],
    # the label presumably arrives as a stringified list such as "['Sport']";
    # strip the surrounding brackets and quotes
    'prediction': [pred['label'].strip("'[]'")
                   for pred in all_predictions[:len(articles)]],
  }
  return pd.DataFrame(data)

In [29]:
# inference
# the four articles were copied from BILD and they are dated 2021
# each entry pairs the path of a text file with the genre we expect the model to predict
articles = [{'file': 'Inference_Data/International.txt', 'genre': 'International'},
            {'file': 'Inference_Data/Wirtschaft.txt', 'genre': 'Wirtschaft'},
            {'file': 'Inference_Data/Wissenschaft.txt', 'genre': 'Wissenschaft'},
            {'file': 'Inference_Data/Sport.txt', 'genre': 'Sport'}]

# read the files into a list of {'text': ...} dicts
article_texts = create_input(articles)

# run the loaded model on all articles
result = inferenced_model.inference_from_dicts(article_texts)

# tabulate file, expected genre and predicted genre
df = create_result_overview(articles, result)

df.head()

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.23 Batches/s]


Unnamed: 0,file,actual,prediction
0,Inference_Data/International.txt,International,International
1,Inference_Data/Wirtschaft.txt,Wirtschaft,Panorama
2,Inference_Data/Wissenschaft.txt,Wissenschaft,Wissenschaft
3,Inference_Data/Sport.txt,Sport,Sport


In [26]:
# TBD
# graphing the results with seaborn/matplotlib would be nice
# cleaning the data before feeding it to the model and re-training
# trying different 512-token cuts (e.g., the first 128 tokens and the last 382 tokens from each article; P.S.: be sure to leave 2 tokens free for the [CLS] and [SEP] tokens)
# trying different LR/Batch_Size
# trying a sentence-based approach and averaging the outputs to get a final inference (might be nice for multilabel per-document classification)