In [3]:
# install the libraries necessary for data wrangling, prediction and result analysis
import json
import numpy as np
import pandas as pd
import logging
import time
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score,precision_score, recall_score
import torch
from numba import cuda

# Install transformers
# (this needs to be done on Kaggle each time you start the session)
#!pip install -q transformers

# Install the simpletransformers
#!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel

# Install wandb
#!pip install -q wandb
import wandb

# Login to wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtajak[0m (use `wandb login --relogin` to force relogin)


True

In [10]:
# Open csv file
corpus_path = "Macocu-sl-en-doc-format.csv"

corpus_df = pd.read_csv(corpus_path, sep = "\t")

corpus_df.head(2)

Unnamed: 0.1,Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length,sl_length
0,2584979,No,sl-orig,http://15.liffe.si/?lang_chg=en,B,B,http://15.liffe.si/?lang_chg=sl,15.liffe.si,15.liffe.si,yes,15.liffe.si 15.liffe.si,0.936808,It went out with a bang. The evening sparkled ...,Končalo se je razburljivo in z razkošjem. Veče...,601,488
1,1212933,No,sl-orig,http://16.liffe.si/?lang_chg=en,B,B,http://16.liffe.si/index.php?menu_item=domov,16.liffe.si,16.liffe.si,yes,16.liffe.si 16.liffe.si,0.9,Some days ago the organisers of the 17th Liffe...,Pred dnevi smo se iz 59. mednarodnega filmskeg...,293,184


In [6]:
# See corpus size
corpus_df.shape

(104853, 16)

In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
# Initialize Wandb
run = wandb.init(project="X-GENRE classifiers", entity="tajak", name="testing-trained-model")

In [7]:
# Load the trained model from Wandb
model_name = "tajak/X-GENRE classifiers/X-GENRE-classifier"
# Use the latest version of the model
model_at = run.use_artifact(model_name + ":latest")
# Download the directory
model_dir = model_at.download()

# Loading a local save
model = ClassificationModel(
    "xlmroberta", model_dir)

[34m[1mwandb[0m: Downloading large artifact X-GENRE-classifier:latest, 1081.88MB. 8 files... Done. 0:0:0


In [9]:
corpus_df.shape[0]

104853

In [11]:
def predict(dataframe, file_path):
    """
    This function takes the dataframe with English documents in the en_doc column, prepared in previous notebooks, and applies the trained model on it to infer predictions. It prints the time that it took to predict to all instances. It saves the results as a new column in the dataframe and returns the dataframe.

    Args:
    - dataframe (pandas DataFrame)
    - file_path: the path to the new CSV file with predictions
    """
    # Calculate the model's predictions
    def make_prediction(input_string):
        return model.predict([input_string])[0][0]

    print("Prediction started.")
    start_time = time.time()

    y_pred = dataframe.en_doc.apply(make_prediction)

    prediction_time = round((time.time() - start_time)/60,2)

    print(f"Prediction completed. It took {prediction_time} minutes for {dataframe.shape[0]} instances - {prediction_time/dataframe.shape[0]} minutes per one instance.")
    
    dataframe["X-GENRE"] = y_pred

    # Save the new dataframe which contains the y_pred values as well
    dataframe.to_csv(f"{file_path}", sep="\t")

    return dataframe

In [22]:
# Try the code on a sample of the corpus
sample_corpus = corpus_df.sample(n = 20)

sample_corpus.shape

(20, 16)

In [23]:
sample_predicted = predict(sample_corpus, "sample-corpus-predicted.csv") 

Prediction started.
Prediction completed. It took 0.36 minutes for 20 instances - 0.018 minutes per one instance.


Prediction took 12 seconds per instance - it would take 350 hours (14 days) to predict everything. 

In [24]:
sample_predicted

Unnamed: 0.1,Unnamed: 0,biroamer_entities,translation_direction,en_source,en_var_doc,en_var_dom,sl_source,en_domain,sl_domain,same_domains,different_domains,average_score,en_doc,sl_doc,en_length,sl_length,X-GENRE
63302,1185598,No,sl-orig,https://www.bankingsupervision.europa.eu/banki...,UNK,B,https://www.bankingsupervision.europa.eu/banki...,bankingsupervision.europa.eu,bankingsupervision.europa.eu,yes,bankingsupervision.europa.eu bankingsupervisio...,0.896375,"In the Single Supervisory Mechanism, the ECB d...",ECB v okviru enotnega mehanizma nadzora neposr...,118,98,Promotion
78201,990862,No,en-orig,https://www.husqvarna.com/au/parts-accessories...,B,B,https://www.husqvarna.com/si/pribor-dodatna-op...,husqvarna.com,husqvarna.com,yes,husqvarna.com husqvarna.com,0.938571,Exclusively designed by Husqvarna the T35 Univ...,"Glava trimerja T35, ki jo je ekskluzivno zasno...",143,70,Promotion
78109,987119,No,sl-orig,https://www.hoteltriglavbled.si/en/contact-us/,B,B,https://www.hoteltriglavbled.si/kontakt/,hoteltriglavbled.si,hoteltriglavbled.si,yes,hoteltriglavbled.si hoteltriglavbled.si,0.914462,"We encourage Green Mobility, therefore we reco...","Spodbujamo zeleno mobilnost, zato vsem gostom ...",186,168,Instruction
36536,1026384,No,sl-orig,https://druzinski-izleti.si/en/regions/coastal...,A,MIX,https://druzinski-izleti.si/regije/obalno-kras...,druzinski-izleti.si,druzinski-izleti.si,yes,druzinski-izleti.si druzinski-izleti.si,0.926381,Route Description Rihemberk Castle is one of t...,Opis poti Grad Rihemberk je najmogočnejši in e...,420,328,Information/Explanation
101032,105985,No,sl-orig,https://www.us-rs.si/decision/?lang=en&amp;vr%...,B,B,https://www.us-rs.si/odlocitev?vd=UP&amp;vr%5B...,us-rs.si,us-rs.si,yes,us-rs.si us-rs.si,0.830982,Administrative Offences Act (Official Gazette ...,Prepoved diskriminacije v enakem obsegu določa...,9587,961,Legal
39239,162410,No,sl-orig,https://eur-lex.europa.eu/legal-content/EN/TXT...,B,B,https://eur-lex.europa.eu/legal-content/SL/TXT...,eur-lex.europa.eu,eur-lex.europa.eu,yes,eur-lex.europa.eu eur-lex.europa.eu,0.94997,It aims to adapt EU law to fight terrorism in ...,Prečiščena različica Pogodbe o Evropski uniji ...,780,947,Information/Explanation
49455,1399933,No,sl-orig,https://oblacila.si/hollister-hollister-majica...,MIX,MIX,https://oblacila.si/mavi-mavi-majica-crna__UrZ...,oblacila.si,oblacila.si,yes,oblacila.si oblacila.si,0.7975,The product on the mannequin is M size. T-shir...,Superdry Vintage Logo Linear LS Tee je moška k...,160,93,Promotion
73006,2015776,No,sl-orig,https://www.fdv.uni-lj.si/en/library/digital-l...,A,MIX,https://www.fdv.uni-lj.si/knjiznica/digitalna-...,fdv.uni-lj.si,fdv.uni-lj.si,yes,fdv.uni-lj.si fdv.uni-lj.si,0.7045,"Unit of additional literature: Author: Hill, C...",Foreign Policy Obvezna literatura:,83,4,Information/Explanation
73806,2555266,No,sl-orig,https://www.fhs.upr.si//en/research/institute-...,A,MIX,https://www.fhs.upr.si/sl/raziskovanje/institu...,fhs.upr.si,fhs.upr.si,yes,fhs.upr.si fhs.upr.si,0.940133,The head of the Institute for Intercultural St...,Predstojnica Inštituta za medkulturne študije ...,335,289,Information/Explanation
16840,240401,Yes,sl-orig,http://www.goricane.si/en/environment/,UNK,B,http://www.goricane.si/sl/okolje/,goricane.si,goricane.si,yes,goricane.si goricane.si,0.92,Green on white The Goričane paper mill has bas...,Zeleno na belem Družba Goričane posluje v skla...,168,110,Information/Explanation


In [25]:
sample_predicted["X-GENRE"].value_counts(normalize=True)

Information/Explanation    0.40
Promotion                  0.30
Legal                      0.15
Instruction                0.10
News                       0.05
Name: X-GENRE, dtype: float64