In [None]:
#from torch.utils.data import Dataset
os.environ["TOKENIZERS_PARALLELISM"]="true"
os.environ["WANDB_DISABLED"] = "true"
!pip install -Uqq datasets
!pip install -Uqq wandb

In [None]:
# Echo both variables back to confirm they were set
# (an empty line is printed for a variable that is missing).
for env_name in ('TOKENIZERS_PARALLELISM', 'WANDB_DISABLED'):
    print(os.environ.get(env_name, ''))

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import datasets #Hugging Face library
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk(ModelPath):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# First part: train a sentence classifier on the entire available dataset

In [None]:
# Read the AskUbuntu corpus (a JSON file) into a dictionary.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's default locale encoding.
with open('../input/intent-recognition-chatbot-corpus-from-askubuntu/AskUbuntu Corpus.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

# Each entry under the "sentences" key carries an "intent" and a "text" field.
sentences = data["sentences"]

# Intent (label) of every sentence, in corpus order.
labelList = [entry["intent"] for entry in sentences]

# Raw text of every sentence, in the same order as labelList.
textList = [entry['text'] for entry in sentences]

In [None]:
# Assemble the corpus as a two-column table: one row per sentence,
# with its intent in "label" and its text in "sentence".
DFData = dict(label=labelList, sentence=textList)
IntentDataFrame = pd.DataFrame(DFData)

In [None]:
# Drop the samples whose label is the string "None".
# .copy() makes the result an independent frame, so assigning to its columns
# later does not raise pandas' SettingWithCopyWarning.
IntentDataFrame = IntentDataFrame[IntentDataFrame["label"] != "None"].copy()

In [None]:
# Count the samples per intent to see whether the classes are reasonably balanced
IntentDataFrame.loc[:, "label"].value_counts()

In [None]:
# Replace the label strings by integer class ids.
# The mapping is hard-coded; it should be generated automatically for larger label sets.
LabelToIndex = {"Software Recommendation":0,"Make Update":1,"Shutdown Computer":2,"Setup Printer":3}

# Values that are not keys of LabelToIndex (e.g. ids written by a previous run
# of this cell) pass through unchanged, so re-running the cell no longer turns
# every label into NaN.
_label_ids = IntentDataFrame["label"].map(lambda label: LabelToIndex.get(label, label))

# Fail loudly instead of silently training on missing / unmapped labels.
_unknown_labels = set(_label_ids.unique()) - set(LabelToIndex.values())
if _unknown_labels:
    raise ValueError(
        f"Labels missing from LabelToIndex: {sorted(map(str, _unknown_labels))}"
    )

# .assign returns a new frame instead of writing into a filtered one,
# which avoids pandas' SettingWithCopyWarning.
IntentDataFrame = IntentDataFrame.assign(label=_label_ids.astype("int64"))

In [None]:
# Wrap IntentDataFrame in a Hugging Face Dataset so that it can be fed to
# Hugging Face tokenizers and models
train_dataset = datasets.Dataset.from_pandas(IntentDataFrame)

In [None]:
# Show a summary of the dataset that was just built from IntentDataFrame
print(train_dataset)

In [None]:
# Remove the __index_level_0__ column (the pandas index carried over by from_pandas).
# The guard keeps the cell re-runnable: remove_columns raises when the column
# is not present, e.g. after this cell has already been executed once.
if "__index_level_0__" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns(["__index_level_0__"])

In [None]:
# Load the tokenizer of the "distilbert-base-uncased" checkpoint with AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Tokenize train_dataset
def preprocess_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Truncation is enabled, but no padding is applied here: padding is left to
    the DataCollatorWithPadding given to the Trainer, which pads each training
    batch only to the length of its own longest sequence.

    Parameters
    ----------
    examples : mapping
        Batch of rows containing a "sentence" entry (a list of strings when
        called through ``Dataset.map(..., batched=True)``).

    Returns
    -------
    The tokenizer output for the batch.
    """
    return tokenizer(examples["sentence"], truncation=True)


tokenize_train = train_dataset.map(preprocess_function, batched=True)

In [None]:
# Data collator built from the tokenizer; it is handed to the Trainer below

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Build the classification model from the "distilbert-base-uncased" checkpoint.
# The number of classes is derived from LabelToIndex instead of being hard-coded,
# and the id <-> name mappings are stored in the model config so that a saved
# copy of the model carries its own label names.

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(LabelToIndex),
    id2label={index: label for label, index in LabelToIndex.items()},
    label2id=dict(LabelToIndex),
)

In [None]:
# Fine-tuning configuration
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=7,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# No eval_dataset / compute_metrics / evaluation strategy is configured:
# the entire dataset is used as training data.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenize_train,
)

# Run the fine-tuning
trainer.train()

# Second part: define a function that predicts the label of an input sentence (string), using the model trained above

In [None]:
def SentenceClassifier(InputSentence):
    """Predict the intent label of a single sentence.

    Relies on the module-level ``tokenizer`` and ``trainer``.

    Parameters
    ----------
    InputSentence : str
        The sentence to classify.

    Returns
    -------
    str
        Name of the intent with the highest prediction score.
    """
    index_to_label = {
        0: "Software Recommendation",
        1: "Make Update",
        2: "Shutdown Computer",
        3: "Setup Printer",
    }

    def tokenize_row(row):
        return tokenizer(row["sentence"], truncation=True, padding=True)

    # The sentence travels through a one-row Dataset; keeping this shape would
    # make it easy to classify several sentences at once later on.
    single_row_frame = pd.DataFrame(data={'sentence': [InputSentence]})
    tokenized_row = datasets.Dataset.from_pandas(single_row_frame).map(tokenize_row, batched=False)

    # One row of scores per input sentence; keep the best-scoring class of the first row.
    scores = trainer.predict(tokenized_row).predictions
    winner = scores.argmax(axis=1)[0]

    return index_to_label[winner]

In [None]:
# Try the freshly trained classifier on one example question
InputSentence = "What should I use to cut pictures ?"
OutputLabel = SentenceClassifier(InputSentence)
print(f'Your question was : "{InputSentence}" it was classified as : "{OutputLabel}"')

# Third part: save and reload the model, to avoid downloading it from Hugging Face
# again and to avoid repeating the training process

In [None]:
# Save the model and tokenizer locally
ModelPath = "/kaggle/working/model/"
TokenizerPath = "/kaggle/working/tokenizer/"

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# when the directory is already there (e.g. on a second run of the cell).
os.makedirs(ModelPath, exist_ok=True)
os.makedirs(TokenizerPath, exist_ok=True)

if os.path.isdir(ModelPath):
    model.save_pretrained(ModelPath)
    print("model ok")
if os.path.isdir(TokenizerPath):
    tokenizer.save_pretrained(TokenizerPath)
    print("tokenizer ok")

In [None]:
# List the saved tokenizer files through TokenizerPath (the directory written above)
# rather than a path relative to the current working directory.
os.listdir(TokenizerPath)

In [None]:
# List the saved model files through ModelPath (the directory written above)
# rather than a path relative to the current working directory.
os.listdir(ModelPath)

In [None]:
# Reload the tokenizer and the model from the local directories they were saved to
LocalTokenizer = AutoTokenizer.from_pretrained(TokenizerPath)
LocalModel = AutoModelForSequenceClassification.from_pretrained(ModelPath, num_labels=4)

In [None]:
# check if the classifier works well with the local data

In [None]:
def LocalSentenceClassifier(InputSentence):
    """Predict the intent label of a sentence with the locally reloaded model.

    Dependencies: ``LocalTokenizer`` and ``LocalModel`` (reloaded from disk),
    plus ``training_args`` for the Trainer configuration.

    Every tokenizer-dependent step (tokenization, padding collator, Trainer)
    uses ``LocalTokenizer`` rather than the original ``tokenizer``, so this
    function exercises the artifacts that were saved and loaded locally.

    Parameters
    ----------
    InputSentence : str
        The sentence to classify.

    Returns
    -------
    str
        Name of the intent with the highest prediction score.
    """

    # Prediction-only Trainer: no training dataset is needed for predict().
    local_trainer = Trainer(
        model=LocalModel,
        args=training_args,
        tokenizer=LocalTokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=LocalTokenizer),
    )

    def preprocess_function(examples):
        return LocalTokenizer(examples["sentence"], truncation=True, padding=True)

    # The input is kept as a Dataset, which would allow the code to be reused
    # to answer many questions at once.
    InputSentenceDFData = {'sentence' : [InputSentence]}
    InputSentenceDataFrame = pd.DataFrame(data = InputSentenceDFData)
    InputSentenceDataset = datasets.Dataset.from_pandas(InputSentenceDataFrame)
    Tokenised_InputSentence = InputSentenceDataset.map(preprocess_function,batched=False)

    LabelScores = local_trainer.predict(Tokenised_InputSentence)
    BestLabel = LabelScores.predictions.argmax(1)

    IndexToLabel = {0:"Software Recommendation",1:"Make Update",2:"Shutdown Computer",3:"Setup Printer"}
    OutputLabelName = IndexToLabel[BestLabel[0]]

    return OutputLabelName

In [None]:
# Print the original tokenizer and the one reloaded from disk, one after the other.
# (The reloaded tokenizer is named LocalTokenizer; `tokenizer2` is not defined
# anywhere and raised NameError.)
print(tokenizer)
print("-----------")
print(LocalTokenizer)

In [None]:
# Try the classifier built on the locally reloaded model on one example question
InputSentence = "How can I update Ubuntu ?"
OutputLabel = LocalSentenceClassifier(InputSentence)
print(f'Your question was : "{InputSentence}" it was classified as : "{OutputLabel}"')