In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [18]:
import json
# Read the corpus JSON into a dictionary. The encoding is passed explicitly so
# that decoding does not depend on the platform's default locale.
with open('../input/intent-recognition-chatbot-corpus-from-askubuntu/AskUbuntu Corpus.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Intent and text information are stored under the "sentences" key.
sentences = data["sentences"]
# Intent of every sentence, in corpus order.
labelList = [entry["intent"] for entry in sentences]
# Text of every sentence, aligned index-for-index with labelList.
textList = [entry["text"] for entry in sentences]

In [19]:
# Assemble the two parallel lists into a frame with a "label" and a "sentence" column.
DFData = dict(label=labelList, sentence=textList)
IntentDataFrame = pd.DataFrame(DFData)


In [20]:
# Display the frame built from the label and sentence lists.
IntentDataFrame

In [21]:
# Keep only the sentences whose intent is not the literal string "None".
# The explicit .copy() makes the result an independent frame, so assigning to
# its columns later does not raise pandas' SettingWithCopyWarning.
IntentDataFrame = IntentDataFrame[IntentDataFrame["label"] != "None"].copy()

In [22]:
# Number of sentences per intent label.
IntentDataFrame["label"].value_counts()

In [23]:
# (rows, columns) of the frame at this point.
IntentDataFrame.shape

In [24]:
# Encode the intent names as integer class ids (0-3).
LABEL_TO_ID = {"Software Recommendation": 0, "Make Update": 1, "Shutdown Computer": 2, "Setup Printer": 3}
encoded_labels = IntentDataFrame["label"].map(LABEL_TO_ID)

# Series.map yields NaN for any value missing from the mapping; fail loudly here
# instead of letting NaN labels flow into training. This also catches an
# accidental second run of the cell on already-encoded labels.
unmapped_labels = IntentDataFrame.loc[encoded_labels.isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels without an id in LABEL_TO_ID: {list(unmapped_labels)}")

# .assign builds a new frame rather than writing into a possibly sliced one,
# which avoids pandas' SettingWithCopyWarning.
IntentDataFrame = IntentDataFrame.assign(label=encoded_labels.astype("int64"))

In [25]:
# Inspect the label column after the mapping to integer ids.
IntentDataFrame["label"]

In [26]:
# Split IntentDataFrame into train_df, test_df
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the reported metrics) reproducible across kernel restarts, and
# stratifying on the label keeps class proportions similar in both splits.
train_df, test_df = train_test_split(
    IntentDataFrame,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=IntentDataFrame["label"],
)

In [27]:
#from torch.utils.data import Dataset
!pip install -Uqq datasets
import datasets #Hugging Face library

In [28]:
# Wrap both pandas splits as Hugging Face Dataset objects.
train_dataset, test_dataset = (
    datasets.Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [29]:
# Drop the "__index_level_0__" column (presumably the pandas index carried over
# by the conversion) so that only "label" and "sentence" remain.
INDEX_COLUMN = "__index_level_0__"
train_dataset = train_dataset.remove_columns([INDEX_COLUMN])
test_dataset = test_dataset.remove_columns([INDEX_COLUMN])

In [30]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint
from transformers import AutoTokenizer
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [31]:
# Tokenization work on train_dataset and test_dataset
def preprocess_function(examples):
    """Tokenize the "sentence" column of a batch of examples.

    Args:
        examples: a batch mapping that holds the raw texts under "sentence".

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        over-long inputs truncated.

    Padding is deliberately not applied here: the Trainer is given a
    DataCollatorWithPadding, which pads each training/eval batch to its own
    longest sequence, so padding at map time would only store extra pad tokens.
    """
    return tokenizer(examples["sentence"], truncation=True)
# Apply the tokenization function to both splits, batch by batch.
tokenize_train, tokenize_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [32]:
# Collator that uses the tokenizer to pad the examples of a batch
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

In [33]:
# Build the sequence classifier from the pretrained DistilBERT checkpoint
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
NUM_INTENTS = 4  # one output per integer intent id (0-3)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_INTENTS,
)

In [34]:
# Metrics
# Accuracy is computed directly with numpy. The previous implementation relied
# on datasets.load_metric, which is deprecated and no longer available in
# recent datasets releases, so the import would fail after an upgrade.
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: a pair ``(predictions, labels)`` where ``predictions`` holds
            one row of per-class scores per example and ``labels`` holds the
            reference class ids.

    Returns:
        A dict ``{"accuracy": float}`` with the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {"accuracy": accuracy}

In [35]:
# Model fine tuning training

# Disable Weights & Biases logging *before* the training objects are built.
# Setting the variable only after Trainer(...) relied on it already being set
# by an earlier run of the kernel; NOTE(review): on a fresh kernel the logging
# integrations appear to be resolved at construction time - confirm against the
# installed transformers version. report_to="none" states the intent explicitly.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",              # checkpoints and the saved model go here
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    # NOTE(review): newer transformers releases rename this argument to
    # eval_strategy; keep in sync with the installed version.
    evaluation_strategy="epoch",
    report_to="none",                    # no external experiment trackers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_train,
    eval_dataset=tokenize_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [36]:
# Model evaluation
# Presumably evaluates on the eval_dataset passed to the Trainer (tokenize_test)
# and reports the values produced by compute_metrics - confirm against the output.
trainer.evaluate()

In [37]:
# Predict on the tokenized test split, then take the highest-scoring class per row
# (the original note marks this save/predict section as not yet complete)
outputs = trainer.predict(tokenize_test)
y_pred = np.argmax(outputs.predictions, axis=1)
y_pred

In [39]:
# Show the full object returned by trainer.predict.
outputs

In [38]:
# Trainer.save_model() does not return the model (it returns None), so the
# original assignment left intent_model as None. Save first, then keep a
# reference to the fine-tuned model held by the trainer.
trainer.save_model()
intent_model = trainer.model