In [None]:
# !pip  install transformers==4.22.1 -q
# !pip install -U tensorflow==2.16.1 -q
# !pip install keras

!pip uninstall -y torch
!pip uninstall -y transformers

In [None]:
# !pip install git+https://github.com/huggingface/transformers
!pip install accelerate==0.27.2
!pip install datasets
!pip install transformers
!pip install torch

In [None]:
!pip list

In [None]:
import transformers
print(transformers.__version__)

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Function to process and augment the questions
def process_questions(file_info):
    df = pd.read_csv(file_info['file_path'], sep='|', header=None, names=['question', 'difficulty'])

    # Strip whitespace from the columns
    df['question'] = df['question'].str.strip()

    # Add the technology column
    df['technology'] = file_info['technology']

    # Preprocess questions
    # df['question'] = df['question'].apply(preprocess)

    # Generate paraphrased questions
    # df['paraphrased_question'] = df['question'].apply(lambda x: paraphrase(x, paraphraser_model, paraphraser_tokenizer))

    return df

# Load question datasets and preprocess
java_questions = process_questions({'file_path': 'java.txt', 'technology': 'java'})
microservice_questions = process_questions({'file_path': 'microservice', 'technology': 'microservice'})
springboot_questions = process_questions({'file_path': 'springboot', 'technology': 'springboot'})
mysql_questions = process_questions({'file_path': 'database', 'technology': 'database'})
docker_k8s_questions = process_questions({'file_path': 'devops', 'technology': 'devops'})

# Concatenate all questions
questions_df = pd.concat([java_questions, microservice_questions, springboot_questions, mysql_questions, docker_k8s_questions], ignore_index=True)

In [None]:
df = questions_df
df.shape

(2494, 3)

In [None]:
df_new = df.drop('difficulty', axis = 1)

In [None]:
df_new

In [None]:
df_new['technology'].unique()

array(['java', 'microservice', 'springboot', 'database', 'devops'],
      dtype=object)

In [None]:
df1 = df.copy()

In [None]:
df1

In [None]:
df.drop('difficulty', axis = 1, inplace = True)

In [None]:
df['label'] = df['technology'].astype('category').cat.codes
# Get the mapping of technologies to labels
category_mapping = df[['technology', 'label']].drop_duplicates().sort_values('label')

# Create a dictionary with label as key and technology as value
technology_dict = dict(zip(category_mapping['label'], category_mapping['technology']))

technology_dict

{0: 'database', 1: 'devops', 2: 'java', 3: 'microservice', 4: 'springboot'}

In [None]:
df['label'].unique()
df.head()
df2 = df

In [None]:
df.drop('technology', axis = 1, inplace=True)

In [None]:
df

Unnamed: 0,question,label
0,What is a class in Java?,2
1,How do you define a method in Java?,2
2,What is the purpose of the main method in Java?,2
3,What is a variable in Java?,2
4,How do you declare an array in Java?,2
...,...,...
2489,What is the purpose of a Kubernetes Job?,1
2490,How do you manage access control in Kubernetes?,1
2491,"What are taints and tolerations, and how are t...",1
2492,How do you create and use Helm repositories?,1


In [None]:
data_texts = df['question'].to_list()

data_labels = df['label'].to_list()

In [None]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert datasets to tokenized format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def tokenize_data(examples):
    return tokenizer(examples["question"], truncation=True)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Map:   0%|          | 0/1995 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Load pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

# Define Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.1032,0.174406
2,0.0489,0.154115


TrainOutput(global_step=500, training_loss=0.07604176139831544, metrics={'train_runtime': 1034.5466, 'train_samples_per_second': 3.857, 'train_steps_per_second': 0.483, 'total_flos': 23627031526680.0, 'train_loss': 0.07604176139831544, 'epoch': 2.0})

In [38]:
trainer.save_model('tech-classification-model')

In [39]:
trainer.evaluate()

{'eval_loss': 0.15411542356014252,
 'eval_runtime': 20.9644,
 'eval_samples_per_second': 23.802,
 'eval_steps_per_second': 3.005,
 'epoch': 2.0}

## Load a pretrained model and predict -- Run separately

In [None]:
%pip install git+https://github.com/huggingface/transformers
%pip install tensorflow

In [40]:
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer
import tensorflow as tf

In [41]:
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained('tech-classification-model')

model_fine_tuned = AutoModelForSequenceClassification.from_pretrained('tech-classification-model')

## Run the input

In [56]:
test_text = 'what is the command to see logs inside of a container?'

In [57]:
predict_input = tokenizer_fine_tuned.encode(
    test_text,
    truncation = True,
    padding = True,
    return_tensors = 'pt'
)
predict_input

tensor([[  101,  2054,  2003,  1996,  3094,  2000,  2156, 15664,  2503,  1997,
          1037, 11661,  1029,   102]])

In [58]:
output = model_fine_tuned(predict_input)[0]

In [59]:
import torch


prediction_value = torch.argmax(output, axis=1).detach().numpy()[0]

technology_dict[prediction_value]

'devops'

In [60]:
# Convert logits to probabilities
probabilities = torch.softmax(output, dim=1)
probabilities

tensor([[1.1256e-04, 9.9912e-01, 5.2914e-05, 6.8815e-04, 2.4173e-05]],
       grad_fn=<SoftmaxBackward0>)