In [None]:
# !pip  install transformers==4.22.1 -q
# !pip install -U tensorflow==2.16.1 -q
# !pip install keras

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install accelerate==0.27.2
!pip install datasets

In [None]:
import transformers

# Report which transformers build the install step resolved to.
installed_version = transformers.__version__
print(installed_version)

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' resource, then load its English word list.
# NOTE(review): `stopw` is not referenced anywhere else in the visible notebook.
nltk.download('stopwords')
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm

In [None]:
# Location of the question dataset, relative to the working directory.
root_path = 'tech_difficulty_data.csv'

# Read the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=root_path)
df.head(5)

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

In [6]:
# Derived frame without the 'difficulty' column; `df` itself is left untouched.
df_new = df.drop(columns=['difficulty'])

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_new.head()

In [None]:
# Distinct values of the 'technology' column (the source of the class labels).
df_new.loc[:, 'technology'].unique()

In [9]:
# Independent (deep) snapshot of the frame before any columns are removed.
df1 = df.copy(deep=True)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df1.head()

In [10]:
# Rebind instead of mutating in place, and tolerate the column already being
# absent, so re-running this cell does not raise KeyError.
df = df.drop(columns=['difficulty'], errors='ignore')

In [11]:
# Encode each technology name as an integer class id.
technology_cat = df['technology'].astype('category')
df['label'] = technology_cat.cat.codes

# Keep the id -> name mapping: the 'technology' column is removed further down,
# and without this the integer predictions cannot be translated back to names.
id2label = dict(enumerate(technology_cat.cat.categories))

In [12]:
# A cell only renders its final expression, and this cell ends with an
# assignment, so both previews must be displayed explicitly to be visible.
display(df['label'].unique())
display(df.head())

# Take a real snapshot: `df2 = df` would only be a second name for the same
# object, so later in-place changes to `df` would show up in it as well.
df2 = df.copy()

In [13]:
# Rebind instead of mutating in place, and tolerate the column already being
# absent, so re-running this cell does not raise KeyError.
df = df.drop(columns=['technology'], errors='ignore')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [14]:
# Plain Python lists of the question strings and their integer class ids.
data_texts, data_labels = (
    df[column].tolist() for column in ('question', 'label')
)

In [15]:
# Fix the shuffle seed so the 80/20 partition — and every metric computed from
# it — is reproducible across kernel restarts.
# NOTE(review): consider stratify=df['label'] if the classes are imbalanced.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap the pandas splits as Hugging Face datasets.
# preserve_index=False: the split frames carry a shuffled, non-default index,
# which from_pandas would otherwise store as an extra `__index_level_0__`
# column alongside the real features.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

def tokenize_data(examples):
    """Tokenize the "question" field of a batch with the module-level tokenizer.

    Inputs are truncated; no padding is applied here.
    """
    questions = examples["question"]
    return tokenizer(questions, truncation=True)

# Apply the tokenizer to both splits, batch-wise, in the same way.
tokenized_train, tokenized_test = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Size the classification head from the data instead of hard-coding it:
# category codes run from 0 to n_classes - 1, so the largest code + 1 is the
# number of classes.
num_labels = int(df['label'].max()) + 1

# Load pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)

# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

import inspect

# Two keyword arguments used here were renamed across transformers releases
# (`evaluation_strategy` -> `eval_strategy`, Trainer's `tokenizer` ->
# `processing_class`). The notebook installs transformers unpinned, so pick
# whichever spelling the installed version's signature actually accepts.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"

_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_strategy="epoch",
    **{_eval_key: "epoch"},  # evaluate once per epoch
)

# Define Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Train the model
# Runs the Trainer configured above: `model` on `tokenized_train`, with
# `tokenized_test` as the evaluation set and the settings in `training_args`.
trainer.train()

In [19]:
# Write the trained model to a local directory; the prediction section of this
# notebook reloads from this same path.
model_output_dir = 'tech-classification-model'
trainer.save_model(output_dir=model_output_dir)

In [None]:
# Evaluate on the eval dataset the Trainer was constructed with (`tokenized_test`).
trainer.evaluate()

## Load the fine-tuned model and predict (can be run separately, in a fresh kernel)

In [None]:
%pip install git+https://github.com/huggingface/transformers
%pip install tensorflow

In [3]:
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer
import tensorflow as tf

In [4]:
# Directory the training section saved the model into.
fine_tuned_dir = 'tech-classification-model'

# Reload tokenizer and classifier from that directory.
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(fine_tuned_dir)
model_fine_tuned = AutoModelForSequenceClassification.from_pretrained(fine_tuned_dir)

## Classify a sample question

In [30]:
# Sample question to classify.
test_text = "What is a class in Java?"

In [31]:
# Encode the sample question as a PyTorch tensor of token ids.
encode_options = dict(truncation=True, padding=True, return_tensors='pt')
predict_input = tokenizer_fine_tuned.encode(test_text, **encode_options)
predict_input

tensor([[ 101, 2054, 2003, 1037, 2465, 1999, 9262, 1029,  102]])

In [32]:
import torch

# Inference only: disable autograd so no gradient graph is built or kept alive.
with torch.no_grad():
    # First element of the model output holds the per-class scores.
    output = model_fine_tuned(predict_input)[0]

In [15]:
import torch


# Index of the highest-scoring class for the single input row. `dim=` is the
# native torch keyword; .item() returns a plain Python int and, unlike
# .numpy(), also works when the tensor lives on a GPU.
prediction_value = torch.argmax(output, dim=1)[0].item()

print("Predicted class:", prediction_value)

Predicted class: 1


In [33]:
# Normalise the scores across the class dimension so each row sums to 1.
probabilities = output.softmax(dim=1)
probabilities

tensor([[2.2171e-03, 9.9705e-01, 7.3540e-04]], grad_fn=<SoftmaxBackward0>)