In [None]:
# !pip  install transformers==4.22.1 -q
# !pip install -U tensorflow==2.16.1 -q
# !pip install keras

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install accelerate==0.27.2
!pip install datasets

In [None]:
import transformers
# Show which transformers build is active in this kernel (useful because the
# install cell above decides which version the rest of the notebook runs on).
print(transformers.__version__)

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus and load the English stop-word list.
# NOTE(review): `stopw` is not referenced by any cell visible in this
# notebook -- confirm it is still needed before keeping the download.
nltk.download('stopwords')
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm

In [None]:
# Load the raw question/difficulty table from a CSV next to the notebook.
root_path = 'tech_difficulty_data.csv'
df = pd.read_csv(root_path)

# Fail early, with a readable message, if the file does not carry the columns
# that the later cells rely on ('technology' is dropped, 'difficulty' becomes
# the label, 'question' is the text that gets tokenized).
_missing_columns = {'question', 'difficulty', 'technology'} - set(df.columns)
assert not _missing_columns, (
    f"{root_path} is missing expected columns: {sorted(_missing_columns)}"
)

df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Working copy of the data without the 'technology' column; `df` itself is
# left unchanged by this cell.
df_new = df.drop(columns=['technology'])

In [None]:
# Display the frame that no longer has the 'technology' column.
df_new

In [None]:
# List the distinct difficulty values present in the data.
# NOTE(review): later cells subtract 1 from this column and train with
# num_labels=5, which presumes the values here are exactly 1-5 -- confirm.
df_new['difficulty'].unique()

In [41]:
# Independent snapshot of the frame as loaded, taken before the cells below
# drop and rename columns on `df`.
df1 = df.copy(deep=True)

In [None]:
# Display the snapshot copy of the loaded frame.
df1

In [None]:
# Remove the 'technology' column from the training frame.
# errors='ignore' makes the cell idempotent: the original in-place drop raised
# KeyError when the cell was executed a second time, because the column was
# already gone. Rebinding `df` (instead of inplace=True) leaves the `df1`
# snapshot and `df_new` untouched, exactly as before.
df = df.drop(columns='technology', errors='ignore')

In [None]:
# df['label'] = df['technology'].astype('category').cat.codes

In [63]:
# The training code further down reads the target from a column called
# 'label', so rename 'difficulty' accordingly.
df = df.rename(columns={'difficulty': 'label'})

In [71]:
# Convert the difficulty scores into the 0-based class ids expected by a
# 5-class classification head (assumes the raw scores are 1-5; the assertion
# below enforces the resulting 0-4 range).
#
# The shift is computed from the untouched snapshot `df1` rather than from
# df['label'] itself. The original `df['label'] = df['label'] - 1` subtracted
# one more on every re-execution of the cell, silently corrupting the labels.
df['label'] = df1['difficulty'] - 1

# Fail fast with a readable message instead of an opaque error during training.
assert df['label'].between(0, 4).all(), "labels must lie in 0-4 for a 5-class head"

In [None]:
# df.drop('technology', axis = 1, inplace=True)

In [None]:
# Display the frame after the column drop, rename and label shift above.
df

In [73]:
# Plain Python lists of the question texts and of their class ids.
data_texts = df['question'].tolist()
data_labels = df['label'].tolist()

In [74]:
# Hold out 20% of the rows for evaluation.
# random_state is fixed so the split (and therefore the reported metrics) is
# identical on every Restart & Run All; without it each run shuffles the rows
# differently.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap the pandas splits as Hugging Face datasets.
# preserve_index=False stops the shuffled pandas index (left over from
# train_test_split) from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


def tokenize_data(examples):
    """Tokenize the ``question`` field of a batch of rows.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; it
            must provide a ``"question"`` entry.

    Returns:
        The tokenizer output for those questions. Truncation is enabled; no
        padding is requested here (a DataCollatorWithPadding is created in
        the training cell for that).
    """
    return tokenizer(examples["question"], truncation=True)


tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

In [None]:
import inspect

from transformers import (
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed the RNGs BEFORE the model is built: the classification head on top of
# the pre-trained encoder is freshly initialised, so without a seed every run
# starts from different weights.
set_seed(42)

# Pre-trained DistilBERT encoder with a 5-way sequence-classification head
# (class ids 0-4, matching the shifted 'label' column).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

# Pads each batch dynamically; the tokenization step only truncates.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The install cell may pull in a recent transformers build. Newer releases
# renamed `evaluation_strategy` to `eval_strategy` (TrainingArguments) and
# `tokenizer` to `processing_class` (Trainer). Pick whichever name the
# installed version actually accepts so this cell runs on old and new builds.
_args_params = inspect.signature(TrainingArguments).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"

_trainer_params = inspect.signature(Trainer).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# NOTE(review): learning_rate=2e-4 is on the high side compared with the
# 2e-5 to 5e-5 range commonly used for BERT-style fine-tuning -- confirm it is
# intentional. The value is left unchanged here.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_strategy="epoch",
    **{_eval_key: "epoch"},  # evaluate once per epoch
)

# Trainer wiring: model, hyper-parameters, both tokenized splits, the padding
# collator, and the tokenizer (so it is available to the Trainer).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Fine-tune the model. training_args requests 2 epochs, with evaluation and
# logging once per epoch.
trainer.train()

In [78]:
# Persist the fine-tuned model to a local directory; the next section reloads
# both the model and a tokenizer from this same path.
# NOTE(review): that reload relies on the tokenizer files also being written
# here by the Trainer -- confirm.
trainer.save_model('difficulty-classification-model')

In [None]:
# Evaluate on the held-out split given to the Trainer as eval_dataset.
# NOTE(review): no compute_metrics function was passed to the Trainer, so this
# presumably reports loss/runtime figures only, not accuracy -- confirm.
trainer.evaluate()

## Load the fine-tuned model and predict

In [80]:
# Reload the artefacts written by trainer.save_model(...).
# AutoTokenizer is used (instead of a hard-coded DistilBertTokenizer) so the
# tokenizer class is resolved from the saved files, consistent with the
# AutoTokenizer used for training and the AutoModel* loader on the next line.
tokenizer_fine_tuned = AutoTokenizer.from_pretrained('difficulty-classification-model')

model_fine_tuned = AutoModelForSequenceClassification.from_pretrained('difficulty-classification-model')

## Run the model with input

In [128]:
# Example question to classify with the fine-tuned model.
test_text = 'What are lambda expressions in Java, and how are they used?'

In [None]:
# Encode the example question as PyTorch tensors ('pt'), truncating it if it
# is too long, then display the encoded result.
predict_input = tokenizer_fine_tuned.encode(test_text, truncation=True, padding=True, return_tensors='pt')
predict_input

In [None]:
import torch

# Inference only: run the forward pass under no_grad so PyTorch does not
# record an autograd graph for it (less memory, and the resulting logits carry
# no gradient history).
with torch.no_grad():
    output = model_fine_tuned(predict_input)
output

In [None]:
# Pull the raw (un-normalised) class scores out of the model output.
# NOTE(review): presumably one row with 5 scores, given a single input and
# num_labels=5 -- confirm from the displayed value.
logits = output.logits
logits

In [None]:
import torch

# Index of the highest-scoring class for the first (only) input. The tensor is
# detached from autograd before the NumPy conversion.
# NOTE(review): class ids were built as difficulty - 1, so the original
# difficulty score would be this value + 1 -- confirm before reporting it.
prediction_value = logits.argmax(dim=1).detach().numpy()[0]

print("Predicted class:", prediction_value)

In [None]:
# Same arg-max as above, shown as a tensor instead of a NumPy scalar.
logits.argmax(dim=1)

In [None]:
# Normalise the class scores along dim 1 with a softmax so they can be read as
# per-class probabilities.
probabilities = logits.softmax(dim=1)
probabilities