Import requirements

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer, BertForSequenceClassification

Build the synthetic dataset

In [None]:
# Each record pairs one question with its difficulty label.
labelled_questions = [
    ('Explain the concept of time complexity in algorithms.', 3),
    ('What is a binary search tree and how does it work?', 4),
    ('Write a code snippet in Python to reverse a list.', 2),
    ('Discuss the importance of data normalization in databases.', 5),
    ('What is the Big O notation?', 3),
    ('Compare and contrast SQL and NoSQL databases.', 4),
]

# Column-oriented view of the same records, keyed by column name.
data = {
    'question_text': [question for question, _ in labelled_questions],
    'difficulty_label': [difficulty for _, difficulty in labelled_questions],
}

df = pd.DataFrame(data)

Split the dataset into train and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
question_texts = df['question_text']
difficulty_labels = df['difficulty_label']
X_train, X_test, y_train, y_test = train_test_split(
    question_texts,
    difficulty_labels,
    test_size=0.2,
    random_state=42,
)

TF-IDF Method

In [None]:
# Vectorize the question text. The vectorizer is fitted on the training
# split only and then reused, unchanged, to transform the test split.
tfidf = TfidfVectorizer()
X_tfidf_train = tfidf.fit_transform(X_train)
X_tfidf_test = tfidf.transform(X_test)

# Baseline: regress the difficulty label on the TF-IDF features and score
# the held-out predictions with mean squared error.
lr_model_tfidf = LinearRegression().fit(X_tfidf_train, y_train)
lr_predictions_tfidf = lr_model_tfidf.predict(X_tfidf_test)
lr_mse_tfidf = mean_squared_error(y_true=y_test, y_pred=lr_predictions_tfidf)

BERT Method

In [None]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

# Two output labels, i.e. a binary classification head.
# NOTE(review): difficulty_label in this notebook takes values 2-5; confirm the
# labels are mapped to two classes before they reach this model.
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)

Set up GPU/CPU

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

Define training parameters

In [None]:
# The linear schedule decays the learning rate to zero over
# `num_training_steps` optimizer steps, so that count must cover the whole run:
# batches per epoch x number of epochs. The training loop runs 750 epochs;
# sizing the schedule for only 10 drove the learning rate to zero after
# epoch 10 and left the remaining epochs training with a zero learning rate.
# NOTE(review): `train_loader` is not defined in any visible cell — confirm
# it is created before this cell runs.
NUM_EPOCHS = 750  # must match the epoch count used by the training loop

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * NUM_EPOCHS,
)

Training loop

In [None]:
# Fine-tune the model: every epoch makes one pass over train_loader, and the
# optimizer and LR scheduler are both stepped once per batch.
model.train()
for epoch in range(750):
    total_loss = 0
    for raw_batch in train_loader:
        # Move every tensor in the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        loss = model(**batch).loss
        total_loss += loss.item()

        # Backpropagate, update weights, advance the schedule, then clear
        # the gradients so they do not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Report the mean per-batch loss for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Evaluation

In [None]:
# Evaluate on test_loader without tracking gradients. Besides the running
# correct/total counters, the predicted class of every example is collected in
# `bert_predictions`: the later metric and plotting cells read that name, and
# no other visible cell defines it.
# NOTE(review): predictions are appended in test_loader iteration order;
# confirm that order matches y_test before comparing the two.
model.eval()
total_correct = 0
total_samples = 0
bert_predictions = []
with torch.no_grad():
    for batch in test_loader:
        # Move every tensor in the batch onto the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along dim 1.
        predicted = torch.argmax(logits, dim=1)
        labels = batch['labels']
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        # Keep per-example predictions as plain Python ints on the host.
        bert_predictions.extend(predicted.cpu().tolist())

Calculate accuracy

In [None]:
# Accuracy = correctly classified test examples / all test examples seen.
accuracy = total_correct / total_samples
accuracy_message = f"Accuracy on test set: {accuracy * 100:.2f}%"
print(accuracy_message)

Generate JSON documentation using OpenAI GPT-3

In [None]:
def generate_json_documentation(topic):
    """Ask an OpenAI completion model for JSON documentation on a topic.

    Args:
        topic: Subject to document; interpolated verbatim into the prompt.

    Returns:
        The ``text`` field of the first choice in the response, unmodified.
    """
    # Imported here because the notebook's import cell never brings `openai`
    # into scope; without it the call below fails with NameError.
    import openai

    prompt = f"Generate JSON documentation for the topic: {topic}"
    # NOTE(review): `openai.Completion` and "text-davinci-002" belong to the
    # pre-1.0 openai client — confirm the installed client still supports them.
    response = openai.Completion.create(
        engine="text-davinci-002",  # engine name; adjust to preference
        prompt=prompt,
        max_tokens=500,  # upper bound on generated tokens
        n=1,  # request a single completion
        stop=None,
    )
    generated_doc = response['choices'][0]['text']
    return generated_doc

# Example usage: generate documentation for one topic and echo both the
# question and the model's answer.
topic_input = "Explain Higher Order Classification in React development"
json_documentation = generate_json_documentation(topic_input)

print(
    f"Question: {topic_input}",
    f"Generated JSON documentation:\n{json_documentation}",
    sep="\n",
)

# Comparison of models using metrics
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, accuracy_score

# Assuming lr_predictions_tfidf and bert_predictions are available.
# NOTE(review): f1_score and roc_auc_score are called with their defaults,
# which presumably expect binary targets; y_test is drawn from difficulty_label
# (values 2-5) — confirm the labels are binarized consistently with
# bert_predictions before trusting these numbers.
lr_mse_tfidf = mean_squared_error(y_true=y_test, y_pred=lr_predictions_tfidf)
bert_f1 = f1_score(y_true=y_test, y_pred=bert_predictions)
bert_roc_auc = roc_auc_score(y_true=y_test, y_score=bert_predictions)
bert_accuracy = accuracy_score(y_true=y_test, y_pred=bert_predictions)

# One line per metric, in the same order they were computed.
print(
    f"TF-IDF Mean Squared Error: {lr_mse_tfidf}",
    f"BERT F1 Score: {bert_f1}",
    f"BERT ROC AUC Score: {bert_roc_auc}",
    f"BERT Accuracy: {bert_accuracy}",
    sep="\n",
)

Visualizing metrics (ROC curve, confusion matrix)

In [None]:
# ROC points and the area under them, computed from y_test and bert_predictions.
fpr, tpr, thresholds = roc_curve(y_test, bert_predictions)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the diagonal reference line.
roc_fig, roc_ax = plt.subplots(figsize=(6, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc='lower right')
plt.show()

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, bert_predictions)

# Plot the confusion matrix as an annotated heatmap.
# NOTE(review): the "Easy"/"Hard" tick labels assume a 2x2 matrix — confirm
# the labels are binary.
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
class_names = ["Easy", "Hard"]
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=class_names, yticklabels=class_names, ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
plt.show()

Classification Report

In [None]:
# Text report of the classification metrics for the BERT predictions.
# Scores y_test against bert_predictions — the same pair the other metric
# cells use — because y_true / y_pred are not defined anywhere in this notebook.
classification_rep = classification_report(y_test, bert_predictions)
print("Classification Report:")
print(classification_rep)