In [1]:
!pip install datasets
!pip install transformers datasets scikit-learn



In [5]:
!pip install --upgrade transformers



In [6]:
# Step 2: Import packages
import inspect

import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:


# Step 3: Toy labeled dataset, written as (description, label) pairs.
# Labels follow the convention used at prediction time: 0 = Low Risk, 1 = High Risk.
labeled_examples = [
    ("Borrower has stable job and high credit score.", 0),
    ("Multiple late payments and poor credit history.", 1),
    ("Owns property and has long employment history.", 0),
    ("Recent foreclosure and credit charge-offs.", 1),
]

# Column-oriented view of the same pairs, in the same order.
data = {
    'text': [text for text, _ in labeled_examples],
    'label': [label for _, label in labeled_examples],
}
dataset = Dataset.from_dict(data)

# Step 4: Load the pretrained BERT tokenizer and define the per-example tokenization step
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_fn(example):
    """Tokenize the ``'text'`` field of ``example`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding='max_length'`` and ``truncation=True``;
    whatever it returns is passed back unchanged.
    """
    text = example['text']
    encoded = tokenizer(text, padding='max_length', truncation=True)
    return encoded

# Add the tokenizer outputs to every example.
dataset = dataset.map(tokenize_fn)
# Hold out half of the examples for evaluation. The fixed seed keeps the
# train/test partition identical across runs (the split shuffles by default).
dataset = dataset.train_test_split(test_size=0.5, seed=42)

# Step 5: Load BERT model for classification
# The model is created before the Trainer exists, so nothing else in this
# notebook seeds its construction. Seed torch here so any newly initialized
# weights (presumably the 2-label classification head, which is not part of
# the base checkpoint) come out the same on every run.
torch.manual_seed(42)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 6: Define metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        pred: object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over axis 1).

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # The evaluation split is tiny, so it can easily contain no positive labels
    # or no positive predictions. zero_division=0 keeps the reported value at
    # 0.0 in that case without emitting an UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Step 7: Trainer setup
# No per-epoch evaluation is configured here; evaluation is run explicitly
# after training. (An earlier draft passed evaluation_strategy="epoch"; recent
# transformers releases spell that option eval_strategy.)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs'
)

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever name the installed
# Trainer actually accepts so this cell works on both sides of the rename.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer}
)

# Step 8: Fine-tune the model on the training split
train_output = trainer.train()

# Step 9: Score the held-out split; the metrics are shown as the cell's result
eval_metrics = trainer.evaluate()
eval_metrics


In [9]:
# Example new data to classify
test_texts = [
    "Applicant has stable income and long credit history.",
    "High debt-to-income ratio and missed payments."
]

# Step 1: Tokenize the new text
tokenized_inputs = tokenizer(test_texts, padding='max_length', truncation=True, return_tensors='pt')
# Training may have moved the model to an accelerator; the freshly tokenized
# tensors live on the CPU, so move them to the model's device to avoid a
# device-mismatch error in the forward pass.
tokenized_inputs = tokenized_inputs.to(model.device)

# Step 2: Put model in evaluation mode
model.eval()

# Step 3: Get predictions (no gradients needed)
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    logits = outputs.logits
    # Highest-scoring class per input row.
    predictions = torch.argmax(logits, dim=1)

# Step 4: Print predictions
print("Predicted risk classes:", predictions.tolist())  # 0 = Low Risk, 1 = High Risk


Predicted risk classes: [0, 1]
