In [1]:
!pip install datasets
!pip install transformers datasets scikit-learn



In [5]:
!pip install --upgrade transformers



In [6]:
# Step 2: Import packages
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [8]:


# Step 3: Example labeled dataset
data = {
    'text': [
        "Borrower has stable job and high credit score.",
        "Multiple late payments and poor credit history.",
        "Owns property and has long employment history.",
        "Recent foreclosure and credit charge-offs."
    ],
    'label': [0, 1, 0, 1]
}
dataset = Dataset.from_dict(data)

# Step 4: Load tokenizer and tokenize text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_fn(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

dataset = dataset.map(tokenize_fn)
dataset = dataset.train_test_split(test_size=0.5)

# Step 5: Load BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 6: Define metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Step 7: Trainer setup
'''
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10
)
'''

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Step 8: Train the model
trainer.train()

# Step 9: Evaluate
trainer.evaluate()


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbumpsskkier[0m ([33mbumpsskkier-ffmpeg[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


{'eval_loss': 0.5770449638366699,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_runtime': 3.1243,
 'eval_samples_per_second': 0.64,
 'eval_steps_per_second': 0.32,
 'epoch': 3.0}

In [9]:
# Example new data to classify
test_texts = [
    "Applicant has stable income and long credit history.",
    "High debt-to-income ratio and missed payments."
]

# Step 1: Tokenize the new text
tokenized_inputs = tokenizer(test_texts, padding='max_length', truncation=True, return_tensors='pt')

# Step 2: Put model in evaluation mode
model.eval()

# Step 3: Get predictions (no gradients needed)
with torch.no_grad():
    outputs = model(**tokenized_inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, axis=1)

# Step 4: Print predictions
print("Predicted risk classes:", predictions.tolist())  # 0 = Low Risk, 1 = High Risk


Predicted risk classes: [0, 1]
