# Stroke Prediction Model Development
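This notebook develops a stroke prediction model by fine-tuning a pretrained DistilBERT model from Hugging Face. Because DistilBERT is a text model, each patient's tabular features are first serialized into a short text string and then tokenized. The steps are: loading and preprocessing the dataset, tokenizing it, fine-tuning the model, evaluating its performance, and saving the model.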

## 1. Import Libraries

In [10]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from pre_process import preprocess_data

# Seed PyTorch and NumPy with the same value so repeated runs are comparable
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

## 2. Load and Preprocess Data

In [14]:
# Read the CSV through the project's preprocessing helper
data_path = 'data/stroke_data.csv'
dataset = preprocess_data(data_path)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = train_test_split['train'], train_test_split['test']

# Report the size of each split
print('Training samples:', len(train_dataset))
print('Test samples:', len(test_dataset))

Training samples: 4088
Test samples: 1022


## 3. Load Pretrained Model and Tokenizer

In [16]:
# Checkpoint shared by the tokenizer and the classifier
model_name = 'distilbert-base-uncased'

# Tokenizer matching the checkpoint's vocabulary
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with two output classes (binary classification)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Tokenize Dataset

In [17]:
# Tokenize function: turn each record's features into one sentence for DistilBERT
def tokenize_function(examples):
    """Serialize a batch of records to text and tokenize it.

    Args:
        examples: Batch mapping that provides the columns 'age', 'bmi',
            'hypertension', 'heart_disease', 'avg_glucose_level' and
            'smoking_status', each holding one value per record.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the
        generated sentences, padded and truncated to 128 tokens.
    """
    feature_rows = zip(
        examples['age'],
        examples['bmi'],
        examples['hypertension'],
        examples['heart_disease'],
        examples['avg_glucose_level'],
        examples['smoking_status'],
    )

    text = []
    for a, b, h, hd, gl, s in feature_rows:
        text.append(
            f"age: {a}, bmi: {b}, hypertension: {h}, heart_disease: {hd}, avg_glucose_level: {gl}, smoking_status: {s}"
        )

    return tokenizer(text, padding='max_length', truncation=True, max_length=128)

# Tokenize train and test datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Expose the target under the name `labels`.
# By default Trainer drops every column that does not match an argument of
# model.forward(), and the sequence-classification model reads its targets from
# `labels`. Left as `stroke`, the column would be discarded, no loss would be
# computed, and compute_metrics would receive no labels.
# The membership check keeps the rename from failing if the column was already renamed.
# NOTE(review): assumes `stroke` holds integer class ids (0/1) - confirm in preprocess_data.
if 'stroke' in train_dataset.column_names:
    train_dataset = train_dataset.rename_column('stroke', 'labels')
if 'stroke' in test_dataset.column_names:
    test_dataset = test_dataset.rename_column('stroke', 'labels')

# Set format for PyTorch: return only the model inputs and the target as tensors
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map: 100%|██████████| 4088/4088 [00:05<00:00, 768.52 examples/s]
Map: 100%|██████████| 1022/1022 [00:00<00:00, 1290.03 examples/s]


## 5. Fine-Tune Model

In [28]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Define compute_metrics function
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` carries the class
            scores along its last axis and ``labels`` the true class ids.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 use ``average='binary'``, i.e. they are
        reported for the positive class.
    """
    logits, labels = eval_pred
    # Predicted class = index of the highest score on the last axis
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 states explicitly what to report when a metric is
    # undefined (e.g. the model predicts no positive case at all). The value is
    # the same 0 the default produces, but without an UndefinedMetricWarning on
    # every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Wire the model, configuration, data splits and metric function into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

## 6. Evaluate Model

In [None]:
# Evaluate on test set: the Trainer was constructed with eval_dataset=test_dataset
# and compute_metrics, so no arguments are passed here.
eval_results = trainer.evaluate()
# Print the results object returned by evaluate()
print('Evaluation Results:', eval_results)

## 7. Save Model and Tokenizer

In [None]:
# Write the fine-tuned model, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('../ai-model/model')
print('Model and tokenizer saved to ../ai-model/model/')