In [1]:
import pandas as pd
import numpy as np
import torch
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, T5Tokenizer, AutoModelForSeq2SeqLM






In [2]:
pip install tf-keras

Note: you may need to restart the kernel to use updated packages.


In [3]:
df = pd.read_csv("ai4i2020.csv")

In [4]:
target = "Machine failure"

In [5]:
X = df.drop(columns=[target])
y = df[target]

In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# 2. Load Chronos (Prompt-based mode)
model_name = "amazon/chronos-t5-small"

# Use the slow tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small", use_fast=False)

# Load the T5 model using the correct class
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [8]:
def create_prompt(features):
    """
    Convert structured tabular row into a text prompt for LLM.
    Re-engineered to be a multiple-choice-style task for better zero-shot performance.
    """
    prompt = (
        f"Based on the following machine operational data, is a machine failure likely to occur?\n"
        f"Data points: UDI={features['UDI']}, Product ID={features['Product ID']}, Type={features['Type']},\n"
        f"Air Temp={features['Air temperature [K]']}, Process Temp={features['Process temperature [K]']},\n"
        f"Rotational Speed={features['Rotational speed [rpm]']}, Torque={features['Torque [Nm]']},\n"
        f"Tool Wear={features['Tool wear [min]']}, TWF={features['TWF']}, HDF={features['HDF']},\n"
        f"PWF={features['PWF']}, OSF={features['OSF']}, RNF={features['RNF']}.\n"
        f"Answer with either 'Failure is likely' or 'Failure is not likely'."
    )
    return prompt

def predict_prompt_improved(row):
    """
    Generate a prediction with improved parsing.
    """
    prompt = create_prompt(row)
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    # Generate a more concise output
    outputs = model.generate(**inputs, max_length=15, num_beams=5, early_stopping=True)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    
    # Improved parsing logic
    if "failure is likely" in pred_text or "likely" in pred_text:
        return 1
    elif "failure is not likely" in pred_text or "not likely" in pred_text:
        return 0
    else:
        # Fallback to simple parsing if the desired phrase isn't found
        if "1" in pred_text:
            return 1
        return 0

In [9]:
def create_few_shot_prompt(features, examples):
    """
    Create a few-shot prompt with in-context examples.
    """
    prompt_with_examples = ""
    for ex_features, ex_label in examples:
        prompt_with_examples += (
            f"Machine data: UDI={ex_features['UDI']}, Product ID={ex_features['Product ID']}, "
            f"Type={ex_features['Type']}, Air Temp={ex_features['Air temperature [K]']}, "
            f"Process Temp={ex_features['Process temperature [K]']}, Rotational Speed={ex_features['Rotational speed [rpm]']}, "
            f"Torque={ex_features['Torque [Nm]']}, Tool Wear={ex_features['Tool wear [min]']}, "
            f"TWF={ex_features['TWF']}, HDF={ex_features['HDF']}, PWF={ex_features['PWF']}, "
            f"OSF={ex_features['OSF']}, RNF={ex_features['RNF']}.\n"
            f"Prediction: {'Failure' if ex_label == 1 else 'No Failure'}\n\n"
        )

    # Add the new instance to the prompt
    prompt_with_examples += (
        f"Machine data: UDI={features['UDI']}, Product ID={features['Product ID']}, "
        f"Type={features['Type']}, Air Temp={features['Air temperature [K]']}, "
        f"Process Temp={features['Process temperature [K]']}, Rotational Speed={features['Rotational speed [rpm]']}, "
        f"Torque={features['Torque [Nm]']}, Tool Wear={features['Tool wear [min]']}, "
        f"TWF={features['TWF']}, HDF={features['HDF']}, PWF={features['PWF']}, "
        f"OSF={features['OSF']}, RNF={features['RNF']}.\n"
        f"Prediction:"
    )
    return prompt_with_examples

# In your main script:
# Select a few examples from your training data (e.g., 5 examples)
# Ensure you have a mix of positive and negative cases for balance
example_indices = [23, 55, 120, 200, 310]
few_shot_examples = [(X_train.iloc[i], y_train.iloc[i]) for i in example_indices]

# Now, use this in your prediction loop
def predict_few_shot(row, examples):
    prompt = create_few_shot_prompt(row, examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=20)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    
    if "failure" in pred_text:
        return 1
    return 0

# Test the new function
y_pred_few_shot = [predict_few_shot(X_test.iloc[i], few_shot_examples) for i in range(50)]
y_true_few_shot = y_test.iloc[:50].tolist()

print("\nüìä Few-shot In-Context Learning Performance (Zero-shot):")
print("Accuracy:", np.mean(np.array(y_true_few_shot) == np.array(y_pred_few_shot)))

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors



üìä Few-shot In-Context Learning Performance (Zero-shot):
Accuracy: 0.92


In [None]:
# 3. Few-Shot Prompt Builder
def create_few_shot_prompt(features, examples):
    prompt_with_examples = ""
    for ex_features, ex_label in examples:
        prompt_with_examples += (
            f"Machine data: UDI={ex_features['UDI']}, Product ID={ex_features['Product ID']}, "
            f"Type={ex_features['Type']}, Air Temp={ex_features['Air temperature [K]']}, "
            f"Process Temp={ex_features['Process temperature [K]']}, Rotational Speed={ex_features['Rotational speed [rpm]']}, "
            f"Torque={ex_features['Torque [Nm]']}, Tool Wear={ex_features['Tool wear [min]']}, "
            f"TWF={ex_features['TWF']}, HDF={ex_features['HDF']}, PWF={ex_features['PWF']}, "
            f"OSF={ex_features['OSF']}, RNF={ex_features['RNF']}.\n"
            f"Prediction: {'Failure' if ex_label == 1 else 'No Failure'}\n\n"
        )

    prompt_with_examples += (
        f"Machine data: UDI={features['UDI']}, Product ID={features['Product ID']}, "
        f"Type={features['Type']}, Air Temp={features['Air temperature [K]']}, "
        f"Process Temp={features['Process temperature [K]']}, Rotational Speed={features['Rotational speed [rpm]']}, "
        f"Torque={features['Torque [Nm]']}, Tool Wear={features['Tool wear [min]']}, "
        f"TWF={features['TWF']}, HDF={features['HDF']}, PWF={features['PWF']}, "
        f"OSF={features['OSF']}, RNF={features['RNF']}.\n"
        f"Prediction:"
    )
    return prompt_with_examples

def predict_few_shot(row, examples):
    prompt = create_few_shot_prompt(row, examples)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_length=20)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()

    if "failure" in pred_text:
        return 1
    return 0

# -------------------------------
# 4. Automated Example Selection
# -------------------------------
def get_balanced_examples(X_train, y_train, n=10):
    """Randomly sample balanced few-shot examples."""
    pos_idx = y_train[y_train == 1].index.tolist()
    neg_idx = y_train[y_train == 0].index.tolist()

    pos_samples = random.sample(pos_idx, min(n//2, len(pos_idx)))
    neg_samples = random.sample(neg_idx, min(n//2, len(neg_idx)))

    indices = pos_samples + neg_samples
    examples = [(X_train.loc[i], y_train.loc[i]) for i in indices]
    return examples

# -------------------------------
# 5. Run Few-Shot Inference
# -------------------------------
few_shot_examples = get_balanced_examples(X_train, y_train, n=20)  # try with 10, 20

y_pred_few_shot = []
for i in range(len(X_test)):
    pred = predict_few_shot(X_test.iloc[i], few_shot_examples)
    y_pred_few_shot.append(pred)

y_true = y_test.tolist()

# -------------------------------
# 6. Evaluation
# -------------------------------
acc = accuracy_score(y_true, y_pred_few_shot)
prec = precision_score(y_true, y_pred_few_shot)
rec = recall_score(y_true, y_pred_few_shot)
f1 = f1_score(y_true, y_pred_few_shot)
cm = confusion_matrix(y_true, y_pred_few_shot)

print("\nüìä Few-Shot In-Context Learning Performance (Chronos-T5):")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_true, y_pred_few_shot))

Expert prompting


In [None]:
def create_expert_prompt(features):
    """
    Expert-level prompt for zero-shot machine failure prediction.
    This prompt uses a structured, chain-of-thought approach with clear roles
    and output formatting to guide the model to a highly accurate prediction.
    """
    prompt = (
        "You are an AI assistant specialized in predictive maintenance analysis. "
        "Your task is to analyze machine operational data and determine if a failure is imminent. "
        "You must follow a strict three-step process to arrive at your final prediction.\n\n"
        
        "# Step 1: Data Analysis\n"
        "Analyze the following data points for anomalies or trends that may indicate a failure:\n"
        f"- Air Temp: {features['Air temperature [K]']} K\n"
        f"- Process Temp: {features['Process temperature [K]']} K\n"
        f"- Rotational Speed: {features['Rotational speed [rpm]']} RPM\n"
        f"- Torque: {features['Torque [Nm]']} Nm\n"
        f"- Tool Wear: {features['Tool wear [min]']} min\n"
        "Pay special attention to high values in Torque, Tool Wear, and low values in Rotational Speed. "
        "Also note any active failure modes (TWF, HDF, PWF, OSF, RNF) as they are direct indicators.\n\n"
        
        "# Step 2: Reasoning\n"
        "Based on your analysis, explain your reasoning in a concise paragraph. "
        "State whether the data points suggest a normal operation or show signs of stress. "
        "Reference specific values that support your conclusion.\n\n"
        
        "# Step 3: Final Prediction\n"
        "State your final prediction clearly. The prediction must be a single digit, either '1' for an imminent failure or '0' for no imminent failure. "
        "Do not include any other text or characters in this final line. "
        "Final Prediction:"
    )
    return prompt

def predict_expert(row):
    prompt = create_expert_prompt(row)
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    # Generate a longer response to allow for multi-step reasoning
    outputs = model.generate(**inputs, max_length=200, num_beams=5, early_stopping=True)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Robust parsing: Find the last line and extract the digit
    lines = pred_text.strip().split('\n')
    for line in reversed(lines):
        if 'Final Prediction:' in line:
            final_pred = line.split(':')[-1].strip()
            if '1' in final_pred:
                return 1
            else:
                return 0
    return 0 # Default return if parsing fails

In [None]:
"""# 4. Baseline Evaluation (Prompt-based)
y_pred_prompt = [predict_prompt(X_test.iloc[i]) for i in range(50)]  # small batch for demo
y_true_prompt = y_test.iloc[:50].tolist()

print("\nüìä Prompt-based Performance (Zero-shot):")
print("Accuracy:", np.mean(np.array(y_true_prompt) == np.array(y_pred_prompt)))
print("\nClassification Report:\n", classification_report(y_true_prompt, y_pred_prompt))"""

# 4. Baseline Evaluation (Prompt-based)
y_pred_prompt = [predict_prompt_improved(X_test.iloc[i]) for i in range(50)]
y_true_prompt = y_test.iloc[:50].tolist()

print("\nüìä Prompt-based Performance (Zero-shot):")
print("Accuracy:", np.mean(np.array(y_true_prompt) == np.array(y_pred_prompt)))
print("\nClassification Report:\n", classification_report(y_true_prompt, y_pred_prompt))

In [None]:
# -------------------------------
# 3. Expert-level Prompting and Prediction
# -------------------------------
def create_expert_prompt(features):
    """
    Expert-level prompt for zero-shot machine failure prediction.
    """
    prompt = (
        "You are an AI assistant specialized in predictive maintenance analysis. "
        "Your task is to analyze machine operational data and determine if a failure is imminent. "
        "You must follow a strict three-step process to arrive at your final prediction.\n\n"
        
        "# Step 1: Data Analysis\n"
        "Analyze the following data points for anomalies or trends that may indicate a failure:\n"
        f"- Air Temp: {features['Air temperature [K]']} K\n"
        f"- Process Temp: {features['Process temperature [K]']} K\n"
        f"- Rotational Speed: {features['Rotational speed [rpm]']} RPM\n"
        f"- Torque: {features['Torque [Nm]']} Nm\n"
        f"- Tool Wear: {features['Tool wear [min]']} min\n"
        f"- UDI: {features['UDI']}\n"
        f"- Product ID: {features['Product ID']}\n"
        f"- Type: {features['Type']}\n"
        f"- TWF: {features['TWF']}\n"
        f"- HDF: {features['HDF']}\n"
        f"- PWF: {features['PWF']}\n"
        f"- OSF: {features['OSF']}\n"
        f"- RNF: {features['RNF']}\n"
        "Pay special attention to high values in Torque, Tool Wear, and low values in Rotational Speed. "
        "Also note any active failure modes (TWF, HDF, PWF, OSF, RNF) as they are direct indicators.\n\n"
        
        "# Step 2: Reasoning\n"
        "Based on your analysis, explain your reasoning in a concise paragraph. "
        "State whether the data points suggest a normal operation or show signs of stress. "
        "Reference specific values that support your conclusion.\n\n"
        
        "# Step 3: Final Prediction\n"
        "State your final prediction clearly. The prediction must be a single digit, either '1' for an imminent failure or '0' for no imminent failure. "
        "Do not include any other text or characters in this final line. "
        "Final Prediction:"
    )
    return prompt

def predict_expert(row):
    prompt = create_expert_prompt(row)
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    
    outputs = model.generate(**inputs, max_length=200, num_beams=5, early_stopping=True)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    lines = pred_text.strip().split('\n')
    for line in reversed(lines):
        if 'Final Prediction:' in line:
            final_pred = line.split(':')[-1].strip()
            if '1' in final_pred:
                return 1
            else:
                return 0
    return 0

# -------------------------------
# 4. Evaluation
# -------------------------------
sample_size = 50
y_pred_expert = [predict_expert(X_test.iloc[i]) for i in range(sample_size)]
y_true_expert = y_test.iloc[:sample_size].tolist()

print("\nüìä Expert Prompt Performance (Zero-shot):")
accuracy = accuracy_score(y_true_expert, y_pred_expert)
print(f"Accuracy: {accuracy:.4f}")

# ‚úÖ Add Classification Report
from sklearn.metrics import classification_report
print("\nClassification Report:\n", classification_report(y_true_expert, y_pred_expert))

# -------------------------------
# 5. Live User Input & Prediction
# -------------------------------
print("\n--- Live Machine Failure Prediction ---")
print("Please enter the machine's current operational data:")

try:
    user_features = {}
    user_features['UDI'] = int(input("UDI: "))
    user_features['Product ID'] = input("Product ID: ")
    user_features['Type'] = input("Type (e.g., L, M, H): ")
    user_features['Air temperature [K]'] = float(input("Air temperature [K]: "))
    user_features['Process temperature [K]'] = float(input("Process temperature [K]: "))
    user_features['Rotational speed [rpm]'] = int(input("Rotational speed [rpm]: "))
    user_features['Torque [Nm]'] = float(input("Torque [Nm]: "))
    user_features['Tool wear [min]'] = float(input("Tool wear [min]: "))
    user_features['TWF'] = int(input("TWF (Tool Wear Failure, 0 or 1): "))
    user_features['HDF'] = int(input("HDF (Heat Dissipation Failure, 0 or 1): "))
    user_features['PWF'] = int(input("PWF (Power Failure, 0 or 1): "))
    user_features['OSF'] = int(input("OSF (Overstrain Failure, 0 or 1): "))
    user_features['RNF'] = int(input("RNF (Random Failure, 0 or 1): "))
    
    # Convert user input to a pandas Series to match the model's expectation
    user_row = pd.Series(user_features)
    
    # Make the prediction
    prediction = predict_expert(user_row)
    
    print("\n--- Prediction Result ---")
    if prediction == 1:
        print("üî¥ Prediction: The model predicts an **imminent machine failure**.")
    else:
        print("üü¢ Prediction: The model predicts **no imminent failure**.")

except ValueError:
    print("‚ùå Invalid input! Please ensure you enter the correct data type (numbers for numeric fields, etc.).")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [None]:
!pip install timesfm

In [None]:
from sklearn.metrics import classification_report
from timesfm import TimesFM
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np

# Load TimesFM model
timesfm_model = TimesFM()
timesfm_model.fit(X_train, y_train)
y_pred_timesfm = timesfm_model.predict(X_test)

# Load prompt-based model
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def predict_prompt(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    return prediction

y_pred_prompt = [predict_prompt(X_test.iloc[i]) for i in range(50)]
y_true_prompt = y_test.iloc[:50].tolist()

# Evaluate TimesFM model
print("üìä TimesFM Model Performance:")
print("Accuracy:", np.mean(y_test == y_pred_timesfm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_timesfm))

# Evaluate prompt-based model
print("\nüìä Prompt-based Model Performance:")
print("Accuracy:", np.mean(np.array(y_true_prompt) == np.array(y_pred_prompt)))
print("\nClassification Report:\n", classification_report(y_true_prompt, y_pred_prompt))

In [None]:
# -------------------------------
# TimesFM-based model block
# -------------------------------
import torch
from transformers import TimesFmModelForPrediction

# Load TimesFM pretrained model
timesfm_model = TimesFmModelForPrediction.from_pretrained(
    "google/timesfm-2.0-500m-pytorch",
    torch_dtype=torch.float32,  # or bfloat16 if your GPU supports
    device_map="auto"
)
timesfm_model.to(device)

def predict_with_timesfm(history_series, freq, threshold):
    """
    history_series: a 1D numpy / list / tensor of past values relevant to failure signal
    freq: frequency index if needed
    threshold: a numeric threshold beyond which we say failure is likely
    Returns: 1 if failure is predicted, else 0
    """
    # Prepare input: convert to tensor
    past = torch.tensor(history_series, dtype=torch.float32).to(timesfm_model.device)
    freq_tensor = torch.tensor([freq], dtype=torch.long).to(timesfm_model.device)
    # If needed batch dimension
    past = past.unsqueeze(0)  # shape (batch=1, sequence_length)
    
    with torch.no_grad():
        out = timesfm_model(past_values=past, freq=freq_tensor, return_dict=True)
        # Get forecast: maybe the mean_predictions or so
        mean_pred = out.mean_predictions  # shape (batch, horizon_length)
        # Convert to CPU, numpy
        mean_np = mean_pred.cpu().numpy()[0]
        
    # Now derive classification: if any forecasted value beyond threshold ‚Üí failure
    if (mean_np > threshold).any():
        return 1
    else:
        return 0

# Evaluate on test set
y_pred_timesfm = []
y_true_timesfm = []

# Assume you have some way of getting "history_series" for each test sample and freq
# e.g., maybe you pick some numeric feature(s) that indicate machine health over time

for i in range(len(X_test)):
    # Example: pick "Tool wear" or combination of features forming a time-series; 
    # **you will need to decide what constitutes the ‚Äúseries‚Äù** for your problem
    # For simplicity assume one feature that evolves over time is available
    
    history = ...  # your code to get past values for this instance
    freq = 0  # or some frequency encoding
    thresh = SOME_THRESHOLD  # you define, via domain knowledge or tuning
    
    pred = predict_with_timesfm(history, freq, thresh)
    y_pred_timesfm.append(pred)
    y_true_timesfm.append(y_test.iloc[i])

from sklearn.metrics import classification_report, accuracy_score
print("\nüìä TimesFM-based model performance:")
print("Accuracy:", accuracy_score(y_true_timesfm, y_pred_timesfm))
print("Classification Report:\n", classification_report(y_true_timesfm, y_pred_timesfm))

Fine Tune 

In [None]:
# -------------------------------
# 5. Fine-tuning Chronos
# -------------------------------
# Convert dataset into text ‚Üí text format
train_texts = [
    create_prompt(X_train.iloc[i], y_train.iloc[i]) for i in range(len(X_train))
]
test_texts = [
    create_prompt(X_test.iloc[i], y_test.iloc[i]) for i in range(len(X_test))
]

# Dataset wrapper
class PromptDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

train_dataset = PromptDataset(train_texts, tokenizer)
test_dataset = PromptDataset(test_texts, tokenizer)

# Training args
training_args = TrainingArguments(
    output_dir="./chronos-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("\nüöÄ Fine-tuning Chronos...")
trainer.train()

# -------------------------------
# 6. Evaluation after fine-tuning
# -------------------------------
def evaluate_model(trainer, texts, labels):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    preds = [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

    y_pred = [1 if "1" in p else 0 for p in preds]
    acc = np.mean(np.array(labels) == np.array(y_pred))
    return acc

acc_finetuned = evaluate_model(trainer, test_texts[:50], y_test.iloc[:50].tolist())
print("\nüìä Fine-tuned Accuracy:", acc_finetuned)