In [1]:
!pip install --upgrade ml_dtypes

Collecting ml_dtypes
  Downloading ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.9 kB)
Downloading ml_dtypes-0.5.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ml_dtypes
  Attempting uninstall: ml_dtypes
    Found existing installation: ml-dtypes 0.3.2
    Uninstalling ml-dtypes-0.3.2:
      Successfully uninstalled ml-dtypes-0.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.16.1 requires ml-dtypes~=0.3.1, but you have ml-dtypes 0.5.3 which is incompatible.
tf-keras 2.19.0 requires tensorflow<2.20,>=2.19, but you have tensorflow 2.16.1 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is 

In [4]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import numpy as np
import os
from google.colab import files
import io

# Print versions to confirm
print(f"TensorFlow Version: {tf.__version__}")
print("---")

# --- DATA UPLOAD & LOAD ---
# files.upload() opens the Colab upload widget; the result is indexed below by
# file name to obtain the raw bytes of the uploaded file.
print("Please upload your overview.csv file:")
uploaded = files.upload()

# Nothing to read if the upload was cancelled: fail with an actionable message
# instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select overview.csv.")

# Only the first uploaded file is read; any additional files are ignored.
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]))
print("\nData loaded successfully.")
print("---")


# Quick sanity check of what was loaded before any preprocessing happens.
print("First 5 rows of the loaded data:")
print(df.head())
print("\nColumn names found in the CSV:")
print(df.columns)


# --- DATA PREPROCESSING (Final Corrected Version) ---
# Reduces the raw frame to (text, label) pairs for the most frequent prompts,
# encodes the labels as integers and produces a train/validation split.
print("Preprocessing data...")

# Use the CORRECT column names that exist in the file.
TEXT_COLUMN = 'phrase'
LABEL_COLUMN = 'prompt'
NUM_CLASSES = 5      # number of most frequent prompts kept as classes
VAL_FRACTION = 0.2   # share of rows held out for validation
SPLIT_SEED = 42      # fixed seed so the split is reproducible

# Fail early, with the available column names, if the CSV has a different schema.
missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Required column(s) {missing_columns} not found in the CSV. "
        f"Available columns: {list(df.columns)}"
    )

# Get the top 5 most common prompts to use as our classes.
top_5_labels = df[LABEL_COLUMN].value_counts().nlargest(NUM_CLASSES).index
df_filtered = df[df[LABEL_COLUMN].isin(top_5_labels)].copy()

# Ensure our working dataframe has no missing values in the columns we need.
df_filtered = df_filtered[[TEXT_COLUMN, LABEL_COLUMN]].dropna()
if df_filtered.empty:
    raise ValueError(
        f"No usable rows left after filtering on '{LABEL_COLUMN}' and dropping missing values."
    )

# Encode the text labels (prompts) into numbers (0, 1, 2, 3, 4)
label_encoder = LabelEncoder()
df_filtered['label'] = label_encoder.fit_transform(df_filtered[LABEL_COLUMN])

# This label_map is the key to decoding the model's output.
label_map = {i: label for i, label in enumerate(label_encoder.classes_)}
print("--- IMPORTANT ---")
print(f"Label Mapping: {label_map}")

# Prepare the data for the model.
# The tokenizer expects plain strings, so coerce any non-string cell values.
X = df_filtered[TEXT_COLUMN].astype(str).tolist()
y = df_filtered['label'].tolist()

# Stratify so every class keeps the same proportion in both splits. Stratifying
# needs at least two rows per class and a validation split large enough to hold
# every class; otherwise fall back to a plain random split.
class_counts = df_filtered['label'].value_counts()
can_stratify = bool(
    class_counts.min() >= 2 and len(y) * VAL_FRACTION >= len(class_counts)
)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=y if can_stratify else None,
)
print("---")

# --- TOKENIZATION ---
# Converts the raw phrases into DistilBERT inputs and wraps them, together with
# the integer labels, in batched tf.data pipelines.
print("Tokenizing data...")
model_name = 'distilbert-base-uncased'
MAX_SEQ_LEN = 128   # every example is truncated/padded to exactly this many tokens
BATCH_SIZE = 16
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# padding='max_length' gives train and validation inputs the same fixed length.
# padding=True would pad each call to the longest sequence in that call, so the
# two splits could end up with different, data-dependent sequence lengths.
train_encodings = tokenizer(X_train, truncation=True, padding='max_length', max_length=MAX_SEQ_LEN)
val_encodings = tokenizer(X_val, truncation=True, padding='max_length', max_length=MAX_SEQ_LEN)

# The shuffle buffer covers the whole training set so every epoch is a full
# shuffle regardless of how many rows the CSV contains.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(buffer_size=len(y_train))
    .batch(BATCH_SIZE)
)
# Validation data is only batched; its order does not matter.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_val)).batch(BATCH_SIZE)
print("Tokenization complete.")
print("---")

# --- MODEL TRAINING ---
# Loads the pretrained checkpoint with a classification head sized to the
# number of encoded labels, then fine-tunes it on the training pipeline.
print("Loading and training model...")
num_labels = len(label_map)
model = TFDistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Labels are integer class ids, hence the sparse loss; from_logits=True makes
# the loss apply the softmax to the model outputs itself.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

# Single pass over the training data, evaluated on the validation split.
epochs = 1
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
)
print("Training complete.")
print("---")

# --- TFLITE CONVERSION & DOWNLOAD ---
# Converts the fine-tuned Keras model to a TensorFlow Lite flatbuffer, writes it
# to disk together with the class labels, and triggers browser downloads.
print("Converting model to TensorFlow Lite...")
converter = tf.lite.TFLiteConverter.from_keras_model(model)
# Enable the converter's default optimization set.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()
tflite_model_path = 'symptom_model.tflite'
with open(tflite_model_path, 'wb') as f:
    f.write(tflite_model)
file_size_kb = os.path.getsize(tflite_model_path) / 1024
print(f"\nSuccessfully converted and saved model to {tflite_model_path}")
print(f"File size: {file_size_kb:.2f} KB")

# The model only outputs class indices; without the index -> prompt mapping the
# predictions cannot be decoded once this session is gone. Persist it as one
# label per line, where line N (0-based) is the label for class index N.
# Assumes the labels themselves contain no newline characters.
labels_path = 'symptom_labels.txt'
with open(labels_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(str(label_map[index]) for index in sorted(label_map)))
print(f"Saved label mapping to {labels_path}")

files.download(tflite_model_path)
print(f"\nDownload of '{tflite_model_path}' initiated.")
files.download(labels_path)
print(f"Download of '{labels_path}' initiated.")

TensorFlow Version: 2.16.1
---
Please upload your overview.csv file:


Saving overview.csv to overview (2).csv

Data loaded successfully.
---
First 5 rows of the loaded data:
   audio_clipping  audio_clipping:confidence background_noise_audible  \
0     no_clipping                     1.0000              light_noise   
1  light_clipping                     0.6803                 no_noise   
2     no_clipping                     1.0000                 no_noise   
3     no_clipping                     1.0000              light_noise   
4     no_clipping                     1.0000                 no_noise   

   background_noise_audible:confidence  overall_quality_of_the_audio  \
0                               1.0000                          3.33   
1                               0.6803                          3.33   
2                               0.6655                          3.33   
3                               1.0000                          3.33   
4                               1.0000                          4.67   

     quiet_speaker  quie

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Tokenization complete.
---
Loading and training model...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Cause: for/else statement not yet supported
Training complete.
---
Converting model to TensorFlow Lite...





Successfully converted and saved model to symptom_model.tflite
File size: 65716.18 KB


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Download of 'symptom_model.tflite' initiated.
