In [None]:
from google.colab import files
import io
import pandas as pd
# Ask the user for a CSV through the Colab upload widget and load it into `df`.
print("Please upload your csv file.")
uploaded = files.upload()

# The upload result is a mapping of file name -> file bytes. If nothing was
# uploaded (e.g. the dialog was cancelled), fail with a clear message instead
# of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# The training cell reads df['label'] and df['text'], so check for them here
# rather than failing later with a KeyError.
missing_columns = {"label", "text"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{file_name} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"\nSuccessfully loaded {file_name}!")
df.head()

Please upload your csv file.


Saving generated_contacts_dataset.csv to generated_contacts_dataset.csv

Successfully loaded generated_contacts_dataset.csv!


Unnamed: 0,label,text
0,address,"9636 Elm Crescent, Tokyo"
1,phone,+471 186 4212837
2,email,sales9944@service.org
3,phone,+91 6128260438
4,address,Elm Crescent 9228\n10001 Mumbai\nRussia


In [None]:
!pip install tensorflow>=2.12.0 tflite-support>=0.4.3 transformers>=4.26.0


import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import json
import pandas as pd
import os
from google.colab import files


In [None]:

# Quietly (-q) install the tflite-support and transformers packages.
# Neither requirement carries a version specifier here, so pip picks the release.
!pip install -q tflite-support
!pip install -q transformers

import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import json
import pandas as pd

print(f"Original number of rows: {len(df)}")
# Compare rows on the label and text columns only. The loaded frame also carries
# a running-number column ('Unnamed: 0'); because that value is unique per row,
# a drop_duplicates() over all columns can never find a duplicate.
df = df.drop_duplicates(subset=['label', 'text']).reset_index(drop=True)
print(f"Number of rows after removing duplicates: {len(df)}")

# 1. Preprocess the Data
# Build the label <-> integer id mappings. Ids follow the order in which each
# label first appears in the data.
label2id = {label: i for i, label in enumerate(df['label'].unique())}
id2label = {i: label for label, i in label2id.items()}
df['label_id'] = df['label'].map(label2id)

# Hold out 20% of the rows for validation. The split is stratified so both parts
# keep the same class proportions; stratification needs at least two rows per
# class, so fall back to a plain random split when a class is smaller than that.
stratify_labels = df['label_id'] if df['label_id'].value_counts().min() >= 2 else None
X_train, X_val, y_train, y_val = train_test_split(
    df['text'],
    df['label_id'],
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# 2. Set up the Tokenizer and Datasets
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize each split in one call; sequences are truncated to at most 128 tokens
# and padded to a common length within the split.
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=128)

# The shuffle buffer covers the whole training set so every epoch sees a uniform
# shuffle; a buffer smaller than the dataset only shuffles within a sliding window.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.tolist()))
    .shuffle(len(X_train))
    .batch(16)
)
# Validation data is batched but deliberately left unshuffled.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), y_val.tolist())).batch(16)

# 3. Set up the DistilBERT Model
# The classification head gets one output per distinct label, and the label
# mappings are handed to the model alongside it.
num_labels = len(label2id)
_classifier_options = dict(
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    from_pt=True,  # initialise the TF model from the PyTorch checkpoint
)
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', **_classifier_options
)

# 4. Train the Model
# Fine-tune the classifier for a fixed number of epochs, validating after each one.
num_epochs = 5
# len(train_dataset) counts batches, so this is the total number of optimizer
# steps. NOTE(review): the value is not used anywhere in the visible cells —
# presumably intended for a learning-rate schedule; confirm before removing.
num_train_steps = len(train_dataset) * num_epochs

# from_logits=True: the loss receives the model's raw logits, not probabilities.
loss = SparseCategoricalCrossentropy(from_logits=True)
# The optimizer is created from its string name, then its learning rate is
# overwritten on the next line; the assignment must therefore stay after compile().
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
model.optimizer.learning_rate.assign(1e-4)

model.fit(train_dataset, validation_data=val_dataset, epochs=num_epochs)

# === METADATA CREATION FOR ANDROID ===
print("\n--- Creating Android metadata files ---")

# Create labels.txt file: one label per line, line number == class id.
labels_content = "\n".join([id2label[i] for i in range(len(id2label))])
# Write as UTF-8 explicitly so the file does not depend on the machine's
# locale-default encoding.
with open('labels.txt', 'w', encoding='utf-8') as f:
    f.write(labels_content)
print("✓ Created labels.txt")

# Create vocab.txt from tokenizer: one token per line, ordered by token id.
vocab = tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
vocab_content = "\n".join([token for token, _ in sorted_vocab])
# The vocabulary contains non-ASCII tokens, so the encoding must be explicit:
# a non-UTF-8 default would either raise or produce a file that is decoded
# differently when it is read back.
with open('vocab.txt', 'w', encoding='utf-8') as f:
    f.write(vocab_content)
print("✓ Created vocab.txt")

# === CRITICAL FIX: Use BertNLClassifier-compatible tensor names ===
print("\n--- Creating wrapper model with CORRECT tensor names for BertNLClassifier ---")

class DistilBertWrapper(tf.keras.Model):
    """Three-input adapter around a DistilBERT sequence classifier.

    Accepts `ids`, `mask` and `segment_ids`, forwards the first two to the
    wrapped model as `input_ids` / `attention_mask`, and returns the softmax
    of the resulting logits.
    """

    def __init__(self, distilbert_model):
        """Store the classifier that `call` delegates to."""
        super().__init__()
        self.distilbert = distilbert_model

    def call(self, ids, mask, segment_ids):
        """Return class probabilities for the given token ids and attention mask.

        `segment_ids` is accepted so the signature has three inputs, but it is
        never passed on to the wrapped model.
        """
        model_inputs = dict(input_ids=ids, attention_mask=mask)
        class_logits = self.distilbert(model_inputs).logits
        return tf.nn.softmax(class_logits, axis=-1)

# Wrap the trained classifier so it can be called with (ids, mask, segment_ids).
wrapped_model = DistilBertWrapper(model)

# CRITICAL: the serving signature fixes the tensor names. All three inputs
# share the same fixed shape [1, 128] and dtype int32; only the name differs.
_serving_input_specs = [
    tf.TensorSpec(shape=[1, 128], dtype=tf.int32, name=tensor_name)
    for tensor_name in ("ids", "mask", "segment_ids")
]

@tf.function(input_signature=_serving_input_specs)
def serving_function(ids, mask, segment_ids):
    """Run the wrapped model and return its probabilities under the key 'output'."""
    return {'output': wrapped_model(ids, mask, segment_ids)}

print("✓ Model wrapper created with tensor names: 'ids', 'mask', 'segment_ids'")

# === CONVERSION CODE ===
# 5. Convert the Model to TFLite
# Trace the serving function with its declared input signature and build the
# converter from that single concrete function.
# NOTE(review): presumably the exported input tensor names depend on how the
# converter is constructed — re-run the name check below if this is changed.
concrete_func = serving_function.get_concrete_function()
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func])
# Allow both TFLite builtin ops and select TensorFlow ops in the converted model.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]

# Convert and save without metadata first
tflite_model = converter.convert()
output_filename = 'distilbert_model_clean.tflite'

# The converted model is written as raw bytes, hence binary mode.
with open(output_filename, 'wb') as f:
    f.write(tflite_model)

print(f"\nModel converted successfully: {output_filename}")
# This message is printed before the tensor names are actually inspected.
print(f"Tensor names are now BertNLClassifier-compatible!")

# Verify tensor names
# Load the file that was just written and read back its input tensor details.
print("\nVerifying tensor names...")
interpreter = tf.lite.Interpreter(model_path=output_filename)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()

# List every input tensor, collecting the names for the check that follows.
print("Input tensor names:")
actual_names = []
for position, tensor_info in enumerate(input_details):
    actual_names.append(tensor_info['name'])
    print(f"  [{position}] {tensor_info['name']}")

# The check passes only when each expected name appears among the inputs.
expected_names = ['ids', 'mask', 'segment_ids']
missing_names = [name for name in expected_names if name not in actual_names]

if not missing_names:
    print("\nSUCCESS! All tensor names are correct!")
else:
    print("\n WARNING: Tensor names may not be correct")

# Download files in Colab
# Sends the converted model, then the label list, then the vocabulary.
print("\n Downloading files for Android deployment...")
from google.colab import files
for artifact_path in (output_filename, 'labels.txt', 'vocab.txt'):
    files.download(artifact_path)

Original number of rows: 10000
Number of rows after removing duplicates: 10000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSeq

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

--- Creating Android metadata files ---
✓ Created labels.txt
✓ Created vocab.txt

--- Creating wrapper model with CORRECT tensor names for BertNLClassifier ---
✓ Model wrapper created with tensor names: 'ids', 'mask', 'segment_ids'





✅ Model converted successfully: distilbert_model_clean.tflite
✅ Tensor names are now BertNLClassifier-compatible!

Verifying tensor names...
Input tensor names:
  [0] ids
  [1] mask
  [2] segment_ids

✅ SUCCESS! All tensor names are correct!

📥 Downloading files for Android deployment...


    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


NEXT STEPS:
1. Take the downloaded 'distilbert_model_clean.tflite' file
2. Run your metadata script on this file (it has correct tensor names)
3. The metadata script will attach BertTokenizer to the 'ids' tensor
4. Deploy to Android - it should work now!


In [None]:
# Quick test function - add this to your notebook after training
def test_model(text):
    """Classify a single string with the trained model and print a report.

    The report contains the input text, the predicted label, the softmax
    probability of that label, and the probability of every label.
    Nothing is returned.
    """
    # Encode the text as a batch of one and take the logits of that single row.
    batch = tokenizer([text], truncation=True, padding=True, max_length=128, return_tensors="tf")
    scores = model(batch).logits[0]

    # Probabilities over all classes, and the index of the highest-scoring class.
    probs = tf.nn.softmax(scores).numpy()
    best_id = tf.argmax(scores).numpy()

    best_label = id2label[best_id]
    best_prob = probs[best_id]

    print(f"Text: '{text}'")
    print(f"Predicted: {best_label}")
    print(f"Confidence: {best_prob:.4f}")
    print("All probabilities:")
    for class_id in range(len(probs)):
        print(f"  {id2label[class_id]}: {probs[class_id]:.4f}")
    print("-" * 50)

# TEST YOUR MODEL NOW!
print("=== TESTING YOUR MODEL ===\n")

# Test with various inputs: a mix of emails, phone numbers, URLs and addresses.
test_inputs = [
    "john.smith@gmail.com",
    "555-123-4567",
    "https://www.example.com",
    "123 Oak Street, Boston MA 02101",
    "+1-800-555-0199",
    "https://github.com/user/repo",
    "contact@company.org",
    "PO Box 789, Seattle WA"
]

for text in test_inputs:
    test_model(text)

# Interactive testing - you can modify these
print("\n=== TRY YOUR OWN INPUTS ===")
print("Change the text below and run to test:")

# CHANGE THESE TO TEST YOUR OWN INPUTS:
# Keep only placeholder values in the saved notebook. Real email addresses,
# phone numbers and private links end up in both the source and the printed
# output, so they should not be committed or shared.
my_test_1 = "student.cs23@college.example.org"
my_test_2 = "98765-43210"
my_test_3 = "https://example.com/chat/00000000-0000-0000-0000-000000000000"
my_test_4 = "221 B,Basavangudi,Bengaluru"

# One loop instead of four copy-pasted calls; the order is unchanged.
for text in (my_test_1, my_test_2, my_test_3, my_test_4):
    test_model(text)

=== TESTING YOUR MODEL ===

Text: 'john.smith@gmail.com'
Predicted: email
Confidence: 1.0000
All probabilities:
  address: 0.0000
  phone: 0.0000
  email: 1.0000
  url: 0.0000
--------------------------------------------------
Text: '555-123-4567'
Predicted: phone
Confidence: 1.0000
All probabilities:
  address: 0.0000
  phone: 1.0000
  email: 0.0000
  url: 0.0000
--------------------------------------------------
Text: 'https://www.example.com'
Predicted: url
Confidence: 1.0000
All probabilities:
  address: 0.0000
  phone: 0.0000
  email: 0.0000
  url: 1.0000
--------------------------------------------------
Text: '123 Oak Street, Boston MA 02101'
Predicted: address
Confidence: 1.0000
All probabilities:
  address: 1.0000
  phone: 0.0000
  email: 0.0000
  url: 0.0000
--------------------------------------------------
Text: '+1-800-555-0199'
Predicted: phone
Confidence: 1.0000
All probabilities:
  address: 0.0000
  phone: 1.0000
  email: 0.0000
  url: 0.0000
---------------------------