<a href="https://colab.research.google.com/github/SebastianJoa/Conversor/blob/main/Traductor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Instalación de librerías ---
!pip install tensorflow tensorflow-datasets librosa gtts transformers tensorflowjs

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting tensorflowjs
  Downloading tensorflowjs-4.22.0-py3-none-any.whl.metadata (3.2 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting packaging (from tensorflow)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading tensorflowjs-4.22.0-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.1/89.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-23.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, click, g

In [1]:
# --- Importaciones ---
import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import librosa
from gtts import gTTS
from transformers import pipeline

In [2]:
# --- Carpeta de salida ---
EXPORT_FOLDER = "/content/carpeta_salida"
TFJS_FOLDER = os.path.join(EXPORT_FOLDER, "tfjs_model")
os.makedirs(TFJS_FOLDER, exist_ok=True)

In [4]:
# --- Cargar el dataset de ejemplo (Speech Commands) ---
dataset_name = "speech_commands"
(ds_train, ds_test), ds_info = tfds.load(dataset_name, split=["train", "test"], shuffle_files=True, with_info=True, as_supervised=True)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/speech_commands/0.0.3...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.MNHYJU_0.0.3/speech_commands-train.tfrecord*...…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.MNHYJU_0.0.3/speech_commands-validation.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/speech_commands/incomplete.MNHYJU_0.0.3/speech_commands-test.tfrecord*...:…



Dataset speech_commands downloaded and prepared to /root/tensorflow_datasets/speech_commands/0.0.3. Subsequent calls will reuse this data.


In [5]:
# --- Función para extraer MFCCs ---
def extract_features(audio, sr=16000, max_len=40):
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20).T
    # Asegurar que todas las secuencias tengan el mismo largo
    if len(mfcc) < max_len:
        pad_width = max_len - len(mfcc)
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    else:
        mfcc = mfcc[:max_len, :]
    return mfcc

In [6]:
# --- Preparar arrays (versión corregida) ---
X_train, y_train = [], []
for audio, label in tfds.as_numpy(ds_train.take(500)):  # tomar solo 500 ejemplos para demo
    # Convertir el audio a numpy array y asegurar que es float32
    audio_np = audio.astype(np.float32)

    # Extraer características MFCC
    features = extract_features(audio_np)

    X_train.append(features)
    y_train.append(label)
X_train = np.array(X_train)
y_train = np.array(y_train)
print(f"Preparación completada. Dimensiones:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

Preparación completada. Dimensiones:
X_train: (500, 40, 20)
y_train: (500,)


In [7]:
# --- Definir y compilar el modelo CNN para audio ---
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Conv1D(32, 3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Conv1D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(ds_info.features["label"].num_classes, activation='softmax')
])
model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cnn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# --- Entrenamiento del modelo ---
model_cnn.fit(X_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.2922 - loss: 60.9779
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4030 - loss: 15.1196
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4315 - loss: 6.6404
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4163 - loss: 4.7396
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4496 - loss: 3.8199


<keras.src.callbacks.history.History at 0x78a40da770e0>

In [9]:
# --- Guardar el modelo Keras ---
keras_model_path = os.path.join(EXPORT_FOLDER, "audio_model.h5")
model_cnn.save(keras_model_path)



In [10]:
# --- Exportar a TensorFlow.js ---
import tensorflowjs as tfjs
tfjs.converters.save_keras_model(model_cnn, TFJS_FOLDER)
print("Modelo TF.js exportado en:", TFJS_FOLDER)



failed to lookup keras version from the file,
    this is likely a weight only file
Modelo TF.js exportado en: /content/carpeta_salida/tfjs_model


In [11]:
# --- Inicializar modelos de reconocimiento de voz y traducción ---
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v2")
translator_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [12]:
# --- Funciones para transcripción, traducción y texto a voz ---
def transcribe_audio(file_path):
    result = asr(file_path)
    return result["text"]
def translate_text(text):
    return translator_pipeline(text)[0]['translation_text']
def text_to_speech(text, filename):
    tts = gTTS(text=text, lang='es')
    tts.save(filename)
    return filename
print("Colab: Preparación completa. Modelos guardados y listos para exportar.")

Colab: Preparación completa. Modelos guardados y listos para exportar.
