<a href="https://colab.research.google.com/github/RICHAR-SL/IA/blob/main/resumen_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Forzar reinstalación limpia



In [13]:
# # Forzar reinstalación limpia
# !pip uninstall -y transformers datasets accelerate torch torchaudio torchvision
# !pip install --no-cache-dir --upgrade pip
# !pip install --no-cache-dir --upgrade \
#   torch torchvision torchaudio \
#   transformers[torch] datasets[audio] accelerate torchcodec
# !pip install git+https://github.com/huggingface/transformers
# !pip install easyocr
# !pip install gtts
# !pip install pyphen



In [16]:
# de voz a texto
!pip install SpeechRecognition
# de img a texto
!apt-get install tesseract-ocr -y
!pip install pytesseract

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


# ***Resumen***

In [17]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from transformers import pipeline
import speech_recognition as sr
from PIL import Image
import pytesseract
import io

app = Flask(__name__)
CORS(app)

# Pipeline para resumen
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Función para generar resumen con estadísticas
def generar_resumen(texto):
    if not texto.strip():
        raise ValueError("Texto vacío.")
    resumen = summarizer(texto, max_length=150, min_length=30, do_sample=False)
    texto_resumido = resumen[0]['summary_text']
    return {
        "resumen": texto_resumido,
        "caracteres_original": len(texto),
        "palabras_original": len(texto.split()),
        "caracteres_resumen": len(texto_resumido),
        "palabras_resumen": len(texto_resumido.split())
    }

# Ruta: /resumir
@app.route('/resumir', methods=['POST'])
def resumir():
    data = request.json
    texto = data.get('texto', '')
    try:
        resultado = generar_resumen(texto)
        return jsonify(resultado)
    except Exception as e:
        return jsonify({"error": str(e)}), 400

# Ruta: /audio
@app.route('/audio', methods=['POST'])
def audio():
    if 'audio' not in request.files:
        return jsonify({"error": "Archivo de audio no recibido."}), 400
    audio_file = request.files['audio']

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
        try:
            texto = recognizer.recognize_google(audio_data, language="es-ES")
            resultado = generar_resumen(texto)
            resultado['transcripcion'] = texto
            return jsonify(resultado)
        except sr.UnknownValueError:
            return jsonify({"error": "No se pudo entender el audio."}), 400
        except Exception as e:
            return jsonify({"error": str(e)}), 500

# Ruta: /ocr
@app.route('/ocr', methods=['POST'])
def ocr():
    if 'imagen' not in request.files:
        return jsonify({"error": "Archivo de imagen no recibido."}), 400
    imagen_file = request.files['imagen']
    try:
        image = Image.open(io.BytesIO(imagen_file.read()))
        texto = pytesseract.image_to_string(image, lang='spa')
        resultado = generar_resumen(texto)
        resultado['texto_detectado'] = texto
        return jsonify(resultado)
    except Exception as e:
        return jsonify({"error": str(e)}), 500



Device set to use cuda:0


Tu app está corriendo en: NgrokTunnel: "https://395ea9bf1ae9.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


# ***EXTRAER LETRA MUSICA***


In [11]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Selección de dispositivo
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Modelo
model_id = "openai/whisper-large-v3"

# Cargar modelo y procesador
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Crear pipeline con timestamps
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=0 if torch.cuda.is_available() else -1,
    return_timestamps=True,  # Esto activa el modo long-form
)

# Transcribir archivo MP3 largo
audio_path = "/content/Luna - Zoé (Letra_Lyrics) (Unplugged)(MP3_160K).mp3"
result = pipe(audio_path)

# Imprimir resultado
print("🎧 Transcripción:")
if isinstance(result, dict):
    print(result["text"])
else:
    for chunk in result:
        print(f"[{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}] {chunk['text']}")


Device set to use cuda:0
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


🎧 Transcripción:
 Thank you so much for watching, and I'll see you in the next video. Entiendo que no puedo suplicarle una vez más Pero nada se detiene, solo vivo para ti Dame solo un beso que me alcance hasta morir Como un vicio que me duele Quiero mirarte a los ojos Y cuando te me acercas Se acelera mi motor Me da fiebre Me hago fuego Y me vuelvo a consumir Dame solo un beso Que me alcance hasta morir Con un vicio que me duele Quiero mirarte a los ojos Oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, oh, o

# ***Texto DE IMG***

In [3]:
import cv2
from PIL import Image
import numpy as np

# Leer imagen
img_path = '/content/20250815_113804.jpg'
img = cv2.imread(img_path)

# Convertir a escala de grises
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Aplicar umbral adaptativo
thresh = cv2.adaptiveThreshold(
    gray, 255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    11, 2
)

# Guardar la imagen procesada
cv2.imwrite('/content/preprocesada.jpg', thresh)
Image.open('/content/preprocesada.jpg').show()


In [4]:
import easyocr
reader = easyocr.Reader(['es'])
result = reader.readtext('/content/preprocesada.jpg')
texto = " ".join([res[1] for res in result])
print("Texto extraído:", texto)
# Corrección
corrector = pipeline('text2text-generation', model='t5-base')
texto_corregido = corrector(f"corregir: {texto}")[0]['generated_text']
print("\nTexto corregido:")
print(texto_corregido)

# Paráfrasis
parafraseador = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")
texto_parafraseado = parafraseador(texto_corregido)[0]['generated_text']

# Mostrar texto parafraseado y audio
display(Markdown(f"### Texto parafraseado:\n\n{texto_parafraseado}"))

tts = gTTS(text=texto_parafraseado, lang='es')
tts.save('salida_audio.mp3')
display(Audio('salida_audio.mp3', autoplay=False))

Texto extraído: BRBsK? '8H Sfsal #o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRBs KG 2K 8N Sevùal 'N BR  SKGZK P w Pnmova  a 4408 4oooo 30 NK


Device set to use cuda:0



Texto corregido:
o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o


Device set to use cuda:0


### Texto parafraseado:

Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo0o 26 Sersal o BRgSKe 2kBH RkOmonicl 7Yo B4l bo

# ***HTMLSSAS***

In [6]:
!pip install flask flask-cors pyngrok


Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok, flask-cors
Successfully installed flask-cors-6.0.1 pyngrok-7.3.0


In [8]:
!ngrok config add-authtoken 31SvM5OXEuo6RytCNnvo6a5eoC5_7pzrSkHYayLugRygJyR1q

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [10]:
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok

app = Flask(__name__)
CORS(app)  # Para permitir peticiones desde tu HTML

from flask import send_file

@app.route('/')
def home():
    return send_file('index.html')

# Ruta de resumen de texto
@app.route('/resumir', methods=['POST'])
def resumir():
    data = request.json
    texto = data.get('texto', '')
    resumen = f"Resumen de ejemplo del texto: {texto[:50]}..."  # Aquí iría tu modelo
    return jsonify({"resumen": resumen})

# Ruta para OCR (texto desde imagen)
@app.route('/ocr', methods=['POST'])
def ocr():
    image = request.files['imagen']
    texto = f"Texto detectado de imagen: ejemplo"  # Aquí iría tu modelo OCR
    return jsonify({"texto": texto})

# Ruta para transcripción de audio
@app.route('/audio', methods=['POST'])
def transcribir():
    audio = request.files['audio']
    letra = "Letra de ejemplo extraída del audio"  # Aquí iría tu modelo STT
    return jsonify({"letra": letra})

# Iniciar ngrok y mostrar URL pública
public_url = ngrok.connect(5000)
print(f"Tu app está corriendo en: {public_url}")

# Iniciar servidor
app.run(port=5000)


Tu app está corriendo en: NgrokTunnel: "https://87bb23b3851e.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:26:57] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:26:58] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:28:44] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:33:33] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:33:45] "POST /resumir HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:34:02] "POST /audio HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:34:09] "[31m[1mPOST /ocr HTTP/1.1[0m" 400 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:34:47] "POST /ocr HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:35:06] "POST /audio HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:39:08] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 15:39:17] "POST /ocr HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [18/Aug/2025 