In [None]:
# Run this notebook in a Google Colab runtime with a GPU attached.

# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers torch bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using c

In [None]:
# Explicit %pip magic: installs into the environment of the running kernel.
%pip install torch transformers psutil bitsandbytes accelerate



In [None]:
# Upgrade the quantization stack; %pip targets the running kernel's environment.
%pip install -U bitsandbytes
%pip install -U accelerate
%pip install -U transformers



Cargando modelo: EleutherAI/gpt-j-6B


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
# Baseline: model loaded without quantization (fp16 weights)
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import psutil
import os

def check_gpu():
    """Report on stdout whether a CUDA GPU is visible to PyTorch.

    When CUDA is available, prints the name of device 0 and its total memory
    in decimal gigabytes (bytes / 1e9); otherwise prints a notice that the
    CPU will be used. Returns None.
    """
    if not torch.cuda.is_available():
        print("GPU no disponible. Usando CPU.")
        return

    print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memoria GPU total: {total_bytes / 1e9:.2f} GB")

def load_model(model_name, use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a tokenizer and causal-LM, optionally 8-bit quantized or in fp16.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_half_precision: When not quantizing, load weights as
            ``torch.float16`` instead of ``torch.float32``.
        use_8bit: Load through ``BitsAndBytesConfig(load_in_8bit=True)``.
            Takes precedence over ``use_half_precision``.
        device_map: Forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``, or ``(None, None)`` when the 8-bit path
        fails with an ``ImportError``.
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if use_8bit:
        try:
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        except ImportError as exc:
            # from_pretrained itself raises ImportError when bitsandbytes is
            # missing or outdated, so report the real cause instead of blaming
            # the BitsAndBytesConfig import unconditionally.
            print(f"Error detallado: {exc}")
            print("Error: No se pudo cargar el modelo en 8 bits. Asegúrate de tener instalada la última versión de transformers y bitsandbytes.")
            return None, None
    else:
        # The dtype is applied at load time, so no later .half() cast is needed.
        dtype = torch.float16 if use_half_precision else torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=dtype,
            device_map=device_map,
        )

    print(f"Modelo cargado con configuración: 8-bit={use_8bit}, half precision={use_half_precision}, device map={device_map}")
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def measure_memory_usage():
    """Return a memory reading expressed in units of 1024**3 bytes.

    With CUDA available the value is PyTorch's peak allocated GPU memory
    (``torch.cuda.max_memory_allocated``); otherwise it is the resident set
    size of the current process as reported by psutil.
    """
    bytes_per_unit = 1024 ** 3
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / bytes_per_unit
    return torch.cuda.max_memory_allocated() / bytes_per_unit

def run_test(model_name="EleutherAI/gpt-j-6B", prompt="¿Qué es el sol?", use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a model, generate one completion, and print timing and memory figures.

    Args:
        model_name: Model identifier forwarded to ``load_model``.
        prompt: Text handed to ``generate_text``.
        use_half_precision: Forwarded to ``load_model``.
        use_8bit: Forwarded to ``load_model``.
        device_map: Forwarded to ``load_model``.

    Returns:
        None. Returns early, printing no results, when ``load_model`` yields
        no model.
    """
    import gc

    check_gpu()
    if torch.cuda.is_available():
        # measure_memory_usage() reads the *peak* allocation counter, which
        # never decreases on its own; reset it so the deltas below describe
        # this run even when an earlier run happened in the same kernel.
        torch.cuda.reset_peak_memory_stats()
    start_memory = measure_memory_usage()

    tokenizer, model = load_model(model_name, use_half_precision, use_8bit, device_map)
    if model is None:
        return

    try:
        model_load_memory = measure_memory_usage() - start_memory

        print(f"\nGenerando texto para el prompt: '{prompt}'")
        generated_text, generation_time = generate_text(model, tokenizer, prompt)

        total_memory = measure_memory_usage() - start_memory

        print(f"Texto generado: {generated_text}")
        print(f"Tiempo de generación: {generation_time:.2f} segundos")
        print(f"Memoria usada para cargar el modelo: {model_load_memory:.2f} GB")
        print(f"Memoria total usada: {total_memory:.2f} GB")
    finally:
        # Release the model even when generation fails, so the next
        # experiment does not start with this one's weights still resident.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Example usage: fp16 weights with 8-bit quantization disabled.
run_test(
    use_half_precision=True,
    use_8bit=False,
)

GPU disponible: Tesla T4
Memoria GPU total: 15.84 GB
Cargando modelo: EleutherAI/gpt-j-6B


pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Modelo cargado con configuración: 8-bit=False, half precision=True, device map=auto

Generando texto para el prompt: '¿Qué es el sol?'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Texto generado: ¿Qué es el sol?

El sol es la energía que nos brinda la luz del día. Todo lo que podemos ver y todo lo que podemos hacer, lo hacem
Tiempo de generación: 5.39 segundos
Memoria usada para cargar el modelo: 11.38 GB
Memoria total usada: 11.45 GB


In [None]:
#!pip install whisperplus
#!pip install bitsandbytes
#!pip install -U bitsandbytes
# -y skips the interactive "Proceed (Y/n)?" prompt so the cell can run unattended.
%pip uninstall -y bitsandbytes
%pip install bitsandbytes


Found existing installation: bitsandbytes 0.43.1
Uninstalling bitsandbytes-0.43.1:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/bitsandbytes-0.43.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/bitsandbytes/*
    /usr/local/lib/python3.10/dist-packages/tests/*
Proceed (Y/n)? y
  Successfully uninstalled bitsandbytes-0.43.1
Collecting bitsandbytes
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [None]:
# Quantized: model loaded in 8-bit
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import psutil
import os

def check_gpu():
    """Report on stdout whether a CUDA GPU is visible to PyTorch.

    When CUDA is available, prints the name of device 0 and its total memory
    in decimal gigabytes (bytes / 1e9); otherwise prints a notice that the
    CPU will be used. Returns None.
    """
    if not torch.cuda.is_available():
        print("GPU no disponible. Usando CPU.")
        return

    print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memoria GPU total: {total_bytes / 1e9:.2f} GB")

def load_model(model_name, use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a tokenizer and causal-LM, optionally 8-bit quantized or in fp16.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_half_precision: When not quantizing, load weights as
            ``torch.float16`` instead of ``torch.float32``.
        use_8bit: Load through ``BitsAndBytesConfig(load_in_8bit=True)``.
            Takes precedence over ``use_half_precision``.
        device_map: Forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``, or ``(None, None)`` when the 8-bit path
        fails with an ``ImportError``.
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if use_8bit:
        try:
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        except ImportError as exc:
            # from_pretrained itself raises ImportError when bitsandbytes is
            # missing or outdated, so report the real cause instead of blaming
            # the BitsAndBytesConfig import unconditionally.
            print(f"Error detallado: {exc}")
            print("Error: No se pudo cargar el modelo en 8 bits. Asegúrate de tener instalada la última versión de transformers y bitsandbytes.")
            return None, None
    else:
        # The dtype is applied at load time, so no later .half() cast is needed.
        dtype = torch.float16 if use_half_precision else torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=dtype,
            device_map=device_map,
        )

    print(f"Modelo cargado con configuración: 8-bit={use_8bit}, half precision={use_half_precision}, device map={device_map}")
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def measure_memory_usage():
    """Return a memory reading expressed in units of 1024**3 bytes.

    With CUDA available the value is PyTorch's peak allocated GPU memory
    (``torch.cuda.max_memory_allocated``); otherwise it is the resident set
    size of the current process as reported by psutil.
    """
    bytes_per_unit = 1024 ** 3
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / bytes_per_unit
    return torch.cuda.max_memory_allocated() / bytes_per_unit

def run_test(model_name="EleutherAI/gpt-j-6B", prompt="¿Qué es el sol?", use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a model, generate one completion, and print timing and memory figures.

    Args:
        model_name: Model identifier forwarded to ``load_model``.
        prompt: Text handed to ``generate_text``.
        use_half_precision: Forwarded to ``load_model``.
        use_8bit: Forwarded to ``load_model``.
        device_map: Forwarded to ``load_model``.

    Returns:
        None. Returns early, printing no results, when ``load_model`` yields
        no model.
    """
    import gc

    check_gpu()
    if torch.cuda.is_available():
        # measure_memory_usage() reads the *peak* allocation counter, which
        # never decreases on its own; reset it so the deltas below describe
        # this run even when an earlier run happened in the same kernel.
        torch.cuda.reset_peak_memory_stats()
    start_memory = measure_memory_usage()

    tokenizer, model = load_model(model_name, use_half_precision, use_8bit, device_map)
    if model is None:
        return

    try:
        model_load_memory = measure_memory_usage() - start_memory

        print(f"\nGenerando texto para el prompt: '{prompt}'")
        generated_text, generation_time = generate_text(model, tokenizer, prompt)

        total_memory = measure_memory_usage() - start_memory

        print(f"Texto generado: {generated_text}")
        print(f"Tiempo de generación: {generation_time:.2f} segundos")
        print(f"Memoria usada para cargar el modelo: {model_load_memory:.2f} GB")
        print(f"Memoria total usada: {total_memory:.2f} GB")
    finally:
        # Release the model even when generation fails, so the next
        # experiment does not start with this one's weights still resident.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Example usage: request 8-bit quantization (the fp16 flag is passed as well).
run_test(
    use_half_precision=True,
    use_8bit=True,
)

GPU disponible: Tesla T4
Memoria GPU total: 15.84 GB
Cargando modelo: EleutherAI/gpt-j-6B


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Modelo cargado con configuración: 8-bit=True, half precision=True, device map=auto

Generando texto para el prompt: '¿Qué es el sol?'
Texto generado: ¿Qué es el sol? Es una de las preguntas más frecuentes que le hacemos a nuestros hijos. La respuesta es sencilla: el sol es el ag
Tiempo de generación: 9.38 segundos
Memoria usada para cargar el modelo: 5.96 GB
Memoria total usada: 5.96 GB


In [None]:
# Explicit %pip magic: installs into the environment of the running kernel.
%pip install transformers==4.42.4



In [None]:
# Explicit %pip magic: installs into the environment of the running kernel.
%pip install bitsandbytes==0.43.1



In [None]:
# %pip (rather than !pip) upgrades the packages seen by the running kernel.
%pip install -U transformers bitsandbytes




In [None]:
# -y skips the interactive "Proceed (Y/n)?" prompt so the cell can run unattended.
%pip uninstall -y bitsandbytes
%pip install bitsandbytes


Found existing installation: bitsandbytes 0.43.1
Uninstalling bitsandbytes-0.43.1:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/bitsandbytes-0.43.1.dist-info/*
    /usr/local/lib/python3.10/dist-packages/bitsandbytes/*
    /usr/local/lib/python3.10/dist-packages/tests/*
Proceed (Y/n)? y
  Successfully uninstalled bitsandbytes-0.43.1
Collecting bitsandbytes
Y
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)

Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install transformers peft torch




In [None]:
# %pip (rather than !pip) upgrades the package seen by the running kernel.
%pip install -U peft


Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/251.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m245.8/251.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvi

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import time
import psutil
import os

def check_gpu():
    """Report on stdout whether a CUDA GPU is visible to PyTorch.

    When CUDA is available, prints the name of device 0 and its total memory
    in decimal gigabytes (bytes / 1e9); otherwise prints a notice that the
    CPU will be used. Returns None.
    """
    if not torch.cuda.is_available():
        print("GPU no disponible. Usando CPU.")
        return

    print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memoria GPU total: {total_bytes / 1e9:.2f} GB")

def load_model_with_peft(model_name, use_half_precision=False, device_map="auto"):
    """Load a tokenizer and causal-LM and wrap the model with a LoRA adapter.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_half_precision: Load the weights as ``torch.float16``. As in the
            original cell, this only takes effect when CUDA is available.
        device_map: Forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)`` where ``model`` is the value returned by
        ``get_peft_model``.
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    load_kwargs = {"device_map": device_map}
    if use_half_precision and torch.cuda.is_available():
        # Ask for fp16 at load time instead of materialising the default
        # precision first and casting the whole model afterwards.
        load_kwargs["torch_dtype"] = torch.float16
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    print(f"Modelo cargado con configuración: half precision={use_half_precision}, device map={device_map}")

    # Apply PEFT. task_type declares that the wrapped model is a causal LM.
    lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, task_type="CAUSAL_LM")
    model = get_peft_model(model, lora_config)

    print("PEFT y LoRA aplicados.")
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def measure_memory_usage():
    """Return a memory reading expressed in units of 1024**3 bytes.

    With CUDA available the value is PyTorch's peak allocated GPU memory
    (``torch.cuda.max_memory_allocated``); otherwise it is the resident set
    size of the current process as reported by psutil.
    """
    bytes_per_unit = 1024 ** 3
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / bytes_per_unit
    return torch.cuda.max_memory_allocated() / bytes_per_unit

def run_test(model_name="distilgpt2", prompt="¿Qué es el sol?", use_half_precision=False, device_map="auto"):
    """Load a LoRA-wrapped model, generate one completion, and print timing and memory figures.

    Args:
        model_name: Model identifier forwarded to ``load_model_with_peft``.
        prompt: Text handed to ``generate_text``.
        use_half_precision: Forwarded to ``load_model_with_peft``.
        device_map: Forwarded to ``load_model_with_peft``.

    Returns:
        None; all results are printed.
    """
    import gc

    check_gpu()
    if torch.cuda.is_available():
        # measure_memory_usage() reads the *peak* allocation counter, which
        # never decreases on its own; reset it so the deltas below describe
        # this run even when an earlier run happened in the same kernel.
        torch.cuda.reset_peak_memory_stats()
    start_memory = measure_memory_usage()

    tokenizer, model = load_model_with_peft(model_name, use_half_precision, device_map)

    try:
        model_load_memory = measure_memory_usage() - start_memory

        print(f"\nGenerando texto para el prompt: '{prompt}'")
        generated_text, generation_time = generate_text(model, tokenizer, prompt)

        total_memory = measure_memory_usage() - start_memory

        print(f"Texto generado: {generated_text}")
        print(f"Tiempo de generación: {generation_time:.2f} segundos")
        print(f"Memoria usada para cargar el modelo: {model_load_memory:.2f} GB")
        print(f"Memoria total usada: {total_memory:.2f} GB")
    finally:
        # Release the model even when generation fails, so the next
        # experiment does not start with this one's weights still resident.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Example usage: default model and prompt, half precision requested.
run_test(
    use_half_precision=True,
)


GPU disponible: Tesla T4
Memoria GPU total: 15.84 GB
Cargando modelo: distilgpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Modelo cargado con configuración: half precision=True, device map=auto
PEFT y LoRA aplicados.

Generando texto para el prompt: '¿Qué es el sol?'


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Texto generado: ¿Qué es el sol? ¿Qué est aút que su habre una con novo una que quando que lele esse de la pueda, pero una habre a parte
Tiempo de generación: 2.98 segundos
Memoria usada para cargar el modelo: 0.38 GB
Memoria total usada: 0.38 GB


In [None]:
# Quantized: model loaded in 8-bit
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import psutil
import os
from transformers import BitsAndBytesConfig

def check_gpu():
    """Report on stdout whether a CUDA GPU is visible to PyTorch.

    When CUDA is available, prints the name of device 0 and its total memory
    in decimal gigabytes (bytes / 1e9); otherwise prints a notice that the
    CPU will be used. Returns None.
    """
    if not torch.cuda.is_available():
        print("GPU no disponible. Usando CPU.")
        return

    print(f"GPU disponible: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memoria GPU total: {total_bytes / 1e9:.2f} GB")

def load_model(model_name, use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a tokenizer and causal-LM, optionally 8-bit quantized or in fp16.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_half_precision: When not quantizing, load weights as
            ``torch.float16`` instead of ``torch.float32``.
        use_8bit: Load through ``BitsAndBytesConfig(load_in_8bit=True)``.
            Takes precedence over ``use_half_precision``.
        device_map: Forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``, or ``(None, None)`` when the 8-bit path
        fails with an ``ImportError``.
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if use_8bit:
        try:
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        except ImportError as exc:
            # from_pretrained itself raises ImportError when bitsandbytes is
            # missing or outdated, so report the real cause instead of blaming
            # the BitsAndBytesConfig import unconditionally.
            print(f"Error detallado: {exc}")
            print("Error: No se pudo cargar el modelo en 8 bits. Asegúrate de tener instalada la última versión de transformers y bitsandbytes.")
            return None, None
    else:
        # The dtype is applied at load time, so no later .half() cast is needed.
        dtype = torch.float16 if use_half_precision else torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=dtype,
            device_map=device_map,
        )

    print(f"Modelo cargado con configuración: 8-bit={use_8bit}, half precision={use_half_precision}, device map={device_map}")
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def measure_memory_usage():
    """Return a memory reading expressed in units of 1024**3 bytes.

    With CUDA available the value is PyTorch's peak allocated GPU memory
    (``torch.cuda.max_memory_allocated``); otherwise it is the resident set
    size of the current process as reported by psutil.
    """
    bytes_per_unit = 1024 ** 3
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / bytes_per_unit
    return torch.cuda.max_memory_allocated() / bytes_per_unit

def run_test(model_name="EleutherAI/gpt-j-6B", prompt="¿Qué es el sol?", use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a model, generate one completion, and print timing and memory figures.

    Args:
        model_name: Model identifier forwarded to ``load_model``.
        prompt: Text handed to ``generate_text``.
        use_half_precision: Forwarded to ``load_model``.
        use_8bit: Forwarded to ``load_model``.
        device_map: Forwarded to ``load_model``.

    Returns:
        None. Returns early, printing no results, when ``load_model`` yields
        no model.
    """
    import gc

    check_gpu()
    if torch.cuda.is_available():
        # measure_memory_usage() reads the *peak* allocation counter, which
        # never decreases on its own; reset it so the deltas below describe
        # this run even when an earlier run happened in the same kernel.
        torch.cuda.reset_peak_memory_stats()
    start_memory = measure_memory_usage()

    tokenizer, model = load_model(model_name, use_half_precision, use_8bit, device_map)
    if model is None:
        return

    try:
        model_load_memory = measure_memory_usage() - start_memory

        print(f"\nGenerando texto para el prompt: '{prompt}'")
        generated_text, generation_time = generate_text(model, tokenizer, prompt)

        total_memory = measure_memory_usage() - start_memory

        print(f"Texto generado: {generated_text}")
        print(f"Tiempo de generación: {generation_time:.2f} segundos")
        print(f"Memoria usada para cargar el modelo: {model_load_memory:.2f} GB")
        print(f"Memoria total usada: {total_memory:.2f} GB")
    finally:
        # Release the model even when generation fails, so the next
        # experiment does not start with this one's weights still resident.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Example usage: request 8-bit quantization (the fp16 flag is passed as well).
run_test(
    use_half_precision=True,
    use_8bit=True,
)


GPU disponible: Tesla T4
Memoria GPU total: 15.84 GB
Cargando modelo: EleutherAI/gpt-j-6B
Error: No se pudo importar BitsAndBytesConfig. Asegúrate de tener instalada la última versión de transformers y bitsandbytes.


In [None]:
def load_model(model_name, use_half_precision=False, use_8bit=False, device_map="auto"):
    """Load a tokenizer and causal-LM, printing diagnostics for the 8-bit path.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_half_precision: When not quantizing, load weights as
            ``torch.float16`` instead of ``torch.float32``.
        use_8bit: Load through ``BitsAndBytesConfig(load_in_8bit=True)``
            after printing the installed bitsandbytes version.
        device_map: Forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model)``, or ``(None, None)`` when the 8-bit path
        fails with an ``ImportError`` (whose message is printed).
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if use_8bit:
        try:
            import bitsandbytes as bnb
            print(f"bitsandbytes version: {bnb.__version__}")
            from transformers import BitsAndBytesConfig
            print("BitsAndBytesConfig imported successfully")
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                quantization_config=quantization_config,
                device_map=device_map,
            )
        except ImportError as e:
            # The f-string already applies str(); no explicit conversion needed.
            print(f"Error detallado: {e}")
            print("Error: No se pudo importar BitsAndBytesConfig o bitsandbytes.")
            return None, None
    else:
        # The dtype is applied at load time, so no later .half() cast is needed.
        dtype = torch.float16 if use_half_precision else torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True,
            torch_dtype=dtype,
            device_map=device_map,
        )

    print(f"Modelo cargado con configuración: 8-bit={use_8bit}, half precision={use_half_precision}, device map={device_map}")
    return tokenizer, model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import psutil
import os

def load_model(model_name):
    """Load a tokenizer and causal-LM with default settings and move the model to a device.

    The model is moved to ``'cuda'`` when CUDA is available and to ``'cpu'``
    otherwise; the chosen device is printed. Returns ``(tokenizer, model)``.
    """
    print(f"Cargando modelo: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'
    model.to(target_device)

    print(f"Modelo cargado en dispositivo: {target_device}")
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=50):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def measure_memory_usage():
    """Return a memory reading expressed in units of 1024**3 bytes.

    With CUDA available the value is PyTorch's peak allocated GPU memory
    (``torch.cuda.max_memory_allocated``); otherwise it is the resident set
    size of the current process as reported by psutil.
    """
    bytes_per_unit = 1024 ** 3
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process(os.getpid()).memory_info().rss
        return rss_bytes / bytes_per_unit
    return torch.cuda.max_memory_allocated() / bytes_per_unit

def run_test(model_name, prompt):
    """Load a model, generate one completion, and print timing and memory figures.

    Args:
        model_name: Model identifier forwarded to ``load_model``.
        prompt: Text handed to ``generate_text``.

    Returns:
        None; all results are printed.
    """
    import gc

    if torch.cuda.is_available():
        # measure_memory_usage() reads the *peak* allocation counter, which
        # never decreases on its own; reset it so the deltas below describe
        # this run even when an earlier run happened in the same kernel.
        torch.cuda.reset_peak_memory_stats()
    start_memory = measure_memory_usage()

    tokenizer, model = load_model(model_name)

    try:
        model_load_memory = measure_memory_usage() - start_memory

        print(f"\nGenerando texto para el prompt: '{prompt}'")
        generated_text, generation_time = generate_text(model, tokenizer, prompt)

        total_memory = measure_memory_usage() - start_memory

        print(f"Texto generado: {generated_text}")
        print(f"Tiempo de generación: {generation_time:.2f} segundos")
        print(f"Memoria usada para cargar el modelo: {model_load_memory:.2f} GB")
        print(f"Memoria total usada: {total_memory:.2f} GB")
    finally:
        # Release the model even when generation fails, so the next
        # experiment does not start with this one's weights still resident.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Configure and launch the test run.
model_name = "EleutherAI/gpt-j-6B"
prompt = "¿Qué es el sol?"
run_test(model_name=model_name, prompt=prompt)

Cargando modelo: EleutherAI/gpt-j-6B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
#model_name = "EleutherAI/gpt-j-6B
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time
import psutil
import os

def get_memory_usage():
    """Return the current process's resident set size in units of 1024**2 bytes (MB)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 ** 2

def load_model(model_name, quantized=False):
    """Load a tokenizer and causal-LM, optionally 8-bit quantized.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        quantized: When true, load with ``device_map="auto"`` and an 8-bit
            ``BitsAndBytesConfig``. Otherwise load with default settings and
            move the model to ``'cuda'`` if available, else ``'cpu'``.

    Returns:
        ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if quantized:
        # Passing load_in_8bit directly to from_pretrained is deprecated;
        # the supported route is a BitsAndBytesConfig in quantization_config.
        from transformers import BitsAndBytesConfig

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model.to('cuda' if torch.cuda.is_available() else 'cpu')
    return tokenizer, model

def generate_text(model, tokenizer, prompt, max_length=100):
    """Sample one continuation of ``prompt`` and time the generation call.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable that encodes ``prompt`` (invoked with
            ``return_tensors="pt"``) and exposes ``decode``.
        prompt: Text to continue.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        ``(generated_text, elapsed_seconds)``. The elapsed time covers only
        the ``generate`` call, not tokenization or decoding.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the pad token explicitly so generate() does not have to fall back
    # to the EOS token on its own and log a notice on every call.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        # Do not charge previously queued GPU work to this measurement.
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # monotonic clock meant for intervals
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            temperature=0.7,
            pad_token_id=pad_token_id,
        )
    if use_cuda:
        # Wait for outstanding kernels so the elapsed time is complete.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start_time

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text, elapsed

def run_tests(model_name, prompts):
    """Benchmark every prompt on the unquantized model, then on the quantized one.

    Args:
        model_name: Model identifier forwarded to ``load_model``.
        prompts: Iterable of prompt strings; each is generated once per
            model variant.

    Returns:
        None; memory readings, truncated generations and timings are printed.
    """
    print(f"Pruebas para el modelo: {model_name}")

    # Unquantized model
    _run_prompt_suite(model_name, prompts, False, "Cargando modelo sin cuantizar...", "sin cuantizar")

    # Quantized model
    _run_prompt_suite(model_name, prompts, True, "\nCargando modelo cuantizado...", "cuantizado")


def _run_prompt_suite(model_name, prompts, quantized, loading_message, memory_label):
    """Load one model variant, print process memory, and time each prompt."""
    import gc

    print(loading_message)
    tokenizer, model = load_model(model_name, quantized=quantized)
    try:
        memory_usage = get_memory_usage()
        print(f"Uso de memoria ({memory_label}): {memory_usage:.2f} MB")

        for number, prompt in enumerate(prompts, start=1):
            generated_text, generation_time = generate_text(model, tokenizer, prompt)
            print(f"\nPrueba {number}")
            print(f"Prompt: {prompt}")
            # Only the first 100 characters of the generation are shown.
            print(f"Texto generado: {generated_text[:100]}...")
            print(f"Tiempo de generación: {generation_time:.2f} segundos")
    finally:
        # Free this variant even if a prompt fails, so the next variant does
        # not have to share memory with it.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Run the benchmark over a small fixed set of prompts.
model_name = "EleutherAI/gpt-j-6B"
prompts = [
    "¿Qué es el sol?",
    "Explica la teoría de la relatividad",
    "Escribe un poema sobre la naturaleza",
]

run_tests(model_name=model_name, prompts=prompts)

Pruebas para el modelo: EleutherAI/gpt-j-6B
Cargando modelo sin cuantizar...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Cargando modelo: EleutherAI/gpt-j-6B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
