In [None]:
pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==1

In [None]:
from sklearn.metrics import accuracy_score
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the full-precision reference model, switching the
# model to inference mode right away (``eval()`` returns the module itself).
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
original_model = AutoModelForCausalLM.from_pretrained(model_name).eval()

# Build a dynamically quantized copy: only the Linear layers are targeted and
# their weights are stored as int8.
quantized_model = torch.quantization.quantize_dynamic(
    original_model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
).eval()

# Evaluation data (simplified example). Each prompt is kept next to its
# reference answer so the two derived lists can never drift out of alignment.
_QA_PAIRS = [
    ("¿Qué es Python?", "Python es un lenguaje de programación."),
    ("¿Para qué se usa Docker?", "Docker se usa para contenedorización."),
    (
        "Explica el machine learning.",
        "El machine learning es una rama de la inteligencia artificial.",
    ),
]
test_prompts = [question for question, _ in _QA_PAIRS]
expected_responses = [answer for _, answer in _QA_PAIRS]

def generate_responses(model, tokenizer, prompts, max_length=50):
    """Generate one decoded text per prompt using ``model``.

    Args:
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode each prompt
            and to decode the generated token ids.
        prompts: Iterable of prompt strings.
        max_length: Value forwarded to ``generate`` as ``max_length``
            (total sequence length, prompt tokens included).

    Returns:
        A list with the decoded output for each prompt, in input order.
        The decoded text is the full generated sequence, so it starts with
        the prompt itself.
    """
    responses = []
    for prompt in prompts:
        # Calling the tokenizer (rather than ``encode``) also returns the
        # attention mask, which ``generate`` would otherwise have to guess.
        encoded = tokenizer(prompt, return_tensors="pt")
        output = model.generate(
            **encoded,
            max_length=max_length,
            # Set explicitly so generation does not fall back to an implicit
            # default for models without a dedicated pad token.
            pad_token_id=tokenizer.eos_token_id,
        )
        responses.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return responses


# Evaluate the original model
original_responses = generate_responses(original_model, tokenizer, test_prompts)

# Evaluate the quantized model
quantized_responses = generate_responses(quantized_model, tokenizer, test_prompts)

# Compute accuracy (simplified example)
def evaluate_accuracy(responses, expected):
    """Return the fraction of reference answers found in the responses.

    Responses and references are paired by position. A pair counts as correct
    when the (non-empty) reference text occurs as a substring of the response.

    Args:
        responses: Sequence of generated texts.
        expected: Sequence of reference answers aligned with ``responses``.

    Returns:
        ``correct / len(expected)`` as a float, or ``0.0`` when ``expected``
        is empty. References without a matching response (shorter
        ``responses``) are counted as incorrect.
    """
    total = len(expected)
    if total == 0:
        # Nothing to score; avoid a ZeroDivisionError.
        return 0.0
    # The reference must appear inside the generated text, not the reverse:
    # testing ``response in reference`` would mark an empty response as correct
    # and reject any response longer than its reference.
    correct = sum(1 for r, e in zip(responses, expected) if e and e in r)
    return correct / total

# Score both sets of generations against the same reference answers
# (original model first, then the quantized one).
original_accuracy, quantized_accuracy = (
    evaluate_accuracy(generated, expected_responses)
    for generated in (original_responses, quantized_responses)
)

print(f"Exactitud del modelo original: {original_accuracy:.2f}")
print(f"Exactitud del modelo cuantizado: {quantized_accuracy:.2f}")

# Accuracy lost by quantizing: positive when the quantized model scores lower.
precision_loss = original_accuracy - quantized_accuracy
print(f"Pérdida de precisión: {precision_loss:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]