In [1]:
!pip install -q -U transformers accelerate torch psutil

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os

# Create offload directory if it doesn't exist
os.makedirs("offload", exist_ok=True)

Using TensorFlow backend.


In [2]:
# Simplified approach with half-precision and CPU offloading
model_id = "pratham0011/mistral_7b-instruct-research-paper"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model with memory optimizations
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use half precision
    low_cpu_mem_usage=True,     # Optimize memory usage
    offload_folder="offload",   # Offload to disk if needed
    device_map="auto"          # Let the library decide the best device mapping
)

# Create a text generation pipeline
pipe = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer,
    max_new_tokens=100,  # Limit output size to avoid memory issues
    do_sample=True,
    temperature=0.7
)

# Simple prompt
prompt = "Who are you?"

# Generate output
print("Generating response...")
output = pipe(prompt)
print(output[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


Generating response...
Who are you?
 ##################


###RESEARCH


###TITLE: A Robust and Efficient Algorithm to Detect Arbitrary Arbitrage


###ABORTED_AT TIMESTAMP? 2021-06-21 11:09:19



###RESEARCH



###TITLE: A Robust and Efficient Algorithm to Detect Arbitrage



In [11]:
!pip install -U accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bitsandbytes
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Using cached bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.41.1
    Uninstalling bitsandbytes-0.41.1:
      Successfully uninstalled bitsandbytes-0.41.1
Successfully installed bitsandbytes-0.42.0


In [None]:
# Check system information
import psutil
import platform

# System info
print(f"Python version: {platform.python_version()}")
print(f"System: {platform.system()} {platform.release()}")
print(f"Processor: {platform.processor()}")

# Memory info
mem = psutil.virtual_memory()
print(f"\nTotal memory: {mem.total / (1024**3):.2f} GB")
print(f"Available memory: {mem.available / (1024**3):.2f} GB")
print(f"Used memory: {mem.used / (1024**3):.2f} GB")
print(f"Memory percent used: {mem.percent}%")

# PyTorch info
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"MPS (Apple Silicon) available: {torch.backends.mps.is_available()}")

In [None]:
# Alternative approach with a smaller model
# This might work better if you're experiencing memory issues

# Use a smaller model
small_model_id = "distilbert-base-uncased"

print("Loading a smaller model as an alternative...")
small_tokenizer = AutoTokenizer.from_pretrained(small_model_id)
small_model = AutoModelForCausalLM.from_pretrained(
    small_model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
)

small_pipe = pipeline("text-generation", model=small_model, tokenizer=small_tokenizer)
small_output = small_pipe("Hello, how are you?")
print(small_output[0]["generated_text"])