In [1]:
# Read the Hugging Face access token from the HF_TOKEN environment variable so the
# real secret never has to be saved in (or committed with) the notebook. When the
# variable is unset or blank, fall back to the placeholder, which you can replace
# locally for a quick experiment.
# `os` is imported here rather than in the import cell because this cell runs first.
import os

api_token = os.environ.get("HF_TOKEN", "").strip() or "Your Token Here"

In [2]:
import requests


In [None]:
model = "gpt2"  # You can replace this with any other model available on Hugging Face

# Hugging Face Inference API URL
api_url = f"https://api-inference.huggingface.co/models/{model}"

# Headers for authorization (api_token comes from the first cell)
headers = {
    "Authorization": f"Bearer {api_token}"
}

# Your prompt
prompt = "Once upon a time in a faraway land,"

# Payload to send to the API
payload = {
    "inputs": prompt,
    "parameters": {
        "max_length": 50,  # Adjust the max length as needed
    }
}

# Make the request to the API.
# requests applies no timeout by default, so without an explicit one a stalled
# connection would block this cell indefinitely; 60 seconds is a generous cap.
response = requests.post(api_url, headers=headers, json=payload, timeout=60)

# Print the response: the decoded JSON body on success, otherwise the HTTP status
# and the raw body so the failure reason is visible.
if response.status_code == 200:
    print(response.json())
else:
    print(f"Error: {response.status_code}, {response.text}")

# Parameters:

## System Prompt

What: A preamble or context-setting input to guide the model's behavior.

Why: Helps in defining the role of the assistant, e.g., "You are a friendly chatbot."

## User Prompt

What: The actual question or task input provided by the user.

Why: This is the primary input the model will respond to.

## Max Length

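What: The maximum number of tokens (words or subwords) the model may produce. Note that in Hugging Face Transformers, `max_length` counts the prompt tokens as well as the generated ones; use `max_new_tokens` to limit only the newly generated text.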

Why: Limits the length of the response to save costs and focus on concise answers.

## Temperature

What: Controls the randomness of the model's output.

Lower values (e.g., 0.2) = more deterministic and focused responses.

Higher values (e.g., 0.8) = more creative and diverse responses.


## Top-p (Nucleus Sampling)

What: Probability threshold for selecting tokens. The model samples only from the smallest set of most likely tokens whose cumulative probability reaches p.

Why: Offers better control over randomness without limiting diversity as much as top-k.

## Top-k Sampling

What: Limits sampling to the k most likely next tokens.

Why: Reduces randomness by focusing on the most likely options.

## Repetition Penalty

What: Penalizes the model for repeating the same phrases.

Why: Prevents repetitive or verbose outputs.

## Stop Sequences

What: Strings or tokens that stop the model's response when encountered.

Why: Ensures responses end appropriately (e.g., after completing a sentence or phrase).




In [2]:
class HuggingFaceWrapper:
    """Minimal client that POSTs prompts to the Hugging Face Inference API.

    The instance only stores the access token; every call to ``query`` issues
    one HTTP POST to ``https://api-inference.huggingface.co/models/{model}``.
    """

    def __init__(self, api_token):
        """Store the token that is sent as ``Authorization: Bearer <api_token>``."""
        self.api_token = api_token

    def query(self, model, prompt, max_length=100, temperature=0.7, top_p=0.9,
              top_k=50, repetition_penalty=1.2, stop_sequences=None, debug=False,
              timeout=60):
        """Send ``prompt`` to ``model`` and return the decoded response.

        Args:
            model: Model identifier appended to the Inference API URL.
            prompt: Text sent as the ``inputs`` field of the payload.
            max_length: Sent as the ``max_length`` generation parameter.
            temperature: Sent as the ``temperature`` generation parameter.
            top_p: Sent as the ``top_p`` generation parameter.
            top_k: Sent as the ``top_k`` generation parameter.
            repetition_penalty: Sent as the ``repetition_penalty`` parameter.
            stop_sequences: Optional list sent as ``stop_sequences``; omitted
                from the payload when ``None``.
            debug: When true, print the payload before sending it.
            timeout: Seconds passed to ``requests.post``; ``None`` disables the
                timeout (the previous behaviour).

        Returns:
            The parsed JSON body when the API answers with HTTP 200, otherwise
            ``{"error": <raw response text>}``.

        Any generation parameter passed as ``None`` is left out of the request
        instead of being serialized as JSON ``null``.
        """
        api_url = f"https://api-inference.huggingface.co/models/{model}"
        headers = {"Authorization": f"Bearer {self.api_token}"}

        candidate_parameters = {
            "max_length": max_length,
            "temperature": temperature,
            "top_p": top_p,
            "top_k": top_k,
            "repetition_penalty": repetition_penalty,
            "stop_sequences": stop_sequences,
        }
        payload = {
            "inputs": prompt,
            # Unset options (notably the default stop_sequences=None) would
            # otherwise be transmitted as explicit nulls.
            "parameters": {
                key: value
                for key, value in candidate_parameters.items()
                if value is not None
            },
        }

        if debug:
            print(f"Payload: {payload}")

        # requests applies no timeout by default; without one a stalled
        # connection would block the caller indefinitely.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

        if response.status_code == 200:
            return response.json()
        else:
            return {"error": response.text}

In [None]:
# Create the client; it just keeps the token for later requests.
hf = HuggingFaceWrapper(api_token=api_token)

# Run one generation with every sampling option set explicitly.
# debug=True makes the wrapper print the payload before it is sent.
response = hf.query(
    "gpt2",
    "Once upon a time in a magical forest,",
    debug=True,
    stop_sequences=["\n"],
    repetition_penalty=1.1,
    top_k=40,
    top_p=0.95,
    temperature=0.8,
    max_length=50,
)