In [None]:
# pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir

In [1]:
# pip install llama-cpp-python 
# huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct-GGUF qwen2.5-1.5b-instruct-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False

from llama_cpp import Llama

# Load the quantized Qwen2.5 chat model from a local GGUF file.
#
# Only constructor-level options belong here. Sampling options (temperature,
# top_p, max_tokens, repeat_penalty, stop) are per-request arguments of
# create_chat_completion(); Llama() silently ignores unknown keyword
# arguments, so they are deliberately not set at construction time.
llm = Llama(
    model_path="../LLM-Quantize-Model/qwen2.5-1.5b-instruct-q5_k_m.gguf",
    # n_gpu_layers is not set, so the library default applies (CPU inference).
    n_ctx=32768,    # context window in tokens (prompt + generated text share it)
    verbose=False,  # silence llama.cpp load/timing logs
)

# Interactive chat loop: read a query from stdin, stream the model's answer
# to stdout, and repeat until the user types "exit".

# The system prompt is identical on every turn, so define it once. The text
# is sent to the model verbatim (including its indentation).
SYSTEM_PROMPT = """You are working as helpful assistant and your job is to understand aim, goal and objective of user query, use chain of thought to break down problem into subproblem, find best optimize solution and while combinting sub-solution check answer relevency with query and give response,

                Instruction:
                    ** Do not try to short the answer, explain as much as you can without thinking token count **

                """
EXIT_COMMAND = "exit"
SEPARATOR = "-" * 150

user_input = ""
response_content = ""  # full text of the most recent answer

while True:
    print("Enter your query")
    user_input = input()

    # Check for the exit command *before* calling the model, so that typing
    # "exit" ends the session instead of being answered as a query.
    if user_input.strip().lower() == EXIT_COMMAND:
        break
    # Nothing to ask: prompt again rather than spending a generation on it.
    if not user_input.strip():
        continue
    print("Query:", user_input)

    # Each turn is independent: system prompt + the current query only.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_input},
    ]

    # stream=True makes create_chat_completion return an iterator of chunks.
    response = llm.create_chat_completion(
        messages=messages,
        temperature=1.2,
        repeat_penalty=1.178,
        stop=['<|im_end|>'],
        max_tokens=1500,
        stream=True,
    )

    print("AI response:")
    pieces = []
    for chunk in response:
        # Not every chunk carries text (e.g. role-only or empty deltas), so
        # look the content up defensively instead of catching exceptions.
        choices = chunk.get("choices") or []
        if not choices:
            continue
        content = choices[0].get("delta", {}).get("content")
        if content:
            pieces.append(content)
            print(content, end="", flush=True)  # echo text as it arrives
    response_content = "".join(pieces)

    # Streamed text is printed without a trailing newline; end that line
    # first so the separator starts on its own line.
    print()
    print(SEPARATOR)

Enter your query
Query: Explain LLMs
AI response:
Large Language Models (LLMs) are artificial intelligence models that can generate human-like responses to a wide range of questions and prompts. These models use deep learning algorithms, such as transformers or neural networks, to analyze vast amounts of text data.
They work by processing large datasets containing language examples, which the model uses to learn patterns in natural language. This enables LLMs to understand context, grammar, syntax, semantics, and more when generating responses.

LLMs are designed with flexibility and adaptability in mind. They can be trained on diverse corpora or tasks that involve understanding human dialogue, such as writing assistant software for writers.
The complexity of the model also allows it to process information from different sources; for example, news articles, social media posts, chat logs - LLMs are capable of generating detailed summaries and reports based on these inputs.--------------

### Vision Models

In [None]:
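# pip install llama-cpp-python
# NOTE: the command below downloads the text-only Qwen2.5 model used by the chat cell above.
# This cell loads ../LLM-Quantize-Model/Qwen2-VL-2B-Q8_0.gguf, which must be obtained separately.
# huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct-GGUF qwen2.5-1.5b-instruct-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False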

from llama_cpp import Llama

# Load the quantized Qwen2-VL model from a local GGUF file.
#
# Sampling options (temperature, top_p, max_tokens, repeat_penalty, stop) are
# per-request arguments of create_chat_completion(); Llama() silently ignores
# unknown keyword arguments, so they are deliberately not set here.
#
# NOTE(review): llama-cpp-python's documented multimodal setup passes a
# chat_handler (built from the model's mmproj/CLIP file) to Llama(). None is
# configured here, so the image in the request below is presumably never
# decoded -- confirm, and add the handler that matches this model.
llm = Llama(
    model_path="../LLM-Quantize-Model/Qwen2-VL-2B-Q8_0.gguf",
    # n_gpu_layers=4,  # Specify GPU layers or modify based on your setup
    n_ctx=32768,    # context window in tokens
    verbose=False,  # silence llama.cpp load/timing logs
)

# Text instruction sent alongside the image. This is the single source of
# the prompt: it is what the message below actually sends.
user_input = "Describe this image."

# Remote image the model is asked to describe.
IMAGE_URL = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"

# One user turn made of two content parts: the image, then the instruction.
# NOTE(review): llama-cpp-python's multimodal examples use OpenAI-style
# {"type": "image_url", "image_url": {"url": ...}} parts -- confirm which
# shape this model's chat template/handler expects.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": IMAGE_URL,
            },
            {"type": "text", "text": user_input},
        ],
    }
]

# Non-streaming call: the complete answer comes back in a single response.
response = llm.create_chat_completion(
    messages=messages,
    temperature=1.2,
    repeat_penalty=1.178,
    stop=['<|im_end|>'],
    max_tokens=1500,
)

# Pull the assistant's text out of the first choice and show it.
response_content = response['choices'][0]['message']['content']
print(response_content)