In [10]:
import torch
import time
import textwrap
import re

In [2]:
from transformers import TRANSFORMERS_CACHE

print(TRANSFORMERS_CACHE)

C:\Users\zly20\.cache\huggingface\hub


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
cache_dir = "E:/transformers.cache"  # Specify your desired cache directory
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

In [4]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=True
)

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map="auto",
    use_cache=True,
    cache_dir=cache_dir
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [19]:
def generate_answer(question):
    # Tokenize input question
    inputs = tokenizer(question, return_tensors="pt").to("cuda")

    # Start timing
    start_time = time.time() 

    # Generate response
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        do_sample=True,
        num_beams=1,
    )

    end_time = time.time()  # End timing

    # Count total tokens generated
    num_tokens = generated_ids.shape[1]  # Number of generated tokens

    # Compute tokens per second
    tps = num_tokens / (end_time - start_time)
    print(f"Tokens per second: {tps:.2f}")
    
    # Decode output
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # empty cache so that you do generate more results w/o memory crashing
    # particularly important on Colab – memory management is much more straightforward
    # when running on an inference service
    return outputs

In [26]:
question = """

Consider the multiple linear regression model:

$$Y_i = \beta^T x_i + \epsilon_i$$

for $i = 1, ..., n$, where $\epsilon_i$'s are $iid$ $N(0, \sigma^2)$, $\beta = (\beta_0, \beta_1, ..., \beta_p)^T$ is the vector of regression coefficients, and $x_i = (1, x_{i1}, x_{i2}, ..., x_{ip})^T$ is the vector of predictors for the i-th observation.


## QUESTION 2 (a)

Write down the likelihood function and the log-likelihood function for the multiple linear regression model.


## QUESTION 2 (a) Solution:

Write your solution here in interprtable LaTex style...

"""
answer = generate_answer(question)

Tokens per second: 43.94


In [27]:
print(answer)

["\n\nConsider the multiple linear regression model:\n\n$$Y_i = \x08eta^T x_i + \\epsilon_i$$\n\nfor $i = 1, ..., n$, where $\\epsilon_i$'s are $iid$ $N(0, \\sigma^2)$, $\x08eta = (\x08eta_0, \x08eta_1, ..., \x08eta_p)^T$ is the vector of regression coefficients, and $x_i = (1, x_{i1}, x_{i2}, ..., x_{ip})^T$ is the vector of predictors for the i-th observation.\n\n\n## QUESTION 2 (a)\n\nWrite down the likelihood function and the log-likelihood function for the multiple linear regression model.\n\n\n## QUESTION 2 (a) Solution:\n\nWrite your solution here in interprtable LaTex style...\n\n</think>\n\nTo derive the likelihood function and the log-likelihood function for the multiple linear regression model, we start by considering the probability density function of the normal distribution. Given that the error terms $\\epsilon_i$ are independently and identically distributed (iid) as $N(0, \\sigma^2)$, the probability density function for each $\\epsilon_i$ is:\n\n$$\nf(\\epsilon_i) = \

In [28]:
import sqlparse

In [34]:
prompt = """### Task
Generate a SQL query to answer [QUESTION]{question}[/QUESTION]

### Instructions
- If you cannot answer the question with the available database schema, return 'I do not know'
- Remember that revenue is price multiplied by quantity
- Remember that cost is supply_price multiplied by quantity

### Database Schema
This query will run on a database whose schema is represented in this string:
CREATE TABLE products (
  product_id INTEGER PRIMARY KEY, -- Unique ID for each product
  name VARCHAR(50), -- Name of the product
  price DECIMAL(10,2), -- Price of each unit of the product
  quantity INTEGER  -- Current quantity in stock
);

CREATE TABLE customers (
   customer_id INTEGER PRIMARY KEY, -- Unique ID for each customer
   name VARCHAR(50), -- Name of the customer
   address VARCHAR(100) -- Mailing address of the customer
);

CREATE TABLE salespeople (
  salesperson_id INTEGER PRIMARY KEY, -- Unique ID for each salesperson
  name VARCHAR(50), -- Name of the salesperson
  region VARCHAR(50) -- Geographic sales region
);

CREATE TABLE sales (
  sale_id INTEGER PRIMARY KEY, -- Unique ID for each sale
  product_id INTEGER, -- ID of product sold
  customer_id INTEGER,  -- ID of customer who made purchase
  salesperson_id INTEGER, -- ID of salesperson who made the sale
  sale_date DATE, -- Date the sale occurred
  quantity INTEGER -- Quantity of product sold
);

CREATE TABLE product_suppliers (
  supplier_id INTEGER PRIMARY KEY, -- Unique ID for each supplier
  product_id INTEGER, -- Product ID supplied
  supply_price DECIMAL(10,2) -- Unit price charged by supplier
);

-- sales.product_id can be joined with products.product_id
-- sales.customer_id can be joined with customers.customer_id
-- sales.salesperson_id can be joined with salespeople.salesperson_id
-- product_suppliers.product_id can be joined with products.product_id

### Answer
Given the database schema, here is the SQL query that answers [QUESTION]{question}[/QUESTION]
[SQL]
"""

In [38]:
def generate_query(question):
    updated_prompt = prompt.format(question=question)
    inputs = tokenizer(updated_prompt, return_tensors="pt").to("cuda")

    # Start timing
    start_time = time.time() 

    # Generate response
    generated_ids = model.generate(
        **inputs,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=1024,
        do_sample=True,
        num_beams=1,
    )

    end_time = time.time()  # End timing

    # Count total tokens generated
    num_tokens = generated_ids.shape[1]  # Number of generated tokens

    # Compute tokens per second
    tps = num_tokens / (end_time - start_time)
    print(f"Tokens per second: {tps:.2f}")
    
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # empty cache so that you do generate more results w/o memory crashing
    # particularly important on Colab – memory management is much more straightforward
    # when running on an inference service
    return sqlparse.format(outputs[0].split("[SQL]")[-1], reindent=True)

In [39]:
question = "What was the total number of goods sold in Canada last quarter?"
generated_sql = generate_query(question)

Tokens per second: 64.17


In [40]:
print(generated_sql)


SELECT SUM(s.quantity) AS total_goods_sold
FROM sales s
JOIN products p ON s.product_id = p.product_id
WHERE s.sale_date >= DATEADD(qq, -1, GETDATE())
  AND s.sale_date < DATEADD(qq, 0, GETDATE())
  AND p.name LIKE '%Canada%';

[/SQL] </think> To determine the total number of goods sold in Canada last quarter,
                                                                           we need to query the sales table
and filter the data based on the product name
and the sale date. Here's the step-by-step explanation of the SQL query:

1. **SELECT SUM(s.quantity)**: This part of the query calculates the total number of goods sold by summing up the quantity from the sales table.

2. **AS total_goods_sold**: This aliases the result column for clarity, labeling it as "total_goods_sold".

3. **FROM sales s**: This specifies the table we are querying, which is the "sales" table aliased as "s".

4. **JOIN products p ON s.product_id = p.product_id**: This joins the sales table with the product