In [1]:
# Append the ch07 main-chapter code directory to sys.path so that modules
# stored there can be imported from this notebook.
# NOTE: the directory is derived from the current working directory, so the
# result depends on where the kernel was started.
import sys
import os
# Directory two levels above the current working directory
package_path = os.path.dirname(os.path.dirname(os.getcwd()))
file_path = os.path.join(package_path, "ch07", "01_main-chapter-code")
print(file_path)
sys.path.append(file_path)

import torch
# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
# torch.backends.mps is missing on older PyTorch builds, so it is looked up
# defensively instead of being accessed unconditionally.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

/Users/young/project/llmProject/LLMs-from-scratch-CN/ch07/01_main-chapter-code


In [2]:
from importlib.metadata import version

# Third-party packages whose installed versions are printed below
pkgs = ["tqdm",    # progress bar
        ]

# Print the installed version of every listed package
for p in pkgs:
    print(f"{p} version: {version(p)}")

tqdm version: 4.67.1


# 使用Ollama的REST API

In [4]:
def format_input(entry):
    """Format one dataset entry as an instruction prompt.

    Args:
        entry: Mapping with an "instruction" key and an optional "input" key.

    Returns:
        The fixed preamble followed by a "### Instruction:" section and, only
        when the entry has a non-empty "input", a "### Input:" section.
    """
    instruction_text = (
        "Below is an instruction that describes a task. Write a response that "
        "appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    # .get() makes an entry without an "input" key behave like an entry whose
    # "input" is empty instead of raising KeyError.
    extra_input = entry.get("input")
    input_text = f"\n\n### Input:\n{extra_input}" if extra_input else ""

    return instruction_text + input_text

In [3]:
import json
import urllib.request

def query_model_ollama(prompt, model="llama3.1", url="http://localhost:11434/api/chat", timeout=None):
    """Send a single-turn chat request to an Ollama server and return the reply text.

    Args:
        prompt: Text sent as the only "user" message.
        model: Model name placed in the request payload.
        url: Chat endpoint that receives the POST request.
        timeout: Optional timeout in seconds forwarded to ``urlopen``. ``None``
            (the default) leaves urllib's own default in effect.

    Returns:
        The concatenation of ``message.content`` from every JSON line of the
        response, in the order received.

    Raises:
        RuntimeError: If a response line carries an "error" field.
        urllib.error.URLError: If the request itself fails.
    """
    # Fixed seed and zero temperature are requested on every call
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "options": {
            "seed": 123,
            "temperature": 0,
        }
    }

    # Serialise the payload to JSON and encode it as bytes
    payload = json.dumps(data).encode("utf-8")

    # Build the POST request with the JSON content-type header
    request = urllib.request.Request(url, data=payload, method="POST")
    request.add_header("Content-Type", "application/json")

    # Only forward a timeout when the caller asked for one
    open_kwargs = {} if timeout is None else {"timeout": timeout}

    chunks = []
    with urllib.request.urlopen(request, **open_kwargs) as response:
        # The body is read as one JSON document per line
        for raw_line in response:
            line = raw_line.decode("utf-8").strip()
            if not line:
                # A blank separator line is not a JSON document; skip it
                continue
            response_json = json.loads(line)
            if "error" in response_json:
                # Surface server-side failures with their message instead of
                # an opaque KeyError on the missing "message" field
                raise RuntimeError(f"Ollama returned an error: {response_json['error']}")
            chunks.append(response_json["message"]["content"])

    return "".join(chunks)


# Smoke test: send one prompt using the default model and URL, then print the reply
result = query_model_ollama("What do Llamas eat?")
print(result)

Llamas are herbivores, which means they primarily eat plants and plant-based foods. Their diet consists of:

1. **Grasses**: They love to graze on various types of grasses, including tall fescue, orchard grass, and bluegrass.
2. **Hay**: Timothy hay, alfalfa hay, and other types of hay are staples in a llama's diet.
3. **Fruits**: Apples, carrots, and sweet potatoes are all treats that llamas enjoy.
4. **Grains**: Oats, corn, and barley can be given to llamas as supplements or treats.
5. **Leafy greens**: Llamas will eat leafy greens like kale, spinach, and collard greens.

In the wild, llamas would typically roam in herds and graze on a variety of plants, including shrubs and trees. In captivity, their diet is often supplemented with commercial llama feed or pellets to ensure they receive all the necessary nutrients.

Some interesting facts about llama eating habits:

* Llamas have a three-part stomach, similar to cows, which allows them to digest plant material more efficiently.
* Th

In [10]:
# 生成数据集
import random
from tqdm import tqdm
import concurrent.futures
import time

def process_single_entry_ollama(args, max_retries=3, retry_delay=1):
    """Ask the Ollama model for a more polite or more impolite rewrite of one entry.

    Args:
        args: Tuple ``(i, entry)``: the entry's position in the dataset and the
            entry itself, which must provide "output" plus whatever
            ``format_input`` reads.
        max_retries: Total number of attempts before giving up. At least one
            attempt is always made.
        retry_delay: Seconds to wait after the first failure; the wait is
            doubled after every further failure.

    Returns:
        A dict with the keys "index", "politeness" and "response". When every
        attempt fails, "politeness" and "response" are both None.
    """
    i, entry = args
    politeness = random.choice(["polite", "impolite"])
    # Every fragment ends in a space so the joined prompt keeps its sentences
    # apart instead of producing "polite.Keep ... minimal.Only ...".
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"slightly rewrite the output to be more {politeness}. "
        "Keep the modification minimal. "
        "Only return the generated response and nothing else."
    )

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            response = query_model_ollama(prompt)
        except Exception as e:
            if attempt < attempts - 1:
                time.sleep(retry_delay)
                retry_delay *= 2  # exponential backoff
                continue
            # Out of attempts: report the failure and return a marker record
            print(f"处理条目 {i} 失败: {str(e)}")
            return {
                "index": i,
                "politeness": None,
                "response": None
            }
        return {
            "index": i,
            "politeness": politeness,
            "response": response
        }

def generate_model_responses_ollama(json_data, max_workers=5):
    """Add "chosen"/"rejected" fields to the entries using a pool of worker threads.

    Every entry is handed to ``process_single_entry_ollama`` as ``(index, entry)``.
    Successful results are written back into ``json_data`` in place: a "polite"
    rewrite becomes "chosen" with the original "output" as "rejected"; any other
    rewrite becomes "rejected" with the original "output" as "chosen". Results
    whose "politeness" is None are dropped, and the number of entries left
    without a result is printed as a warning. Nothing is returned.
    """
    # One (index, entry) job per dataset entry
    jobs = list(enumerate(json_data))

    succeeded = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(process_single_entry_ollama, job) for job in jobs]

        # Progress bar advances as futures finish, in completion order
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc="处理数据",
        )
        for finished in progress:
            outcome = finished.result()
            if outcome["politeness"] is None:
                continue
            succeeded.append(outcome)

    # Write the collected rewrites back into the dataset
    for outcome in succeeded:
        record = json_data[outcome["index"]]
        rewritten = outcome["response"]
        if outcome["politeness"] == "polite":
            record["chosen"] = rewritten
            record["rejected"] = record["output"]
        else:
            record["rejected"] = rewritten
            record["chosen"] = record["output"]

    # Report how many entries ended up without a result
    done_indices = {outcome["index"] for outcome in succeeded}
    unprocessed = set(range(len(json_data))) - done_indices
    if unprocessed:
        print(f"警告: {len(unprocessed)} 个条目未成功处理")

def generate_model_responses_ollama_orig(json_data):
    """Sequentially add "chosen"/"rejected" fields to every entry via Ollama.

    For each entry a politeness direction is drawn at random and the model is
    asked to rewrite the entry's "output" in that direction. A "polite" rewrite
    is stored as "chosen" with the original output as "rejected"; an "impolite"
    rewrite is stored as "rejected" with the original output as "chosen". The
    entries are modified in place and nothing is returned.

    Args:
        json_data: Sequence of entry dicts providing "output" plus whatever
            ``format_input`` reads.
    """
    for entry in tqdm(json_data, desc="Writing entries"):
        politeness = random.choice(["polite", "impolite"])
        # Every fragment ends in a space so the joined prompt keeps its
        # sentences apart instead of producing "polite.Keep ... minimal.Only".
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"slightly rewrite the output to be more {politeness}. "
            "Keep the modification minimal. "
            "Only return the generated response and nothing else."
        )
        response = query_model_ollama(prompt)

        if politeness == "polite":
            entry["chosen"] = response
            entry["rejected"] = entry["output"]
        else:
            entry["rejected"] = response
            entry["chosen"] = entry["output"]

In [11]:
from pathlib import Path

# Location of the chapter-7 instruction dataset, relative to the working directory
json_file = Path("..", "..", "ch07", "01_main-chapter-code", "instruction-data.json")

# Read the file as UTF-8 explicitly so the load does not depend on the
# platform's locale-dependent default encoding.
with open(json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

print("Number of entries:", len(json_data))

Number of entries: 1100


In [12]:
# Run the concurrent generator; it adds "chosen"/"rejected" to the entries of json_data in place
generate_model_responses_ollama(json_data)

处理数据: 100%|██████████| 1100/1100 [14:20<00:00,  1.28it/s]


In [13]:
# Write the current contents of json_data to disk as JSON indented by four spaces
with open("instruction-data-with-preference-ollama.json", "w") as file:
    json.dump(json_data, file, indent=4)

# 使用deepseek构造数据集

In [5]:
from openai import OpenAI
from config_manager import config_manager

# Read the API key from the "api" section of the project configuration
config = config_manager.config
api_key = config.get('api', {}).get('deepseek_api_key')
if not api_key:
    raise ValueError("需要提供API密钥")

# Initialise the OpenAI client so that it talks to the DeepSeek API
# NOTE(review): 'deepseek_api_url' is not validated; when it is missing,
# base_url is None. Confirm which endpoint the client would then use.
client = OpenAI(
    api_key=api_key,
    base_url=config.get('api', {}).get('deepseek_api_url')
)

In [None]:
import urllib.request
import json

def query_model(prompt, client, config):
    """Send a single-turn chat request through an OpenAI-style client.

    Args:
        prompt: Text sent as the only "user" message.
        client: Object exposing ``chat.completions.create(**kwargs)``.
        config: Mapping whose ``config["api"]["model"]`` names the model;
            "deepseek-chat" is used when it is absent.

    Returns:
        The reply with surrounding whitespace removed. When the whole reply is
        a JSON object, array or string, the decoded dict, list or str is
        returned instead. Scalar JSON such as "42", "true" or "null" is kept
        as text so that callers collecting prose never receive an int, bool
        or None. An empty string is returned when the reply has no content.
    """
    # Build the request; temperature 0 is requested on every call
    data = {
        "model": config.get('api', {}).get('model', 'deepseek-chat'),
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": 0.
    }
    response = client.chat.completions.create(**data)

    # Treat missing content as empty text so .strip() cannot fail on None
    content = (response.choices[0].message.content or "").strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Plain prose: hand back the cleaned text unchanged
        return content

    # Bare numbers, booleans and null are almost certainly prose answers that
    # happen to be valid JSON; keep them as the text the model produced.
    if isinstance(parsed, (dict, list, str)):
        return parsed
    return content

# Smoke test: send one prompt through the configured client and print the reply
result = query_model("What do Llamas eat?", client, config)
print(result)

Llamas are herbivores with a diet primarily consisting of grasses, hay, and other plant materials. Here’s a breakdown of their typical diet:

### **1. Main Food Sources:**
   - **Grasses & Hay:** The bulk of their diet consists of fresh pasture grasses (if available) or high-quality grass hay (such as timothy, orchard, or brome hay).
   - **Forage:** They graze on a variety of plants, including clover and other leafy greens.

### **2. Supplemental Foods:**
   - **Pellets/Grain:** Some llamas are given small amounts of specially formulated llama or alpaca pellets to ensure balanced nutrition, especially in winter or for pregnant/nursing females.
   - **Vegetables & Fruits (in moderation):** Carrots, apples, and leafy greens can be given as treats, but too much sugar (e.g., from fruits) can cause digestive issues.

### **3. Minerals & Salt:**
   - Llamas need access to a **mineral block** or loose mineral mix formulated for camelids (low in copper, as excess copper is toxic to them).
   

## 加载JSON数据

In [7]:
from pathlib import Path

# Location of the chapter-7 instruction dataset, relative to the working directory
json_file = Path("..", "..", "ch07", "01_main-chapter-code", "instruction-data.json")

# Read the file as UTF-8 explicitly so the load does not depend on the
# platform's locale-dependent default encoding.
with open(json_file, "r", encoding="utf-8") as file:
    json_data = json.load(file)

print("Number of entries:", len(json_data))

Number of entries: 1100


In [8]:
# Display the first entry to inspect the fields of a record
json_data[0]

{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.',
 'input': 'freind --> friend',
 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}

In [11]:
# 测试生成数据
import random

# Try the rewrite prompt on the first five entries and print each model
# rewrite next to the dataset's own answer.
for entry in json_data[:5]:
    politeness = random.choice(["polite", "impolite"])
    # Every fragment ends in a space so the joined prompt keeps its words and
    # sentences apart instead of producing "`,and" or "polite.Keep".
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"slightly rewrite the output to be more {politeness}. "
        "Keep the modification minimal. "
        "Only return the generated response and nothing else."
    )
    print("\nDataset response:")
    print(">>", entry['output'])
    print(f"\n{politeness} response:")
    print(">>", query_model(prompt, client, config))


Dataset response:
>> The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".

impolite response:
>> The spelling of "freind" is wrong, it's "friend." Learn to spell.

Dataset response:
>> He goes to the park every day.

polite response:
>> He goes to the park every day, thank you.

Dataset response:
>> 45 kilometers is 45000 meters.

polite response:
>> 45 kilometers is equal to 45000 meters.

Dataset response:
>> Although it was raining, they went for a walk.

impolite response:
>> Although it was raining, they still went for a walk, like idiots.

Dataset response:
>> 1, 4, 9, 16, 25, 36, 49, 64, 81, 100.

impolite response:
>> Here, take these stupid square numbers: 1, 4, 9, 16, 25, 36, 49, 64, 81, 100. Ugh.


In [11]:
# 生成数据集
import random
from tqdm import tqdm
import concurrent.futures
import time

def process_single_entry(args, max_retries=3, retry_delay=1):
    """Ask the chat model for a more polite or more impolite rewrite of one entry.

    Args:
        args: Tuple ``(i, entry, client, config)``: the entry's position in
            the dataset, the entry itself (must provide "output" plus whatever
            ``format_input`` reads), and the client and config that are
            forwarded to ``query_model``.
        max_retries: Total number of attempts before giving up. At least one
            attempt is always made.
        retry_delay: Seconds to wait after the first failure; the wait is
            doubled after every further failure.

    Returns:
        A dict with the keys "index", "politeness" and "response". When every
        attempt fails, "politeness" and "response" are both None.
    """
    i, entry, client, config = args
    politeness = random.choice(["polite", "impolite"])
    # Every fragment ends in a space so the joined prompt keeps its sentences
    # apart instead of producing "polite.Keep ... minimal.Only ...".
    prompt = (
        f"Given the input `{format_input(entry)}` "
        f"and correct output `{entry['output']}`, "
        f"slightly rewrite the output to be more {politeness}. "
        "Keep the modification minimal. "
        "Only return the generated response and nothing else."
    )

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            response = query_model(prompt, client, config)
        except Exception as e:
            if attempt < attempts - 1:
                time.sleep(retry_delay)
                retry_delay *= 2  # exponential backoff
                continue
            # Out of attempts: report the failure and return a marker record
            print(f"处理条目 {i} 失败: {str(e)}")
            return {
                "index": i,
                "politeness": None,
                "response": None
            }
        return {
            "index": i,
            "politeness": politeness,
            "response": response
        }

def generate_model_responses(json_data, max_workers=5):
    """Add "chosen"/"rejected" fields to the entries using a pool of worker threads.

    Every entry is handed to ``process_single_entry`` as
    ``(index, entry, client, config)``, where ``client`` and ``config`` are the
    notebook-level globals. Successful results are written back into
    ``json_data`` in place: a "polite" rewrite becomes "chosen" with the
    original "output" as "rejected"; any other rewrite becomes "rejected" with
    the original "output" as "chosen". Results whose "politeness" is None are
    dropped, and the number of entries left without a result is printed as a
    warning. Nothing is returned.
    """
    # One (index, entry, client, config) job per dataset entry
    jobs = [(idx, item, client, config) for idx, item in enumerate(json_data)]

    succeeded = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(process_single_entry, job) for job in jobs]

        # Progress bar advances as futures finish, in completion order
        progress = tqdm(
            concurrent.futures.as_completed(pending),
            total=len(pending),
            desc="处理数据",
        )
        for finished in progress:
            outcome = finished.result()
            if outcome["politeness"] is None:
                continue
            succeeded.append(outcome)

    # Write the collected rewrites back into the dataset
    for outcome in succeeded:
        record = json_data[outcome["index"]]
        rewritten = outcome["response"]
        if outcome["politeness"] == "polite":
            record["chosen"] = rewritten
            record["rejected"] = record["output"]
        else:
            record["rejected"] = rewritten
            record["chosen"] = record["output"]

    # Report how many entries ended up without a result
    done_indices = {outcome["index"] for outcome in succeeded}
    unprocessed = set(range(len(json_data))) - done_indices
    if unprocessed:
        print(f"警告: {len(unprocessed)} 个条目未成功处理")

def generate_model_responses_orig(json_data):
    """Sequentially add "chosen"/"rejected" fields to every entry via ``query_model``.

    For each entry a politeness direction is drawn at random and the model is
    asked to rewrite the entry's "output" in that direction. A "polite" rewrite
    is stored as "chosen" with the original output as "rejected"; an "impolite"
    rewrite is stored as "rejected" with the original output as "chosen". The
    entries are modified in place and nothing is returned. The notebook-level
    ``client`` and ``config`` globals are passed to ``query_model``.

    Args:
        json_data: Sequence of entry dicts providing "output" plus whatever
            ``format_input`` reads.
    """
    # Progress label spelling fixed ("entryies" -> "entries")
    for entry in tqdm(json_data, desc="Writing entries"):
        politeness = random.choice(["polite", "impolite"])
        # Every fragment ends in a space so the joined prompt keeps its
        # sentences apart instead of producing "polite.Keep ... minimal.Only".
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"slightly rewrite the output to be more {politeness}. "
            "Keep the modification minimal. "
            "Only return the generated response and nothing else."
        )
        response = query_model(prompt, client, config)

        if politeness == "polite":
            entry["chosen"] = response
            entry["rejected"] = entry["output"]
        else:
            entry["rejected"] = response
            entry["chosen"] = entry["output"]

In [None]:
# Run the concurrent generator; it adds "chosen"/"rejected" to the entries of json_data in place
generate_model_responses(json_data)

处理数据: 100%|██████████| 1100/1100 [1:10:07<00:00,  3.83s/it]


In [None]:
# Write the current contents of json_data to disk as JSON indented by four spaces
with open("instruction-data-with-preference-deepseek.json", "w") as file:
    json.dump(json_data, file, indent=4)