微调开源大模型 Llama 3.1-8B

训练集+验证集 JSON 格式文件准备

In [2]:
import pandas as pd
import json
import os
import random

# CSV files to convert. Each name lists the biochar properties that the
# file provides as prediction targets, joined by " + ".
file_names = [
    "Ash + CHNO.csv",
    "Ash.csv",
    "CHNO.csv",
    "grain size.csv",
    "pH.csv",
    "specific surface area + Ash + CHNO + pH + grain size.csv",
    "specific surface area + pH + grain size.csv",
    "specific surface area.csv",
    "yield + Ash + CHNO.csv",
    "yield + specific surface area + Ash + CHNO + pH + grain size.csv",
    "yield + specific surface area + pH + grain size.csv",
    "yield.csv",
]

# Directory holding the CSV files above; the generated JSON files are
# written to the same directory. Adjust to your dataset location.
data_dir = "训练集+验证集分类"

# Accumulators for the records of the two splits.
train_data = []
validation_data = []

def generate_prediction_statement(file_name):
    """Build the sentence asking for the properties named in ``file_name``.

    Args:
        file_name: Name of a dataset file. Every known property token that
            occurs in it as a (case-sensitive) substring is requested.

    Returns:
        A sentence listing the requested properties, or a generic request
        for "properties" when no known token occurs in ``file_name``.
    """
    # (token searched for in the file name, wording used in the sentence).
    # The order of this table fixes the order of properties in the sentence.
    token_to_phrase = (
        ("yield", "yield"),
        ("specific surface area", "specific surface area"),
        ("Ash", "ash content"),
        ("CHNO", "chemical composition"),
        ("pH", "pH"),
        ("grain size", "grain size"),
    )
    requested = [phrase for token, phrase in token_to_phrase if token in file_name]

    if not requested:
        return "Based on the above information, infer the biochar's properties."

    if len(requested) == 1:
        properties_list = requested[0]
    else:
        # "a, b and c": commas between all but the last item.
        *leading, final = requested
        properties_list = ", ".join(leading) + " and " + final
    return f"Based on the above information, infer the biochar's {properties_list}."

def create_json_data(df, file_name):
    """Turn each row of ``df`` into one instruction/input/output record.

    For every row, the feedstock description, raw-material composition and
    pyrolysis conditions are rendered into one English sentence stored under
    "instruction"; the request naming the properties to infer is stored
    separately under "input"; the product properties are rendered into the
    answer stored under "output". Which properties appear in the answer is
    decided by substring checks on ``file_name``.

    Args:
        df: Table whose rows are read with ``row.get(column, '')``, so a
            column that is absent contributes an empty string.
        file_name: Name of the source file; its text selects the target
            properties ("yield", "specific surface area", "Ash", "CHNO",
            "pH", "grain size").

    Returns:
        A list with one dict per row, each holding the keys "instruction",
        "input" and "output".
    """
    # NOTE(review): a cell that is present but empty is presumably loaded by
    # pandas as NaN and would then appear as the text "nan" in the sentences
    # below — confirm that this is acceptable for the generated dataset.
    records = []
    for _, row in df.iterrows():
        # Where the feedstock came from and how it was handled.
        feedstock_text = (
            f"The biomass resource used here is: {row.get('Biomass resources', '')}, sourced from: {row.get('Raw material sources', '')}, "
            f"pre-treatment method is: {row.get('Pre-processing methods', '')}, preparation equipment used is: {row.get('Preparation equipment', '')}, "
            f"other treatments include: {row.get('Other processing', '')}, modification related information: {row.get('modified', '')}, "
        )
        # Composition of the raw material.
        composition_text = (
            f"the cellulose content by weight percentage in the raw material is: {row.get('Cellulose content', '')}, "
            f"hemicellulose content by weight percentage is: {row.get('Hemicellulose content', '')}, "
            f"lignin content by weight percentage is: {row.get('Lignin content', '')}, "
            f"ash content by weight percentage is: {row.get('Ash content', '')}, "
            f"fixed carbon content by weight percentage is: {row.get('Fixed carbon content', '')}, "
            f"volatile matter content by weight percentage is: {row.get('Volatile matter content', '')}, "
            f"carbon content by weight percentage is: {row.get('Carbon content', '')}, "
            f"hydrogen content by weight percentage is: {row.get('Hydrogen content', '')}, "
            f"nitrogen content by weight percentage is: {row.get('Nitrogen content', '')}, "
            f"oxygen content by weight percentage is: {row.get('Oxygen content', '')}, "
            f"sulfur content by weight percentage is: {row.get('Sulfur content', '')}, "
            f"kalium content by weight percentage is: {row.get('Kalium content', '')}, "
            f"calcium content by weight percentage is: {row.get('Calcium content', '')}, "
            f"sodium content by weight percentage is: {row.get('Natrium content', '')}, "
            f"magnesium content by weight percentage is: {row.get('Magnesium content', '')}, "
            f"ferrum content by weight percentage is: {row.get('Ferrum content', '')}, "
            f"silicon content by weight percentage is: {row.get('Silicon content', '')}, "
        )
        # Pyrolysis process conditions.
        process_text = (
            f"the maximum treatment temperature in the pyrolysis experiment is: {row.get('Highest treatment temperature', '')}℃, "
            f"heating rate is: {row.get('Heating rate', '')}℃/min, "
            f"holding time is: {row.get('Residence time', '')} min."
        )
        # The request for the target properties is kept out of the
        # instruction here and stored in the record's "input" field.
        instruction = feedstock_text + composition_text + process_text
        prediction_statement = generate_prediction_statement(file_name)

        # The four element contents are reported together as one clause.
        element_clauses = [
            f"carbon content is {row.get('Carbon content of the product', '')}",
            f"hydrogen content is {row.get('Hydrogen content of the product', '')}",
            f"nitrogen content is {row.get('Nitrogen content of products', '')}",
            f"oxygen content is {row.get('Oxygen content of products', '')}"
        ]
        *first_elements, last_element = element_clauses

        # (token looked up in the file name, clause emitted when it occurs);
        # the table order fixes the order of clauses in the answer.
        target_clauses = [
            ("yield", f"yield by weight percentage is {row.get('Biochar yield', '')}"),
            ("specific surface area", f"specific surface area is {row.get('specific surface area', '')} m²/g"),
            ("Ash", f"ash content by weight percentage is {row.get('Ash content of the product', '')}"),
            ("CHNO", ", ".join(first_elements) + " and " + last_element),
            ("pH", f"pH is {row.get('pH', '')}"),
            ("grain size", f"grain size is {row.get('grain size', '')} mm"),
        ]
        output_parts = [clause for token, clause in target_clauses if token in file_name]

        if not output_parts:
            output = "The prepared biochar has no relevant information."
        elif len(output_parts) == 1:
            output = f"The prepared biochar {output_parts[0]}."
        else:
            output = f"The prepared biochar has {', '.join(output_parts[:-1])} and {output_parts[-1]}."

        records.append({
            "instruction": instruction,
            "input": prediction_statement,
            "output": output
        })
    return records

# Read each file, convert its rows to records and split them 70/30 into the
# training and validation sets. The split is applied per file, after
# shuffling that file's records.
#
# A dedicated generator with a fixed seed makes the shuffle — and therefore
# which record lands in which set — identical on every run, so the written
# JSON files are reproducible. Using a private generator also leaves the
# global `random` state untouched.
split_rng = random.Random(42)
for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
            data_entries = create_json_data(df, file_name)
            split_rng.shuffle(data_entries)  # Reproducible in-place shuffle
            # First 70% (rounded down) go to training, the rest to validation.
            split_index = int(0.7 * len(data_entries))
            train_data.extend(data_entries[:split_index])
            validation_data.extend(data_entries[split_index:])
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error reading {file_name}: {e}")
    else:
        print(f"{file_name} does not exist in the directory.")

# Write the training data to a JSON file (non-ASCII characters such as
# "℃" and "m²" are kept as-is rather than escaped).
train_json_file_path = os.path.join(data_dir, 'train_data2222.json')
with open(train_json_file_path, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

# Write the validation data to a JSON file
validation_json_file_path = os.path.join(data_dir, 'validation_data2222.json')
with open(validation_json_file_path, 'w', encoding='utf-8') as f:
    json.dump(validation_data, f, ensure_ascii=False, indent=4)

# Print paths of the created JSON files
print(f"Training data JSON file has been generated at: {train_json_file_path}")
print(f"Validation data JSON file has been generated at: {validation_json_file_path}")

Training data JSON file has been generated at: 训练集+验证集分类\train_data2222.json
Validation data JSON file has been generated at: 训练集+验证集分类\validation_data2222.json


测试集 JSON 格式文件准备

In [9]:
import pandas as pd
import json
import os

# CSV files to convert. Each name lists the biochar properties that the
# file provides as prediction targets, joined by " + ".
file_names = [
    "Ash + CHNO.csv",
    "Ash.csv",
    "CHNO.csv",
    "grain size.csv",
    "pH.csv",
    "specific surface area + Ash + CHNO + pH + grain size.csv",
    "specific surface area + pH + grain size.csv",
    "specific surface area.csv",
    "yield + Ash + CHNO.csv",
    "yield + specific surface area + Ash + CHNO + pH + grain size.csv",
    "yield + specific surface area + pH + grain size.csv",
    "yield.csv",
]

# Directory holding the CSV files above; the generated JSON file is
# written to the same directory. Adjust to your dataset location.
data_dir = "测试集分类"

# Accumulator for the records of every file.
test_data = []

def generate_prediction_statement(file_name):
    """Build the sentence asking for the properties named in ``file_name``.

    Args:
        file_name: Name of a dataset file. Every known property token that
            occurs in it as a (case-sensitive) substring is requested.

    Returns:
        A sentence listing the requested properties, or a generic request
        for "properties" when no known token occurs in ``file_name``.
    """
    # (token searched for in the file name, wording used in the sentence).
    # The order of this table fixes the order of properties in the sentence.
    token_to_phrase = (
        ("yield", "yield"),
        ("specific surface area", "specific surface area"),
        ("Ash", "ash content"),
        ("CHNO", "chemical composition"),
        ("pH", "pH"),
        ("grain size", "grain size"),
    )
    requested = [phrase for token, phrase in token_to_phrase if token in file_name]

    if not requested:
        return "Based on the above information, infer the biochar's properties."

    if len(requested) == 1:
        properties_list = requested[0]
    else:
        # "a, b and c": commas between all but the last item.
        *leading, final = requested
        properties_list = ", ".join(leading) + " and " + final
    return f"Based on the above information, infer the biochar's {properties_list}."

def create_json_data(df, file_name):
    """Turn each row of ``df`` into one instruction/input/output record.

    For every row, the feedstock description, raw-material composition and
    pyrolysis conditions are rendered into one English sentence, followed by
    the request naming the properties to infer; together they are stored
    under "instruction". "input" is always an empty string. The product
    properties are rendered into the answer stored under "output". Which
    properties appear in the answer is decided by substring checks on
    ``file_name``.

    Args:
        df: Table whose rows are read with ``row.get(column, '')``, so a
            column that is absent contributes an empty string.
        file_name: Name of the source file; its text selects the target
            properties ("yield", "specific surface area", "Ash", "CHNO",
            "pH", "grain size").

    Returns:
        A list with one dict per row, each holding the keys "instruction",
        "input" and "output".
    """
    # NOTE(review): a cell that is present but empty is presumably loaded by
    # pandas as NaN and would then appear as the text "nan" in the sentences
    # below — confirm that this is acceptable for the generated dataset.
    records = []
    for _, row in df.iterrows():
        # Where the feedstock came from and how it was handled.
        feedstock_text = (
            f"The biomass resource used here is: {row.get('Biomass resources', '')}, sourced from: {row.get('Raw material sources', '')}, "
            f"pre-treatment method is: {row.get('Pre-processing methods', '')}, preparation equipment used is: {row.get('Preparation equipment', '')}, "
            f"other treatments include: {row.get('Other processing', '')}, modification related information: {row.get('modified', '')}, "
        )
        # Composition of the raw material.
        composition_text = (
            f"the cellulose content by weight percentage in the raw material is: {row.get('Cellulose content', '')}, "
            f"hemicellulose content by weight percentage is: {row.get('Hemicellulose content', '')}, "
            f"lignin content by weight percentage is: {row.get('Lignin content', '')}, "
            f"ash content by weight percentage is: {row.get('Ash content', '')}, "
            f"fixed carbon content by weight percentage is: {row.get('Fixed carbon content', '')}, "
            f"volatile matter content by weight percentage is: {row.get('Volatile matter content', '')}, "
            f"carbon content by weight percentage is: {row.get('Carbon content', '')}, "
            f"hydrogen content by weight percentage is: {row.get('Hydrogen content', '')}, "
            f"nitrogen content by weight percentage is: {row.get('Nitrogen content', '')}, "
            f"oxygen content by weight percentage is: {row.get('Oxygen content', '')}, "
            f"sulfur content by weight percentage is: {row.get('Sulfur content', '')}, "
            f"kalium content by weight percentage is: {row.get('Kalium content', '')}, "
            f"calcium content by weight percentage is: {row.get('Calcium content', '')}, "
            f"sodium content by weight percentage is: {row.get('Natrium content', '')}, "
            f"magnesium content by weight percentage is: {row.get('Magnesium content', '')}, "
            f"ferrum content by weight percentage is: {row.get('Ferrum content', '')}, "
            f"silicon content by weight percentage is: {row.get('Silicon content', '')}, "
        )
        # Pyrolysis process conditions.
        process_text = (
            f"the maximum treatment temperature in the pyrolysis experiment is: {row.get('Highest treatment temperature', '')}℃, "
            f"heating rate is: {row.get('Heating rate', '')}℃/min, "
            f"holding time is: {row.get('Residence time', '')} min."
        )
        # Here the request for the target properties is appended to the
        # instruction itself and "input" is left empty.
        # NOTE(review): the train/validation cell of this notebook stores the
        # request under "input" instead — confirm the difference is intended.
        prediction_statement = generate_prediction_statement(file_name)
        instruction = feedstock_text + composition_text + process_text + " " + prediction_statement

        # The four element contents are reported together as one clause.
        element_clauses = [
            f"carbon content is {row.get('Carbon content of the product', '')}",
            f"hydrogen content is {row.get('Hydrogen content of the product', '')}",
            f"nitrogen content is {row.get('Nitrogen content of products', '')}",
            f"oxygen content is {row.get('Oxygen content of products', '')}"
        ]
        *first_elements, last_element = element_clauses

        # (token looked up in the file name, clause emitted when it occurs);
        # the table order fixes the order of clauses in the answer.
        target_clauses = [
            ("yield", f"yield by weight percentage is {row.get('Biochar yield', '')}"),
            ("specific surface area", f"specific surface area is {row.get('specific surface area', '')} m²/g"),
            ("Ash", f"ash content by weight percentage is {row.get('Ash content of the product', '')}"),
            ("CHNO", ", ".join(first_elements) + " and " + last_element),
            ("pH", f"pH is {row.get('pH', '')}"),
            ("grain size", f"grain size is {row.get('grain size', '')} mm"),
        ]
        output_parts = [clause for token, clause in target_clauses if token in file_name]

        if not output_parts:
            output = "The prepared biochar has no relevant information."
        elif len(output_parts) == 1:
            output = f"The prepared biochar {output_parts[0]}."
        else:
            output = f"The prepared biochar has {', '.join(output_parts[:-1])} and {output_parts[-1]}."

        records.append({
            "instruction": instruction,
            "input": "",
            "output": output
        })
    return records

# Convert every listed CSV file and collect all records in one test set.
for file_name in file_names:
    file_path = os.path.join(data_dir, file_name)

    # Missing files are reported and skipped.
    if not os.path.exists(file_path):
        print(f"{file_name} does not exist in the directory.")
        continue

    try:
        df = pd.read_csv(file_path)
        data_entries = create_json_data(df, file_name)
        test_data.extend(data_entries)  # No split: every record is test data
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error reading {file_name}: {e}")

# Dump the whole test set as one JSON file next to the CSV files
# (non-ASCII characters are written as-is rather than escaped).
test_json_file_path = os.path.join(data_dir, 'test_data.json')
with open(test_json_file_path, 'w', encoding='utf-8') as f:
    json.dump(test_data, f, ensure_ascii=False, indent=4)

# Report where the file was written.
print(f"Test data JSON file has been generated at: {test_json_file_path}")

Test data JSON file has been generated at: 测试集分类\test_data.json
