In [26]:
!pip install pandas



**Generalized behavior and business rules training data**

In [44]:
import pandas as pd
import json
import random
from typing import List, Dict

def load_keywords() -> pd.DataFrame:
    """Load keywords from CSV file"""
    return pd.read_csv('../data/checkpoints/keywords_sales.csv')

def generate_company_info_data() -> List[Dict]:
    """Generate training data for company information"""
    company_responses = [
        "We are Jiangmen Tengxin Motorcycle Technology Co., Ltd., established in 2014. We specialize in new energy motorcycles with over 30 patents and 17 trademarks. Our products are exported globally including Egypt, Pakistan, Mexico, and many other countries.",
        "Our company, based in Jiangmen, China, has been operating since 2014. We focus on research, development, manufacturing, and export of new energy motorcycles.",
        "We're a leading motorcycle manufacturer in Jiangmen, China, with comprehensive capabilities in R&D, production, and global exports of new energy motorcycles."
    ]
    
    company_queries = [
        "Can you tell me about your company?",
        "Who are you?",
        "What's your company background?",
        "Tell me about your business",
        "What does your company do?"
    ]
    
    training_data = []
    for response in company_responses:
        for query in company_queries:
            training_data.append([
                {"role": "system", "content": "Customer is asking about company information"},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    return training_data

def generate_quality_assurance_data() -> List[Dict]:
    """Generate training data for quality assurance"""
    quality_responses = [
        "We ensure quality through pre-production samples and final inspection before shipment.",
        "Our quality control process includes mandatory pre-production sampling and thorough final inspection.",
        "Quality is guaranteed through our two-step verification: pre-production samples and final inspection."
    ]
    
    quality_queries = [
        "How do you ensure quality?",
        "What's your quality control process?",
        "How can you guarantee product quality?",
        "What quality measures do you have?",
        "Tell me about your quality control"
    ]
    
    training_data = []
    for response in quality_responses:
        for query in quality_queries:
            training_data.append([
                {"role": "system", "content": "Customer is inquiring about quality assurance"},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    return training_data

def generate_product_inquiry_data(keywords_df: pd.DataFrame) -> List[Dict]:
    """Generate training data for product inquiries"""
    # Get first row which contains product inquiry keywords
    product_keywords = keywords_df.iloc[0,1:].values.flatten()
    product_keywords = [k for k in product_keywords if isinstance(k, str)]
    
    responses = [
        "Could you please specify which type of electric motorcycle or scooter you're interested in?",
        "I'd be happy to provide information about our electric motorcycles and scooters. Which model interests you?", 
        "We offer various electric motorcycle models. Could you tell me which specific type you're looking for?"
    ]
    
    training_data = []
    for keyword in product_keywords:
        for response in responses:
            training_data.append([
                {"role": "system", "content": f"Customer mentioned {keyword}"},
                {"role": "user", "content": ""},
                {"role": "assistant", "content": response}
            ])
    return training_data

def generate_sales_advantage_data() -> List[Dict]:
    """Generate training data for sales advantages"""
    advantage_responses = [
        "Our strength lies in our innovation with over 30 patents and 17 trademarks. We've successfully exported to multiple countries including Egypt, Pakistan, Mexico, and others, consistently receiving positive feedback.",
        "What sets us apart is our strong innovation portfolio and proven track record in international markets. Our products are trusted across multiple countries.",
        "We distinguish ourselves through our innovative technology (30+ patents) and global market presence, serving customers in various countries with consistently high satisfaction."
    ]
    
    advantage_queries = [
        "Why should I choose you?",
        "What makes your company special?",
        "Why are you better than other suppliers?",
        "What are your advantages?",
        "Why should I buy from you?"
    ]
    
    training_data = []
    for response in advantage_responses:
        for query in advantage_queries:
            training_data.append([
                {"role": "system", "content": "Customer is asking about company advantages"},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    return training_data

def generate_service_information_data() -> List[Dict]:
    """Generate training data for service information"""
    service_responses = [
        "We offer FOB delivery terms and accept multiple payment currencies including USD, EUR, JPY, CAD, AUD, GBP, and CNY. Our payment options include T/T, L/C, D/P, D/A, MoneyGram, and Credit Card.",
        "Our services include FOB shipping, multiple currency acceptance (USD, EUR, JPY, CAD, AUD, GBP, CNY), and various payment methods (T/T, L/C, D/P, D/A, MoneyGram, Credit Card).",
        "We provide comprehensive services including FOB delivery, multiple currency options, and flexible payment methods to suit your needs."
    ]
    
    service_queries = [
        "What services do you provide?",
        "Tell me about your services",
        "What payment methods do you accept?",
        "How can I pay?",
        "What are your shipping terms?"
    ]
    
    training_data = []
    for response in service_responses:
        for query in service_queries:
            training_data.append([
                {"role": "system", "content": "Customer is asking about services"},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    return training_data


# Load keywords
keywords_df = load_keywords()

# Generate all types of training data
training_data = []
training_data.extend(generate_company_info_data())
training_data.extend(generate_quality_assurance_data())
training_data.extend(generate_product_inquiry_data(keywords_df))
training_data.extend(generate_sales_advantage_data())
training_data.extend(generate_service_information_data())

# Multiply data to reach desired size
while len(training_data) < 2000:
    training_data.extend(training_data)

# Trim to exact size
training_data = training_data[:2000]

# Save to JSON
with open('../data/training/sales_bot_general_training_data.json', 'w', encoding='utf-8') as f:
        json.dump(training_data, f, ensure_ascii=False, indent=2)



**Banking and finance training data**

In [45]:
import json
import random
from typing import List, Dict

def generate_banking_data() -> List[Dict]:
    """Generate training data for banking information"""
    
    # Bank information templates
    bank_info_responses = [
        "Our banking details are:\nBank: China Guangfa Bank\nSWIFT: GDBKCN22XXX\nAccount: 9550880202940900457\nBeneficiary: Jiangmen Tengxin Motorcycle Technology Co., Ltd",
        "Please use the following banking information:\nBank: China Guangfa Bank (SWIFT: GDBKCN22XXX)\nAccount Number: 9550880202940900457\nBeneficiary: Jiangmen Tengxin Motorcycle Technology Co., Ltd",
        "For T/T transfer, please use:\nBeneficiary's Bank: China Guangfa Bank\nSWIFT Code: GDBKCN22XXX\nAccount: 9550880202940900457\nCompany Name: Jiangmen Tengxin Motorcycle Technology Co., Ltd"
    ]
    
    # Various ways customers might ask for banking information
    bank_info_queries = [
        "What's your bank account information?",
        "Can you provide your banking details?",
        "I need your bank details for payment",
        "Please share your bank account details",
        "What's your bank account for wire transfer?",
        "Could you send me your banking information?",
        "Need your bank details for T/T payment",
        "What's your T/T information?",
        "Please provide wire transfer details",
        "Can you share your bank transfer information?"
    ]
    
    # System prompts for context
    system_prompts = [
        "Customer is requesting banking information for payment",
        "Customer needs bank details for wire transfer",
        "Customer is asking about T/T payment details",
        "Customer requires banking information to process payment",
        "Customer is preparing to make a wire transfer"
    ]
    
    # Generate training data with variations
    training_data = []
    
    for response in bank_info_responses:
        for query in bank_info_queries:
            training_data.append([
                {"role": "system", "content": random.choice(system_prompts)},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    
    # Generate responses for specific banking detail inquiries
    specific_queries = {
        "What's your SWIFT code?": [
            "Our SWIFT code is GDBKCN22XXX",
            "The SWIFT code for our bank is GDBKCN22XXX",
            "GDBKCN22XXX is our bank's SWIFT code"
        ],
        "Which bank do you use?": [
            "We use China Guangfa Bank for international transactions",
            "Our bank is China Guangfa Bank",
            "All our international payments are processed through China Guangfa Bank"
        ],
        "What's your account number?": [
            "Our account number is 9550880202940900457",
            "Please use account number 9550880202940900457 for the transfer",
            "The account number for transfers is 9550880202940900457"
        ],
        "What's the bank's address?": [
            "Our bank's address is: 8th Floor, No.713 Dong Feng East Road, Guangzhou, China",
            "China Guangfa Bank is located at: 8th Floor, No.713 Dong Feng East Road, Guangzhou, China",
            "The bank address is: 8th Floor, No.713 Dong Feng East Road, Guangzhou, China"
        ]
    }
    
    # Add specific query training data
    for query, responses in specific_queries.items():
        for response in responses:
            training_data.append([
                {"role": "system", "content": "Customer is asking for specific banking details"},
                {"role": "user", "content": query},
                {"role": "assistant", "content": response}
            ])
    
    # Generate confirmation and verification responses
    verification_scenarios = [
        {
            "user": "Is this the correct SWIFT code: GDBKCN22XXX?",
            "response": "Yes, that's correct. Our SWIFT code is GDBKCN22XXX."
        },
        {
            "user": "Can you confirm your account number ends in 0457?",
            "response": "Yes, that's correct. Our full account number is 9550880202940900457."
        },
        {
            "user": "Are you with China Guangfa Bank?",
            "response": "Yes, we use China Guangfa Bank for all international transactions."
        }
    ]
    
    for scenario in verification_scenarios:
        training_data.append([
            {"role": "system", "content": "Customer is verifying banking information"},
            {"role": "user", "content": scenario["user"]},
            {"role": "assistant", "content": scenario["response"]}
        ])
    
    return training_data


# Generate banking training data
training_data = generate_banking_data()

# Multiply data to reach desired size while maintaining variety
while len(training_data) < 2000:
    # Add slight variations to existing data
    new_data = []
    for item in training_data:
        # Create variation with slightly modified system prompt
        variation = item.copy()
        variation[0]["content"] = variation[0]["content"].replace(
            "Customer", "Client").replace(
            "requesting", "asking for").replace(
            "needs", "requires")
        new_data.append(variation)
    
    training_data.extend(new_data)

# Trim to exact size
training_data = training_data[:2000]

# Save to JSON
with open('../data/training/banking_training_data.json', 'w', encoding='utf-8') as f:
    json.dump(training_data, f, ensure_ascii=False, indent=2)



**Motorcycle specs training data**

In [28]:
import pandas as pd
import json
import random

# Paths to the CSV files with motorcycle specs
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'

# List to store generated training examples
training_data = []

# Define instruction and input variations
instruction_variations = {
    "top_speed": [
        "How fast does the {model} go?",
        "What is the max speed of the {model}?",
        "Can you tell me the top speed of the {model}?",
        "What’s the highest speed the {model} can reach?",
        "How quick is the {model}?"
    ],
    "fuel_capacity": [
        "What’s the fuel tank size for the {model}?",
        "How big is the fuel tank on the {model}?",
        "How much fuel does the {model} hold?",
        "Tell me the fuel capacity of the {model}.",
        "How large is the fuel tank on the {model}?"
    ],
    "engine_info": [
        "What kind of engine does the {model} have?",
        "Can you tell me about the {model}'s engine?",
        "What engine type is used in the {model}?",
        "Give me details on the {model}'s engine.",
        "Tell me about the engine in the {model}."
    ],
    "key_features": [
        "Share some key features of the {model}.",
        "What are the main features of the {model}?",
        "Tell me the highlights of the {model}.",
        "What makes the {model} stand out?",
        "What are the best things about the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is interested in motorcycle performance.",
    "Customer wants to know the main features and performance.",
    "Customer asked about the motorcycle’s speed and handling.",
    "Customer is looking for an overview of the motorcycle.",
    "Customer requested details on fuel and engine specs."
]

# Load each CSV and create training examples
df = pd.read_csv(file)

for _, row in df.iterrows():
    # Extract key details for each motorcycle
    model = row.get('Model Name', 'Unknown Model')
    dimensions = row.get('Dimensions (LxWxH mm)', 'N/A')
    top_speed = row.get('Max Speed (km/h)', 'N/A')
    fuel_capacity = row.get('Fuel Tank Capacity (L)', 'N/A')
    engine_type = row.get('Engine', 'N/A')
    
    # 2. Top speed information
    if not pd.isna(top_speed):
        for instruction in instruction_variations["top_speed"]: 
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a top speed of {top_speed}{ ' km/h' if top_speed != 'Not Specified' else ''}"}
            ])
    
    # 3. Fuel capacity information
    if not pd.isna(fuel_capacity):
        for instruction in instruction_variations["fuel_capacity"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The fuel tank on the {model} holds {fuel_capacity}."}
            ])
    
    # 4. Engine information
    if not pd.isna(engine_type):
        for instruction in instruction_variations["engine_info"]:   
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} comes with a {engine_type} engine. "}
            ])

# Save generated training data to JSON
output_file = '../data/training/model_topspeed_fuel_engine_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

print(f"Training data saved to {output_file}")


Training data saved to ../data/training/model_topspeed_fuel_engine_training_data.json


In [29]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'
df = pd.read_csv(file)

# List to store generated training examples
training_data = []

# Define instruction and input variations for new features
instruction_variations = {
    "price": [
        "What is the price of the {model}?",
        "Tell me the price of the {model}.",
        "How much does the {model} cost?",
        "What’s the cost of the {model}?",
        "Can you give me the price of the {model}?"
    ],
    "brand": [
        "Which brand makes the {model}?",
        "What brand is the {model}?",
        "Tell me about the brand of the {model}.",
        "Who makes the {model}?",
        "What company produces the {model}?"
    ],
    "production_method": [
        "What production method is used for the {model}?",
        "Tell me about the production method of the {model}.",
        "How is the {model} made?",
        "What is the production process for the {model}?",
        "What’s the manufacturing method of the {model}?"
    ],
    "origin": [
        "Where is the {model} made?",
        "What is the origin of the {model}?",
        "Tell me where the {model} is produced.",
        "Where does the {model} come from?",
        "What’s the origin country of the {model}?"
    ],
    "available_colors": [
        "What colors are available for the {model}?",
        "Tell me about the available colors for the {model}.",
        "What color options do I have for the {model}?",
        "What colors can I choose from for the {model}?",
        "Are there different color choices for the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is looking for details about the price and brand.",
    "Customer wants to know the origin and production method.",
    "Customer is interested in the available colors of the motorcycle.",
    "Customer asked about the cost and brand of the {model}.",
    "Customer is curious about where the {model} is made."
]

# Load each row from the CSV and generate training examples
for _, row in df.iterrows():
    # Extract the key details
    model = row.get('Model Name', 'Unknown Model')
    price = row.get('Reference Price (Yuan)', 'Not Specified')
    brand = row.get('Brand', 'Not Specified')
    production_method = row.get('Production Method', 'Not Specified')
    origin = row.get('Origin', 'Not Specified')
    available_colors = row.get('Available Colors', 'Not Specified')

    # 1. Price information
    if price != 'Not Specified':    
        for instruction in instruction_variations["price"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"{ '' if price != 'Currently no quotation available' else 'The {model} costs {price} Yuan.'}"}
            ])
        
    # 2. Brand information
    if brand != 'Not Specified':
        for instruction in instruction_variations["brand"]: 
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} is made by {brand}."}
            ])
        
    # 3. Production Method information
    if production_method != 'Not Specified':
        for instruction in instruction_variations["production_method"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} is produced using {production_method} production."}
            ])
        
    # 4. Origin information
    if origin != 'Not Specified':
        for instruction in instruction_variations["origin"]:        
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has origins in {origin}."}
            ])
        
    # 5. Available colors information
    if available_colors != 'Not Specified':
        for instruction in instruction_variations["available_colors"]:  
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The available colors for the {model} are {available_colors}."}
            ])

# Save generated training data to JSON
output_file = '../data/training/price_brand_production_origin_colors_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path to access the saved file.


'../data/training/price_brand_production_origin_colors_training_data.json'

In [30]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'
df = pd.read_csv(file)

# List to store generated training examples
training_data = []

# Define instruction and input variations for new features
instruction_variations = {
    "max_horsepower": [
        "What is the max horsepower of the {model}?",
        "How much horsepower does the {model} have?",
        "Can you tell me about the horsepower of the {model}?",
        "What’s the maximum horsepower for the {model}?",
        "How powerful is the {model} in terms of horsepower?"
    ],
    "max_power_speed": [
        "What is the max power/speed of the {model}?",
        "How much power does the {model} have?",
        "Tell me about the power of the {model}.",
        "What’s the max power output of the {model}?",
        "How strong is the {model} in terms of power?"
    ],
    "transmission": [
        "What kind of transmission does the {model} have?",
        "Can you tell me about the transmission in the {model}?",
        "What type of transmission is used in the {model}?",
        "Tell me the transmission details for the {model}.",
        "What transmission system is used in the {model}?"
    ],
    "abs": [
        "Does the {model} have ABS?",
        "Is the {model} equipped with abs?",
        "Tell me if the {model} has ABS.",
        "Does the {model} feature abs?",
        "Is ABS available in the {model}?"
    ],
    "cbs": [
        "Does the {model} have cbs?",
        "Is the {model} equipped with CBS?",
        "Tell me about CBS in the {model}.",
        "Does the {model} feature CBS?",
        "Is cbs available for the {model}?"
    ],
    "fuel_consumption": [
        "What’s the official fuel consumption of the {model}?",
        "How much fuel does the {model} consume?",
        "Tell me the fuel efficiency of the {model}.",
        "What’s the official fuel consumption rate of the {model}?",
        "How fuel-efficient is the {model}?"
    ],
    "range": [
        "What’s the range of the {model}?",
        "How far can the {model} go on a full tank?",
        "Tell me the range of the {model}.",
        "How many kilometers can the {model} travel?",
        "What’s the maximum range of the {model}?"
    ],
    "seat_height": [
        "What’s the seat height of the {model}?",
        "How tall is the seat on the {model}?",
        "Tell me the seat height for the {model}.",
        "How high is the seat on the {model}?",
        "What’s the seating height of the {model}?"
    ],
    "curb_weight": [
        "What’s the curb weight of the {model}?",
        "How much does the {model} weigh?",
        "Tell me the curb weight for the {model}.",
        "How heavy is the {model}?",
        "What’s the total weight of the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is interested in the bike’s performance and power.",
    "Customer wants to know about the transmission and safety features.",
    "Customer asked about the fuel consumption and range.",
    "Customer is looking for details on the weight and seat height."
]

# Load each row from the CSV and generate training examples
for _, row in df.iterrows():
    # Extract key details
    model = row.get('Model Name', 'Unknown Model')
    max_horsepower = row.get('Max Horsepower (Ps)', 'Not Specified')
    max_power_speed = row.get('Max Power/Speed (kW/rpm)', 'Not Specified')
    transmission = row.get('Transmission', 'Not Specified')
    abs_system = row.get('ABS', 'Not Specified')
    cbs_system = row.get('CBS', 'Not Specified')
    fuel_consumption = row.get('Official Average Fuel Consumption (L/100km)', 'Not Specified')
    range_value = row.get('Range (km)', 'Not Specified')
    seat_height = row.get('Seat Height (mm)', 'Not Specified')
    curb_weight = row.get('Curb Weight (kg)', 'Not Specified')

    # 1. Max Horsepower information
    if max_horsepower != 'Not Specified' and max_horsepower != '-':
        for instruction in instruction_variations["max_horsepower"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a max horsepower of {max_horsepower}{' Ps' if max_horsepower != 'Not Specified' else ''}"}
            ])
        
    # 2. Max Power/Speed information
    if max_power_speed != 'Not Specified' and max_power_speed != '-':
        for instruction in instruction_variations["max_power_speed"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a max power of {max_power_speed}{' kW' if max_power_speed != 'Not Specified' else ''}"}
            ])
        
    # 3. Transmission information
    if transmission != 'Not Specified' and transmission != '-':
        for instruction in instruction_variations["transmission"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} comes with a {transmission}."}
            ])
        
    # 4. ABS information
    if abs_system != 'Not Specified' and abs_system != '-':
        for instruction in instruction_variations["abs"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} is equipped with {abs_system}."}
            ])
        
    # 5. CBS information
    if cbs_system != 'Not Specified' and cbs_system != '-':
        for instruction in instruction_variations["cbs"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} is equipped with {cbs_system}."}
            ])
        
    # 6. Fuel Consumption information
    if fuel_consumption != 'Not Specified' and fuel_consumption != '-':
        for instruction in instruction_variations["fuel_consumption"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The official average fuel consumption for the {model} is {fuel_consumption}{' L/100km' if fuel_consumption != 'Not Specified' else ''}"}
            ])
        
    # 7. Range information
    if range_value != 'Not Specified' and range_value != '-':
        for instruction in instruction_variations["range"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The range of the {model} on a full tank is {range_value}{' km' if range_value != 'Not Specified' else ''}"}
            ])
        
    # 8. Seat Height information
    if seat_height != 'Not Specified' and seat_height != '-':
        for instruction in instruction_variations["seat_height"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The seat height of the {model} is {seat_height}{' mm' if seat_height != 'Not Specified' else ''}"}
            ])
        
    # 9. Curb Weight information
    if curb_weight != 'Not Specified' and curb_weight != '-':
        for instruction in instruction_variations["curb_weight"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The curb weight of the {model} is {curb_weight}{' kg' if curb_weight != 'Not Specified' else ''}"}
            ])

# Save generated training data to JSON
output_file = '../data/training/horsepower_speed_transmission_fuel_height_weight_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path for downloading the generated training data


'../data/training/horsepower_speed_transmission_fuel_height_weight_training_data.json'

In [31]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
csv_path = '../data/checkpoints/full_exhibition_motorcycle_data.csv'
df = pd.read_csv(csv_path)

# List to store generated training examples
training_data = []

# Define instruction and input variations for the features
instruction_variations = {
    "motor_type": [
        "What type of motor does the {model} have?",
        "Tell me about the motor of the {model}.",
        "What kind of motor is used in the {model}?",
        "Is the {model} equipped with a brushless motor?",
        "What is the motor type in the {model}?"
    ],
    "motor_power": [
        "How powerful is the motor in the {model}?",
        "What is the motor power of the {model}?",
        "Tell me the motor power of the {model}.",
        "What’s the power of the motor in the {model}?",
        "How many horsepower does the motor of the {model} have?"
    ],
    "dimensions": [
        "What are the dimensions of the {model}?",
        "Can you tell me the dimensions of the {model}?",
        "What is the size of the {model}?",
        "Tell me the length, width, and height of the {model}.",
        "How big is the {model}?"
    ],
    "seat_height": [
        "What is the seat height of the {model}?",
        "How tall is the seat of the {model}?",
        "Tell me the seat height of the {model}.",
        "How high is the seat on the {model}?",
        "What’s the seat height of the {model}?"
    ],
    "wheelbase": [
        "What is the wheelbase of the {model}?",
        "How long is the wheelbase of the {model}?",
        "Tell me about the wheelbase of the {model}.",
        "How far apart are the wheels on the {model}?",
        "What’s the wheelbase of the {model}?"
    ],
    "tyre_size": [
        "What is the tyre size of the {model}?",
        "Tell me the front and rear tyre sizes of the {model}.",
        "What size tyres are used in the {model}?",
        "What’s the tyre size of the {model}?",
        "Tell me the tyre size for the {model}."
    ],
    "max_speed": [
        "How fast can the {model} go?",
        "What’s the top speed of the {model}?",
        "How fast does the {model} go?",
        "What’s the max speed of the {model}?",
        "What is the maximum speed of the {model}?"
    ],
    "battery_capacity": [
        "What is the battery capacity of the {model}?",
        "How big is the battery in the {model}?",
        "Tell me about the battery capacity of the {model}.",
        "What’s the battery capacity of the {model}?",
        "How much power does the battery of the {model} have?"
    ],
    "charging_time": [
        "How long does it take to charge the {model}?",
        "What’s the charging time for the {model}?",
        "How much time does it take to fully charge the {model}?",
        "Tell me the charging time for the {model}.",
        "What’s the charge time for the {model}?"
    ],
    "brake_system": [
        "What type of brake system does the {model} have?",
        "Tell me about the brakes on the {model}.",
        "What kind of braking system is used in the {model}?",
        "Does the {model} have ABS or CBS?",
        "What’s the brake system like in the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer asked for performance and safety details.",
    "Customer wants to know about the bike’s features.",
    "Customer is looking for specifications and handling details.",
    "Customer is interested in the design and power of the {model}."
]

# Generate training examples based on the features available in the dataset
for _, row in df.iterrows():
    # Extract key details
    model = row.get('Model Name', 'Unknown Model')
    motor_type = row.get('Motor Type', 'Not Specified')
    motor_power = row.get('Motor Power', 'Not Specified')
    dimensions = row.get('Dimensions (L×W×H)', 'Not Specified')
    seat_height = row.get('Seat Height', 'Not Specified')
    wheelbase = row.get('Wheelbase', 'Not Specified')
    tyre_size = row.get('Front & Rear Tyre Size', 'Not Specified')
    max_speed = row.get('Max Speed', 'Not Specified')
    battery_capacity = row.get('Battery Capacity', 'Not Specified')
    charging_time = row.get('Charging Time', 'Not Specified')
    brake_system = row.get('Brake System', 'Not Specified')
        
    # 3. Motor Power
    if motor_power != 'Not Specified':
        for instruction in instruction_variations["motor_power"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a motor power of {motor_power}."}
            ])
        
    # 4. Dimensions
    if dimensions != 'Not Specified':
        for instruction in instruction_variations["dimensions"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The dimensions of the {model} are {dimensions}."}
            ])
        
    # 5. Seat Height
    if seat_height != 'Not Specified':
        for instruction in instruction_variations["seat_height"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The seat height of the {model} is {seat_height}."}
            ])
        
    # 6. Wheelbase
    if wheelbase != 'Not Specified':
        for instruction in instruction_variations["wheelbase"]: 
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The wheelbase of the {model} is {wheelbase}."}
            ])
        
    # 7. Tyre Size
    if tyre_size != 'Not Specified':
        for instruction in instruction_variations["tyre_size"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The front and rear tyres of the {model} are {tyre_size}."}
            ])
        
    # 8. Max Speed
    if max_speed != 'Not Specified':
        for instruction in instruction_variations["max_speed"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The max speed of the {model} is {max_speed}."}
            ])
        
            # 9. Battery Capacity
    if battery_capacity != 'Not Specified':
        for instruction in instruction_variations["battery_capacity"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The battery capacity of the {model} is {battery_capacity}."}
            ])
        
    # 10. Charging Time
    if charging_time != 'Not Specified':
        for instruction in instruction_variations["charging_time"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The charging time for the {model} is {charging_time}."}
            ])
        
    # 11. Brake System
    if brake_system != 'Not Specified':
        for instruction in instruction_variations["brake_system"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} features a {brake_system} brake system."}
            ])

# Save generated training data to JSON
output_file = '../data/training/brochure_exhibition_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path for downloading the generated training data


'../data/training/brochure_exhibition_training_data.json'

In [32]:
# Generate training data for edge cases with missing/unavailable data
edge_case_training_data = []

# Common variations for "no data" responses
no_data_responses = [
    "I apologize, but I don't have any information about {aspect} for these models.",
    "Unfortunately, the {aspect} data is not available in our database.",
    "I cannot provide information about {aspect} as this data is currently missing.",
    "The {aspect} details are not available at the moment.",
    "I don't have access to reliable {aspect} information for these models."
]

# Common input variations
input_variations = [
    "I'm interested in luxury car models.",
    "Tell me about premium vehicles.",
    "I want to learn about high-end cars.",
    "What can you tell me about luxury automobiles?",
    "I'm researching premium car brands."
]

# Generate examples for different missing data scenarios
aspects = [
    "price", "engine capacity", "horsepower", "fuel consumption",
    "acceleration", "top speed", "dimensions", "weight",
    "production dates", "safety ratings", "emission levels"
]

for aspect in aspects:
    edge_case_training_data.append([
        {"role": "system", "content": random.choice(input_variations)}, 
        {"role": "user", "content": f"What is the {aspect} of the latest models?"}, 
        {"role": "assistant", "content": random.choice(no_data_responses).format(aspect=aspect)}
    ])
    
    edge_case_training_data.append([
        {"role": "system", "content": random.choice(input_variations)}, 
        {"role": "user", "content": f"Compare the {aspect} between different models."}, 
        {"role": "assistant", "content": random.choice(no_data_responses).format(aspect=aspect)}
    ])
    
    edge_case_training_data.append([
        {"role": "system", "content": random.choice(input_variations)}, 
        {"role": "user", "content": f"Which models have the highest {aspect}?"}, 
        {"role": "assistant", "content": random.choice(no_data_responses).format(aspect=aspect)}
    ])

# Save edge case training data to JSON
output_file = '../data/training/edge_cases_training_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(edge_case_training_data, f, indent=4, ensure_ascii=False)

output_file


'../data/training/edge_cases_training_data.json'

**Generate training data for our brochure CSVs**

In [33]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
file_path = '../data/formatted/brochure_motorcycle.csv'
df = pd.read_csv(file_path)

# Define instruction and input variations for the features
instruction_variations = {
    "dimensions": [
        "What are the dimensions of the {model}?",
        "How big is the {model}?",
        "What's the size of the {model}?",
        "Tell me the length, width, and height of the {model}.",
        "Can you provide the dimensions for the {model}?"
    ],
    "wheelbase": [
        "What's the wheelbase of the {model}?",
        "How long is the wheelbase on the {model}?",
        "Tell me about the wheelbase of the {model}.",
        "What's the distance between wheels on the {model}?",
        "What wheelbase does the {model} have?"
    ],
    "weight": [
        "How much does the {model} weigh?",
        "What's the net weight of the {model}?",
        "Tell me the weight of the {model}.",
        "What's the mass of the {model}?",
        "How heavy is the {model}?"
    ],
    "engine": [
        "What type of engine does the {model} have?",
        "Tell me about the {model}'s engine.",
        "What engine is in the {model}?",
        "What are the engine specs of the {model}?",
        "Can you describe the engine in the {model}?"
    ],
    "max_power": [
        "What's the maximum power of the {model}?",
        "How powerful is the {model}?",
        "What power output does the {model} have?",
        "Tell me about the power specs of the {model}.",
        "What's the power rating of the {model}?"
    ],
    "transmission": [
        "What type of transmission does the {model} use?",
        "How is power transmitted in the {model}?",
        "Tell me about the transmission system of the {model}.",
        "What's the drive system in the {model}?",
        "How does the {model} transfer power to the wheels?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is interested in technical specifications.",
    "Customer wants to know about the motorcycle's features.",
    "Customer asked about the bike's specifications.",
    "Customer is looking for detailed information.",
    "Customer requested technical details."
]

# Generate training data
training_data = []

for _, row in df.iterrows():
    model = row['Model']
    
    # Generate for each feature if data exists
    if pd.notna(row['Product Dimensions(mm)']):
        for instruction in instruction_variations["dimensions"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The dimensions of the {model} are {row['Product Dimensions(mm)']}."}
            ])
    
    if pd.notna(row['Wheel Base(mm)']):
        for instruction in instruction_variations["wheelbase"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The wheelbase of the {model} is {row['Wheel Base(mm)']}."}
            ])
    
    if pd.notna(row['Net Weight(kg)']):
        for instruction in instruction_variations["weight"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} weighs {row['Net Weight(kg)']} kg."}
            ])
    
    if pd.notna(row['Engine type']):
        for instruction in instruction_variations["engine"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a {row['Engine type']} engine."}
            ])
    
    if pd.notna(row['Max. Power(kw/r/min)']):
        for instruction in instruction_variations["max_power"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The maximum power of the {model} is {row['Max. Power(kw/r/min)']}."}
            ])
    
    if pd.notna(row['Transmission mode']):
        for instruction in instruction_variations["transmission"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} uses {row['Transmission mode']} transmission."}
            ])

# Save generated training data to JSON
output_file = '../data/training/brochure_motorcycle_training_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, indent=4, ensure_ascii=False)

print(f"Generated {len(training_data)} training examples")
print(f"Saved to: {output_file}")

Generated 55 training examples
Saved to: ../data/training/brochure_motorcycle_training_data.json


In [34]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
csv_path = '../data/formatted/brochure_exhibition.csv'
df = pd.read_csv(csv_path)

# List to store generated training examples
training_data = []

# Define instruction and input variations for the features
instruction_variations = {
    "motor_type": [
        "What type of motor does the {model} have?",
        "Tell me about the motor of the {model}.",
        "What kind of motor is used in the {model}?",
        "Is the {model} equipped with a brushless motor?",
        "What is the motor type in the {model}?"
    ],
    "motor_power": [
        "How powerful is the motor in the {model}?",
        "What is the motor power of the {model}?",
        "Tell me the motor power of the {model}.",
        "What's the power of the motor in the {model}?",
        "How many watts does the motor of the {model} have?"
    ],
    "dimensions": [
        "What are the dimensions of the {model}?",
        "Can you tell me the dimensions of the {model}?",
        "What is the size of the {model}?",
        "Tell me the length, width, and height of the {model}.",
        "How big is the {model}?"
    ],
    "seat_height": [
        "What is the seat height of the {model}?",
        "How tall is the seat of the {model}?",
        "Tell me the seat height of the {model}.",
        "How high is the seat on the {model}?",
        "What's the seat height of the {model}?"
    ],
    "wheelbase": [
        "What is the wheelbase of the {model}?",
        "How long is the wheelbase of the {model}?",
        "Tell me about the wheelbase of the {model}.",
        "How far apart are the wheels on the {model}?",
        "What's the wheelbase of the {model}?"
    ],
    "tyre_size": [
        "What is the tyre size of the {model}?",
        "Tell me the front and rear tyre sizes of the {model}.",
        "What size tyres are used in the {model}?",
        "What's the tyre size of the {model}?",
        "Tell me the tyre size for the {model}."
    ],
    "max_speed": [
        "How fast can the {model} go?",
        "What's the top speed of the {model}?",
        "How fast does the {model} go?",
        "What's the max speed of the {model}?",
        "What is the maximum speed of the {model}?"
    ],
    "battery_capacity": [
        "What is the battery capacity of the {model}?",
        "How big is the battery in the {model}?",
        "Tell me about the battery capacity of the {model}.",
        "What's the battery capacity of the {model}?",
        "How much power does the battery of the {model} have?"
    ],
    "charging_time": [
        "How long does it take to charge the {model}?",
        "What's the charging time for the {model}?",
        "How much time does it take to fully charge the {model}?",
        "Tell me the charging time for the {model}.",
        "What's the charge time for the {model}?"
    ],
    "brake_system": [
        "What type of brake system does the {model} have?",
        "Tell me about the brakes on the {model}.",
        "What kind of braking system is used in the {model}?",
        "Does the {model} have ABS or CBS?",
        "What's the brake system like in the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer asked for performance and safety details.",
    "Customer wants to know about the bike's features.",
    "Customer is looking for specifications and handling details.",
    "Customer is interested in the design and power of the {model}."
]

# Generate training examples based on the features available in the dataset
for _, row in df.iterrows():
    model = row['Model']
    
    # Motor Type
    if pd.notna(row['Motor Type']):
        for instruction in instruction_variations["motor_type"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} uses a {row['Motor Type']}."}
            ])
    
    # Motor Power
    if pd.notna(row['Motor Power']):
        for instruction in instruction_variations["motor_power"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} has a motor power of {row['Motor Power']}."}
            ])
    
    # Dimensions
    if pd.notna(row['Dimensions (L×W×H)']):
        for instruction in instruction_variations["dimensions"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The dimensions of the {model} are {row['Dimensions (L×W×H)']}."}
            ])
    
    # Seat Height
    if pd.notna(row['Seat Height']):
        for instruction in instruction_variations["seat_height"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The seat height of the {model} is {row['Seat Height']}."}
            ])
    
    # Wheelbase
    if pd.notna(row['Wheelbase']):
        for instruction in instruction_variations["wheelbase"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The wheelbase of the {model} is {row['Wheelbase']}."}
            ])
    
    # Tyre Size
    if pd.notna(row['Front & Rear Tyre Size']):
        for instruction in instruction_variations["tyre_size"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The front and rear tyres of the {model} are {row['Front & Rear Tyre Size']}."}
            ])
    
    # Max Speed
    if pd.notna(row['Max Speed']):
        for instruction in instruction_variations["max_speed"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The max speed of the {model} is {row['Max Speed']}."}
            ])
    
    # Battery Capacity
    if pd.notna(row['Battery Capacity']):
        for instruction in instruction_variations["battery_capacity"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The battery capacity of the {model} is {row['Battery Capacity']}."}
            ])
    
    # Charging Time
    if pd.notna(row['Charging Time']):
        for instruction in instruction_variations["charging_time"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The charging time for the {model} is {row['Charging Time']}."}
            ])
    
    # Brake System
    if pd.notna(row['Brake System']):
        for instruction in instruction_variations["brake_system"]:
            training_data.append([
                {"role": "system", "content": random.choice(input_variations)}, 
                {"role": "user", "content": instruction.format(model=model)}, 
                {"role": "assistant", "content": f"The {model} features a {row['Brake System']} brake system."}
            ])

# Save generated training data to JSON
output_file = '../data/training/brochure_exhibition_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file

'../data/training/brochure_exhibition_training_data.json'

**Combine training files**

In [41]:
import json
import glob

# Get all JSON files in the training data directory
json_files = [f for f in glob.glob('../data/training/*.json') if 'combined_training_data.json' not in f]

print("Files to process:", json_files)

# Initialize empty list to store all training data
combined_training_data = []

# Read and combine all JSON files
for file_path in json_files:
    print(f"\nProcessing: {file_path}")
    with open(file_path, 'r') as f:
        data = json.load(f)
        print(f"Data type: {type(data)}")
        print(f"First item format: {type(data[0]) if data else 'empty'}")
        if data:
            print(f"Sample item: {json.dumps(data[0], indent=2)}")
        
        # Only add arrays that contain role-based conversation format
        for item in data:
            if isinstance(item, list) and all(isinstance(msg, dict) and 'role' in msg for msg in item):
                combined_training_data.append({"messages": item})
            else:
                print(f"Skipping item with unexpected format in {file_path}")

print(f"\nTotal number of training examples: {len(combined_training_data)}")

# Save combined data
with open('../data/training_checkpoints/combined_training_data.json', 'w') as f:
    json.dump(combined_training_data, f, indent=4)

Files to process: ['../data/training/model_topspeed_fuel_engine_training_data.json', '../data/training/price_brand_production_origin_colors_training_data.json', '../data/training/edge_cases_training_data.json', '../data/training/horsepower_speed_transmission_fuel_height_weight_training_data.json', '../data/training/brochure_exhibition_training_data.json', '../data/training/sales_terms_training_data.json', '../data/training/brochure_motorcycle_training_data.json']

Processing: ../data/training/model_topspeed_fuel_engine_training_data.json
Data type: <class 'list'>
First item format: <class 'list'>
Sample item: [
  {
    "role": "system",
    "content": ""
  },
  {
    "role": "user",
    "content": "How fast does the \u949b\u6781 NEXY+/\u672a\u754c go?"
  },
  {
    "role": "assistant",
    "content": "The \u949b\u6781 NEXY+/\u672a\u754c has a top speed of 105.0 km/h"
  }
]

Processing: ../data/training/price_brand_production_origin_colors_training_data.json
Data type: <class 'list'>
Fi

**Combine into CSV**

In [36]:
import pandas as pd

# Convert the combined training data to a DataFrame
df = pd.DataFrame(combined_training_data)

# Create checkpoints directory if it doesn't exist
import os
if not os.path.exists('../data/training_checkpoints'):
    os.makedirs('../data/training_checkpoints')

# Save DataFrame to CSV in checkpoints directory
checkpoint_path = '../data/training_checkpoints/training_data.csv'
df.to_csv(checkpoint_path, index=False)

print(f"\nDataFrame saved to: {checkpoint_path}")
print("\nFirst few rows of the DataFrame:")
print(df.head())

print("\nDataFrame Info:")
print(df.info())




DataFrame saved to: ../data/training_checkpoints/training_data.csv

First few rows of the DataFrame:
                                                   0  \
0                  {'role': 'system', 'content': ''}   
1  {'role': 'system', 'content': 'Customer is loo...   
2                  {'role': 'system', 'content': ''}   
3  {'role': 'system', 'content': 'Customer is int...   
4  {'role': 'system', 'content': 'Customer is loo...   

                                                   1  \
0  {'role': 'user', 'content': 'How fast does the...   
1  {'role': 'user', 'content': 'What is the max s...   
2  {'role': 'user', 'content': 'Can you tell me t...   
3  {'role': 'user', 'content': 'What’s the highes...   
4  {'role': 'user', 'content': 'How quick is the ...   

                                                   2  
0  {'role': 'assistant', 'content': 'The 钛极 NEXY+...  
1  {'role': 'assistant', 'content': 'The 钛极 NEXY+...  
2  {'role': 'assistant', 'content': 'The 钛极 NEXY+...  
3  {