In [1]:
!pip install pandas



In [2]:
import pandas as pd
import json
import random

# Load the new keywords CSV file
file_path = '../data/checkpoints/keywords_sales.csv'  # Replace with the actual path to your CSV file
keywords_df = pd.read_csv(file_path)

# Define multiple response templates and varied instructions for each category
response_templates = {
    "Payment Method": [
        "Could you tell me which payment method you would like to use? We support T/T, L/C, D/P, D/A, MoneyGram, and Credit Card.",
        "What payment option works best for you? We accept T/T, L/C, D/P, D/A, MoneyGram, and Credit Card.",
        "Please let me know your preferred payment method. We offer T/T, L/C, D/P, D/A, MoneyGram, and Credit Card.",
        "Could you share your preferred payment option? We can accept T/T, L/C, D/P, D/A, MoneyGram, and Credit Card.",
        "For payment, would T/T, L/C, D/P, D/A, MoneyGram, or Credit Card work best for you?",
        "To proceed with your order, which payment method suits you best? We have options like T/T, L/C, D/P, D/A, MoneyGram, and Credit Card."
    ],
    "Freight Forwarder": [
        "Could you provide your company and forwarder’s contact information to facilitate shipping?",
        "To ensure timely delivery, may we have your forwarder’s contact information?",
        "Please share the contact details for your freight forwarder to assist with coordination.",
        "To help arrange your shipment, could you provide your forwarder’s contact information?",
        "Could you confirm your forwarder’s details so we can coordinate shipping?",
        "For shipment, could you provide us with your forwarding agent’s contact details?"
    ],
    "Delivery Inquiry": [
        "Could you tell us your preferred delivery date or timing? We’ll align production and shipping accordingly.",
        "When would be an ideal time for delivery? This helps us plan our schedule.",
        "Please share the timing you have in mind for delivery, and we’ll coordinate to meet your needs.",
        "Could you specify when you’d like the items to be delivered? We’ll handle the arrangements.",
        "What timeline works best for you for delivery? Let us know so we can coordinate.",
        "For scheduling, could you let us know the delivery timing you have in mind?"
    ],
    "Specification Inquiry": [
        "The model’s dimensions are [dimensions]. Do you need additional specifications?",
        "This model is sized at [dimensions]. Let us know if you’d like further details.",
        "Would you like more information beyond the dimensions ([dimensions])?",
        "The size of this model is [dimensions]. Are there other specifications you need?",
        "Our model measures [dimensions]. Let us know if you’d like more details.",
        "The dimensions for this model are [dimensions]. Would you like further specifics?"
    ]
}

# Define even more varied instructions and inputs for each category
instruction_templates = {
    "Payment Method": [
        "Ask the customer about their payment method preferences if they mention payment.",
        "Respond with our supported payment methods if the customer asks about payment options.",
        "Guide the customer on available payment methods when they bring up payment.",
        "Mention payment options if the customer expresses interest in payment methods.",
        "Provide payment method details if the customer mentions paying or payment methods."
    ],
    "Freight Forwarder": [
        "Request forwarding contact information if the customer brings up shipping.",
        "Ask for the customer’s freight forwarder details when they inquire about shipping.",
        "When freight forwarding is mentioned, prompt the customer for forwarding details.",
        "Inquire about the freight forwarder if the customer discusses shipping needs.",
        "Request the forwarder’s contact info if the customer talks about arranging freight."
    ],
    "Delivery Inquiry": [
        "Ask for the delivery timing if the customer mentions delivery or timing.",
        "Request delivery preferences when the customer discusses timing or delivery.",
        "Inquire about preferred delivery date if the customer brings up timing.",
        "Ask for delivery details when the customer inquires about delivery timing.",
        "When timing is mentioned, ask for the customer’s expected delivery schedule."
    ],
    "Specification Inquiry": [
        "Provide the model’s dimensions when the customer asks about specifications.",
        "Respond with model size if the customer inquires about product specifications.",
        "Mention product dimensions when the customer requests specifications.",
        "When the customer asks about size, provide model dimensions and ask if they need more.",
        "Offer the dimensions of the model if specifications are requested."
    ]
}

# Generate JSON structure with maximum variety and careful exclusion of specific years
training_data = []

# Iterate through each row in the DataFrame to build diverse training examples
for _, row in keywords_df.iterrows():
    category = row.iloc[0]  # First column is the category
    keywords = ', '.join([str(keyword).strip() for keyword in row[1:] if pd.notna(keyword)])  # Combine all keyword columns
    
    # Retrieve multiple response templates and instruction variations for the current category
    responses = response_templates.get(category, ["Please provide additional details."])
    instructions = instruction_templates.get(category, ["Respond appropriately based on customer inquiry."])
    
    # Create multiple entries by pairing instruction variations with response variations
    for response in responses:
        for instruction in instructions:
            # Randomize inputs to add subtle variability in how the customer query is described
            input_variations = [
                f"The customer mentioned {keywords}.",
                f"The customer asked about {category.lower()}.",
                f"The customer inquired regarding {keywords}.",
                f"The customer brought up {category.lower()} options.",
                f"Keywords such as {keywords} were mentioned by the customer."
            ]
            input_text = random.choice(input_variations)

            training_data.append({
                "instruction": instruction,
                "input": input_text,
                "output": response.replace("[dimensions]", "1960×820×1220 mm")  # Replace placeholder if necessary
            })

# Save to JSON file
json_output_path = '../data/training/sales_terms_training_data.json'
with open(json_output_path, 'w') as f:
    json.dump(training_data, f, indent=4)

print("Training data has been saved to 'generalized_sales_bot_training_data.json'")


Training data has been saved to 'generalized_sales_bot_training_data.json'


In [5]:
import pandas as pd
import json
import random

# Paths to the CSV files with motorcycle specs
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'

# List to store generated training examples
training_data = []

# Define instruction and input variations
instruction_variations = {
    "model_info": [
        "Tell me about the {model}.",
        "Can you give me details about the {model}?",
        "What can you tell me about the {model}?",
        "I’d like to know more about the {model}.",
        "Give me an overview of the {model}."
    ],
    "top_speed": [
        "How fast does the {model} go?",
        "What is the max speed of the {model}?",
        "Can you tell me the top speed of the {model}?",
        "What’s the highest speed the {model} can reach?",
        "How quick is the {model}?"
    ],
    "fuel_capacity": [
        "What’s the fuel tank size for the {model}?",
        "How big is the fuel tank on the {model}?",
        "How much fuel does the {model} hold?",
        "Tell me the fuel capacity of the {model}.",
        "How large is the fuel tank on the {model}?"
    ],
    "engine_info": [
        "What kind of engine does the {model} have?",
        "Can you tell me about the {model}'s engine?",
        "What engine type is used in the {model}?",
        "Give me details on the {model}'s engine.",
        "Tell me about the engine in the {model}."
    ],
    "key_features": [
        "Share some key features of the {model}.",
        "What are the main features of the {model}?",
        "Tell me the highlights of the {model}.",
        "What makes the {model} stand out?",
        "What are the best things about the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is interested in motorcycle performance.",
    "Customer wants to know the main features and performance.",
    "Customer asked about the motorcycle’s speed and handling.",
    "Customer is looking for an overview of the motorcycle.",
    "Customer requested details on fuel and engine specs."
]

# Load each CSV and create training examples
df = pd.read_csv(file)

for _, row in df.iterrows():
    # Extract key details for each motorcycle
    model = row.get('Model Name', 'Unknown Model')
    dimensions = row.get('Dimensions (LxWxH mm)', 'N/A')
    top_speed = row.get('Max Speed (km/h)', 'N/A')
    fuel_capacity = row.get('Fuel Tank Capacity (L)', 'N/A')
    engine_type = row.get('Engine', 'N/A')
    
    # Generate examples with instruction and input variations
    # 1. Basic model information
    if not any(pd.isna([dimensions, top_speed])):
        training_data.append({
            "instruction": random.choice(instruction_variations["model_info"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} measures {dimensions}. It can go up to {top_speed} km/h."
        })
    
    # 2. Top speed information
    if not pd.isna(top_speed):
        training_data.append({
            "instruction": random.choice(instruction_variations["top_speed"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} can reach a top speed of {top_speed} km/h."
        })
    
    # 3. Fuel capacity information
    if not pd.isna(fuel_capacity):
        training_data.append({
            "instruction": random.choice(instruction_variations["fuel_capacity"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The fuel tank on the {model} holds {fuel_capacity} liters."
        })
    
    # 4. Engine information
    if not pd.isna(engine_type):
        training_data.append({
            "instruction": random.choice(instruction_variations["engine_info"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} comes with a {engine_type} engine. "
        })
    
    # 5. Key features information
    if not any(pd.isna([top_speed, dimensions])):
        training_data.append({
            "instruction": random.choice(instruction_variations["key_features"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} has a top speed of {top_speed} km/h. It's dimensions are {dimensions}."
        })

# Save generated training data to JSON
output_file = '../data/training/model_topspeed_fuel_engine_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

print(f"Training data saved to {output_file}")


Training data saved to ../data/training/model_topspeed_fuel_engine_training_data.json


In [6]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'
df = pd.read_csv(file)

# List to store generated training examples
training_data = []

# Define instruction and input variations for new features
instruction_variations = {
    "price": [
        "What is the price of the {model}?",
        "Tell me the price of the {model}.",
        "How much does the {model} cost?",
        "What’s the cost of the {model}?",
        "Can you give me the price of the {model}?"
    ],
    "brand": [
        "Which brand makes the {model}?",
        "What brand is the {model}?",
        "Tell me about the brand of the {model}.",
        "Who makes the {model}?",
        "What company produces the {model}?"
    ],
    "production_method": [
        "What production method is used for the {model}?",
        "Tell me about the production method of the {model}.",
        "How is the {model} made?",
        "What is the production process for the {model}?",
        "What’s the manufacturing method of the {model}?"
    ],
    "origin": [
        "Where is the {model} made?",
        "What is the origin of the {model}?",
        "Tell me where the {model} is produced.",
        "Where does the {model} come from?",
        "What’s the origin country of the {model}?"
    ],
    "available_colors": [
        "What colors are available for the {model}?",
        "Tell me about the available colors for the {model}.",
        "What color options do I have for the {model}?",
        "What colors can I choose from for the {model}?",
        "Are there different color choices for the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is looking for details about the price and brand.",
    "Customer wants to know the origin and production method.",
    "Customer is interested in the available colors of the motorcycle.",
    "Customer asked about the cost and brand of the {model}.",
    "Customer is curious about where the {model} is made."
]

# Load each row from the CSV and generate training examples
for _, row in df.iterrows():
    # Extract the key details
    model = row.get('Model Name', 'Unknown Model')
    price = row.get('Reference Price (Yuan)', 'Not Specified')
    brand = row.get('Brand', 'Not Specified')
    production_method = row.get('Production Method', 'Not Specified')
    origin = row.get('Origin', 'Not Specified')
    available_colors = row.get('Available Colors', 'Not Specified')

    # 1. Price information
    if price != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["price"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} costs {price} Yuan."
        })
        
    # 2. Brand information
    if brand != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["brand"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} is made by {brand}."
        })
        
    # 3. Production Method information
    if production_method != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["production_method"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} is produced using {production_method}."
        })
        
    # 4. Origin information
    if origin != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["origin"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} is made in {origin}."
        })
        
    # 5. Available colors information
    if available_colors != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["available_colors"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The available colors for the {model} are {available_colors}."
        })

# Save generated training data to JSON
output_file = '../data/training/price_brand_production_origin_colors_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path to access the saved file.


'../data/training/price_brand_production_origin_colors_training_data.json'

In [7]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
file = '../data/checkpoints/imputed_full_motorcycle_data.csv'
df = pd.read_csv(file)

# List to store generated training examples
training_data = []

# Define instruction and input variations for new features
instruction_variations = {
    "max_horsepower": [
        "What is the max horsepower of the {model}?",
        "How much horsepower does the {model} have?",
        "Can you tell me about the horsepower of the {model}?",
        "What’s the maximum horsepower for the {model}?",
        "How powerful is the {model} in terms of horsepower?"
    ],
    "max_power_speed": [
        "What is the max power/speed of the {model}?",
        "How much power does the {model} have?",
        "Tell me about the power of the {model}.",
        "What’s the max power output of the {model}?",
        "How strong is the {model} in terms of power?"
    ],
    "transmission": [
        "What kind of transmission does the {model} have?",
        "Can you tell me about the transmission in the {model}?",
        "What type of transmission is used in the {model}?",
        "Tell me the transmission details for the {model}.",
        "What transmission system is used in the {model}?"
    ],
    "abs": [
        "Does the {model} have ABS?",
        "Is the {model} equipped with ABS?",
        "Tell me if the {model} has ABS.",
        "Does the {model} feature ABS?",
        "Is ABS available in the {model}?"
    ],
    "cbs": [
        "Does the {model} have CBS?",
        "Is the {model} equipped with CBS?",
        "Tell me about CBS in the {model}.",
        "Does the {model} feature CBS?",
        "Is CBS available for the {model}?"
    ],
    "fuel_consumption": [
        "What’s the official fuel consumption of the {model}?",
        "How much fuel does the {model} consume per 100 km?",
        "Tell me the fuel efficiency of the {model}.",
        "What’s the official fuel consumption rate of the {model}?",
        "How fuel-efficient is the {model}?"
    ],
    "range": [
        "What’s the range of the {model}?",
        "How far can the {model} go on a full tank?",
        "Tell me the range of the {model}.",
        "How many kilometers can the {model} travel?",
        "What’s the maximum range of the {model}?"
    ],
    "seat_height": [
        "What’s the seat height of the {model}?",
        "How tall is the seat on the {model}?",
        "Tell me the seat height for the {model}.",
        "How high is the seat on the {model}?",
        "What’s the seating height of the {model}?"
    ],
    "curb_weight": [
        "What’s the curb weight of the {model}?",
        "How much does the {model} weigh?",
        "Tell me the curb weight for the {model}.",
        "How heavy is the {model}?",
        "What’s the total weight of the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer is interested in the bike’s performance and power.",
    "Customer wants to know about the transmission and safety features.",
    "Customer asked about the fuel consumption and range.",
    "Customer is looking for details on the weight and seat height."
]

# Load each row from the CSV and generate training examples
for _, row in df.iterrows():
    # Extract key details
    model = row.get('Model Name', 'Unknown Model')
    max_horsepower = row.get('Max Horsepower (Ps)', 'Not Specified')
    max_power_speed = row.get('Max Power/Speed (kW/rpm)', 'Not Specified')
    transmission = row.get('Transmission', 'Not Specified')
    abs_system = row.get('ABS', 'Not Specified')
    cbs_system = row.get('CBS', 'Not Specified')
    fuel_consumption = row.get('Official Average Fuel Consumption (L/100km)', 'Not Specified')
    range_value = row.get('Range (km)', 'Not Specified')
    seat_height = row.get('Seat Height (mm)', 'Not Specified')
    curb_weight = row.get('Curb Weight (kg)', 'Not Specified')

    # 1. Max Horsepower information
    if max_horsepower != 'Not Specified' and max_horsepower != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["max_horsepower"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} has a max horsepower of {max_horsepower} Ps."
        })
        
    # 2. Max Power/Speed information
    if max_power_speed != 'Not Specified' and max_power_speed != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["max_power_speed"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} has a max power of {max_power_speed} kW at {row.get('Max Power/Speed (kW/rpm)', 'N/A')} rpm."
        })
        
    # 3. Transmission information
    if transmission != 'Not Specified' and transmission != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["transmission"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} comes with a {transmission} transmission."
        })
        
    # 4. ABS information
    if abs_system != 'Not Specified' and abs_system != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["abs"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} is equipped with {abs_system}."
        })
        
    # 5. CBS information
    if cbs_system != 'Not Specified' and cbs_system != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["cbs"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} is equipped with {cbs_system}."
        })
        
    # 6. Fuel Consumption information
    if fuel_consumption != 'Not Specified' and fuel_consumption != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["fuel_consumption"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The official average fuel consumption for the {model} is {fuel_consumption} L/100km."
        })
        
    # 7. Range information
    if range_value != 'Not Specified' and range_value != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["range"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The range of the {model} on a full tank is {range_value} km."
        })
        
    # 8. Seat Height information
    if seat_height != 'Not Specified' and seat_height != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["seat_height"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The seat height of the {model} is {seat_height} mm."
        })
        
    # 9. Curb Weight information
    if curb_weight != 'Not Specified' and curb_weight != '-':
        training_data.append({
            "instruction": random.choice(instruction_variations["curb_weight"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The curb weight of the {model} is {curb_weight} kg."
        })

# Save generated training data to JSON
output_file = '../data/training/horsepower_speed_transmission_fuel_height_weight_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path for downloading the generated training data


'../data/training/horsepower_speed_transmission_fuel_height_weight_training_data.json'

In [8]:
import pandas as pd
import json
import random

# Load the CSV file with motorcycle data
csv_path = '../data/checkpoints/full_exhibition_motorcycle_data.csv'
df = pd.read_csv(csv_path)

# List to store generated training examples
training_data = []

# Define instruction and input variations for the features
instruction_variations = {
    "model": [
        "Tell me about the {model}.",
        "What is the {model}?",
        "Give me details on the {model}.",
        "Tell me about the model {model}.",
        "Can you describe the {model}?"
    ],
    "motor_type": [
        "What type of motor does the {model} have?",
        "Tell me about the motor of the {model}.",
        "What kind of motor is used in the {model}?",
        "Is the {model} equipped with a brushless motor?",
        "What is the motor type in the {model}?"
    ],
    "motor_power": [
        "How powerful is the motor in the {model}?",
        "What is the motor power of the {model}?",
        "Tell me the motor power of the {model}.",
        "What’s the power of the motor in the {model}?",
        "How many horsepower does the motor of the {model} have?"
    ],
    "dimensions": [
        "What are the dimensions of the {model}?",
        "Can you tell me the dimensions of the {model}?",
        "What is the size of the {model}?",
        "Tell me the length, width, and height of the {model}.",
        "How big is the {model}?"
    ],
    "seat_height": [
        "What is the seat height of the {model}?",
        "How tall is the seat of the {model}?",
        "Tell me the seat height of the {model}.",
        "How high is the seat on the {model}?",
        "What’s the seat height of the {model}?"
    ],
    "wheelbase": [
        "What is the wheelbase of the {model}?",
        "How long is the wheelbase of the {model}?",
        "Tell me about the wheelbase of the {model}.",
        "How far apart are the wheels on the {model}?",
        "What’s the wheelbase of the {model}?"
    ],
    "tyre_size": [
        "What is the tyre size of the {model}?",
        "Tell me the front and rear tyre sizes of the {model}.",
        "What size tyres are used in the {model}?",
        "What’s the tyre size of the {model}?",
        "Tell me the tyre size for the {model}."
    ],
    "max_speed": [
        "How fast can the {model} go?",
        "What’s the top speed of the {model}?",
        "How fast does the {model} go?",
        "What’s the max speed of the {model}?",
        "What is the maximum speed of the {model}?"
    ],
    "battery_capacity": [
        "What is the battery capacity of the {model}?",
        "How big is the battery in the {model}?",
        "Tell me about the battery capacity of the {model}.",
        "What’s the battery capacity of the {model}?",
        "How much power does the battery of the {model} have?"
    ],
    "charging_time": [
        "How long does it take to charge the {model}?",
        "What’s the charging time for the {model}?",
        "How much time does it take to fully charge the {model}?",
        "Tell me the charging time for the {model}.",
        "What’s the charge time for the {model}?"
    ],
    "brake_system": [
        "What type of brake system does the {model} have?",
        "Tell me about the brakes on the {model}.",
        "What kind of braking system is used in the {model}?",
        "Does the {model} have ABS or CBS?",
        "What’s the brake system like in the {model}?"
    ]
}

input_variations = [
    "",  # Empty input for standard cases
    "Customer asked for performance and safety details.",
    "Customer wants to know about the bike’s features.",
    "Customer is looking for specifications and handling details.",
    "Customer is interested in the design and power of the {model}."
]

# Generate training examples based on the features available in the dataset
for _, row in df.iterrows():
    # Extract key details
    model = row.get('Model Name', 'Unknown Model')
    motor_type = row.get('Motor Type', 'Not Specified')
    motor_power = row.get('Motor Power', 'Not Specified')
    dimensions = row.get('Dimensions (L×W×H)', 'Not Specified')
    seat_height = row.get('Seat Height', 'Not Specified')
    wheelbase = row.get('Wheelbase', 'Not Specified')
    tyre_size = row.get('Front & Rear Tyre Size', 'Not Specified')
    max_speed = row.get('Max Speed', 'Not Specified')
    battery_capacity = row.get('Battery Capacity', 'Not Specified')
    charging_time = row.get('Charging Time', 'Not Specified')
    brake_system = row.get('Brake System', 'Not Specified')

    # 1. Model Information
    training_data.append({
        "instruction": random.choice(instruction_variations["model"]).format(model=model),
        "input": random.choice(input_variations),
        "output": f"The {model} is a great option with great features."
    })
    
    # 2. Motor Type
    if motor_type != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["motor_type"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} has a {motor_type} motor."
        })
        
    # 3. Motor Power
    if motor_power != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["motor_power"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The {model} has a motor power of {motor_power}."
        })
        
    # 4. Dimensions
    if dimensions != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["dimensions"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The dimensions of the {model} are {dimensions}."
        })
        
    # 5. Seat Height
    if seat_height != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["seat_height"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The seat height of the {model} is {seat_height}."
        })
        
    # 6. Wheelbase
    if wheelbase != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["wheelbase"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The wheelbase of the {model} is {wheelbase}."
        })
        
    # 7. Tyre Size
    if tyre_size != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["tyre_size"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The front and rear tyres of the {model} are {tyre_size}."
        })
        
    # 8. Max Speed
    if max_speed != 'Not Specified':
        training_data.append({
            "instruction": random.choice(instruction_variations["max_speed"]).format(model=model),
            "input": random.choice(input_variations),
            "output": f"The max speed of the {model} is {max_speed}."
        })
        
            # 9. Battery Capacity
        if battery_capacity != 'Not Specified':
            training_data.append({
                "instruction": random.choice(instruction_variations["battery_capacity"]).format(model=model),
                "input": random.choice(input_variations),
                "output": f"The battery capacity of the {model} is {battery_capacity}."
            })
        
        # 10. Charging Time
        if charging_time != 'Not Specified':
            training_data.append({
                "instruction": random.choice(instruction_variations["charging_time"]).format(model=model),
                "input": random.choice(input_variations),
                "output": f"The charging time for the {model} is {charging_time}."
            })
        
        # 11. Brake System
        if brake_system != 'Not Specified':
            training_data.append({
                "instruction": random.choice(instruction_variations["brake_system"]).format(model=model),
                "input": random.choice(input_variations),
                "output": f"The {model} features a {brake_system} brake system."
            })

# Save generated training data to JSON
output_file = '../data/training/brochure_exhibition_training_data.json'
with open(output_file, 'w') as f:
    json.dump(training_data, f, indent=4)

output_file  # Return the file path for downloading the generated training data


'../data/training/brochure_exhibition_training_data.json'

In [13]:
import pandas as pd
import json
import random
from collections import defaultdict

# Load the CSV file with motorcycle data
csv_path = '../data/checkpoints/imputed_full_motorcycle_data.csv'
df = pd.read_csv(csv_path)

# List to store generated training examples
training_data = []

# Create price lookup dictionary
price_lookup = {}
for _, row in df.iterrows():
    if pd.notna(row.get('Reference Price (Yuan)')):
        try:
            price_lookup[row['Model Name']] = float(row['Reference Price (Yuan)'])
        except ValueError:
            continue

# Create lookup dictionaries for reverse searches
spec_to_models = defaultdict(list)
for _, row in df.iterrows():
    model = row['Model Name']
    
    # Build reverse lookup dictionaries for all relevant specifications
    specs_mapping = {
        'engine': 'Engine',
        'speed': 'Max Speed (km/h)',
        'power': 'Max Power/Speed (kW/rpm)',
        'fuel': 'Fuel Tank Capacity (L)',
        'weight': 'Curb Weight (kg)',
        'price': 'Reference Price (Yuan)'
    }
    
    for spec_key, column_name in specs_mapping.items():
        if pd.notna(row.get(column_name)):
            try:
                # Convert to float if possible for range comparisons
                value = float(str(row[column_name]).split()[0])  # Take first number if multiple values
                spec_to_models[f"{spec_key}"].append((model, value))
            except ValueError:
                # For non-numeric values like engine types
                spec_to_models[f"{spec_key}_{row[column_name]}"].append((model, row[column_name]))

def get_top_5_by_price(model_list):
    """Return the 5 most expensive models from the given list"""
    models_with_prices = [(model, val, price_lookup.get(model, 0)) 
                         for model, val in model_list]
    sorted_models = sorted(models_with_prices, key=lambda x: x[2], reverse=True)
    return [(model, val) for model, val, _ in sorted_models[:5]]

def get_models_in_range(spec_list, target_value, range_percent=0.15):
    """Get models within a percentage range of the target value"""
    min_val = target_value * (1 - range_percent)
    max_val = target_value * (1 + range_percent)
    models = [(model, val) for model, val in spec_list 
              if min_val <= val <= max_val]
    return get_top_5_by_price(models)

def get_models_above(spec_list, target_value):
    """Get models above the target value"""
    models = [(model, val) for model, val in spec_list 
              if val >= target_value]
    return get_top_5_by_price(models)

def get_models_below(spec_list, target_value):
    """Get models below the target value"""
    models = [(model, val) for model, val in spec_list 
              if val <= target_value]
    return get_top_5_by_price(models)

# Define instruction variations for reverse lookups with ranges
instruction_variations = {
    "find_by_engine": [
        "Which models have an engine like {value}?",
        "What motorcycles have similar engine size to {value}?",
        "Show me premium bikes with engine capacity near {value}.",
        "List luxury motorcycles with approximately {value} engines.",
        "Which high-end bikes have engines like {value}?"
    ],
    "find_by_speed": [
        "Which models can go {value}?",
        "What premium motorcycles have a top speed above {value}?",
        "Show me luxury bikes that can exceed {value}.",
        "List high-end models with speed capabilities over {value}.",
        "Which top-tier bikes can achieve speeds greater than {value}?"
    ],
    "find_by_power": [
        "Which models have power output like {value}?",
        "What premium motorcycles offer similar power to {value}?",
        "Show me luxury bikes with power ratings near {value}.",
        "List high-end models with power output close to {value}.",
        "Which top-tier bikes have power levels like {value}?"
    ],
    "find_by_price": [
        "Which models cost like {value}?",
        "What premium motorcycles are priced near {value}?",
        "Show me luxury bikes in the {value} range.",
        "List high-end models priced around {value}.",
        "Which top-tier bikes cost approximately {value}?"
    ],
    "find_by_fuel": [
        "Which models have fuel capacity like {value}?",
        "What premium motorcycles have similar tank size to {value}?",
        "Show me luxury bikes with fuel tanks near {value}.",
        "List high-end models with fuel capacity close to {value}.",
        "Which top-tier bikes have tank sizes like {value}?"
    ]
}

input_variations = [
    "Customer is looking for premium motorcycles within this specification range.",
    "Customer wants to compare high-end models with similar specifications.",
    "Customer needs luxury options around this specification.",
    "Customer is interested in top-tier models meeting these criteria.",
    "Customer wants to see the most expensive models in this category."
]

# Generate reverse lookup training data
for spec_type in ['engine', 'speed', 'power', 'price', 'fuel']:
    if spec_type in spec_to_models:
        models = spec_to_models[spec_type]
        
        # Get some representative values to generate examples
        values = sorted([val for _, val in models])
        sample_values = [
            values[int(len(values)*0.25)],  # 25th percentile
            values[int(len(values)*0.5)],   # median
            values[int(len(values)*0.75)]    # 75th percentile
        ]
        
        for target_value in sample_values:
            # Generate examples for different range types
            range_models = get_models_in_range(models, target_value)
            above_models = get_models_above(models, target_value)
            below_models = get_models_below(models, target_value)
            
            units = {'engine': 'cc', 'speed': 'km/h', 'power': 'kW', 
                    'price': 'Yuan', 'fuel': 'L'}
            unit = units[spec_type]
            
            # Add training examples for each type of range query
            if len(range_models) > 1:
                training_data.append({
                    "instruction": random.choice(instruction_variations[f"find_by_{spec_type}"]).format(
                        value=int(target_value)
                    ),
                    "input": random.choice(input_variations),
                    "output": f"Here are the top 5 models with {spec_type} around {target_value}{unit} "
                             f"(±15%): {', '.join([f'{model} ({val:.1f}{unit})' for model, val in range_models])}."
                })
            
            if len(above_models) > 1:
                training_data.append({
                    "instruction": f"Which premium models have {spec_type} higher than {target_value}{unit}?",
                    "input": random.choice(input_variations),
                    "output": f"Here are the top 5 models with {spec_type} above {target_value}{unit}: "
                             f"{', '.join([f'{model} ({val:.1f}{unit})' for model, val in above_models])}."
                })
            
            if len(below_models) > 1:
                training_data.append({
                    "instruction": f"Which luxury models have {spec_type} lower than {target_value}{unit}?",
                    "input": random.choice(input_variations),
                    "output": f"Here are the top 5 models with {spec_type} below {target_value}{unit}: "
                             f"{', '.join([f'{model} ({val:.1f}{unit})' for model, val in below_models])}."
                })

# Save generated training data to JSON
output_file = '../data/training/reverse_lookup_premium_top5_training_data.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, indent=4, ensure_ascii=False)

print(f"Generated {len(training_data)} training examples")
print(f"Saved to: {output_file}")

# Display a few random examples
print("\nExample training items:")
for item in random.sample(training_data, min(3, len(training_data))):
    print("\nInstruction:", item["instruction"])
    print("Input:", item["input"])
    print("Output:", item["output"])

Generated 27 training examples
Saved to: ../data/training/reverse_lookup_premium_top5_training_data.json

Example training items:

Instruction: List high-end models with speed capabilities over 95.
Input: Customer needs luxury options around this specification.
Output: Here are the top 5 models with speed around 95.0km/h (±15%): 轻骑 U俠一代 QM125-3XI (95.0km/h), 柯瓦勒 奥丁 Odin (95.0km/h), 古思特 公爵 GST125T-12A (95.0km/h), 柯瓦勒 提尔 Tyr (95.0km/h), 柯瓦勒 雷神 Thor (95.0km/h).

Instruction: What premium motorcycles are priced near 19980?
Input: Customer wants to compare high-end models with similar specifications.
Output: Here are the top 5 models with price around 19980.0Yuan (±15%): 波速尔 XT-1 (22800.0Yuan), 宝雕 BD 350 (22800.0Yuan), 宝雕 BD400RR (22800.0Yuan), 恒舰 Z300 (22800.0Yuan), 大力神 F7 (22800.0Yuan).

Instruction: Which luxury models have fuel lower than 4.5L?
Input: Customer is interested in top-tier models meeting these criteria.
Output: Here are the top 5 models with fuel below 4.5L: 轻骑 U俠一代 QM125-3

In [14]:
import json
import glob

# Get all JSON files in the training data directory
json_files = glob.glob('../data/training/*.json')

# Initialize empty list to store all training data
combined_training_data = []

# Read and combine data from each file
for file_path in json_files:
    with open(file_path, 'r') as f:
        data = json.load(f)
        combined_training_data.extend(data)

# Save combined data to a new JSON file
output_file = '../data/training/combined_training_data.json'
with open(output_file, 'w') as f:
    json.dump(combined_training_data, f, indent=4)

print(f"Combined {len(json_files)} files into {output_file}")
print(f"Total number of training examples: {len(combined_training_data)}")


Combined 7 files into ../data/training/combined_training_data.json
Total number of training examples: 72655


In [15]:
import pandas as pd

# Convert the combined training data to a DataFrame
df = pd.DataFrame(combined_training_data)

# Create checkpoints directory if it doesn't exist
import os
if not os.path.exists('../data/training_checkpoints'):
    os.makedirs('../data/training_checkpoints')

# Save DataFrame to CSV in checkpoints directory
checkpoint_path = '../data/training_checkpoints/training_data.csv'
df.to_csv(checkpoint_path, index=False)

print(f"\nDataFrame saved to: {checkpoint_path}")
print("\nFirst few rows of the DataFrame:")
print(df.head())

print("\nDataFrame Info:")
print(df.info())




DataFrame saved to: ../data/training_checkpoints/training_data.csv

First few rows of the DataFrame:
                                         instruction  \
0                 Tell me about the 钛极 BOHEMIA/波西米亚.   
1      What is the max speed of the 钛极 BOHEMIA/波西米亚?   
2   How big is the fuel tank on the 钛极 BOHEMIA/波西米亚?   
3  Can you tell me about the 钛极 BOHEMIA/波西米亚's en...   
4    Share some key features of the 钛极 BOHEMIA/波西米亚.   

                                               input  \
0  Customer is interested in motorcycle performance.   
1  Customer wants to know the main features and p...   
2                                                      
3  Customer requested details on fuel and engine ...   
4  Customer requested details on fuel and engine ...   

                                              output  
0  The 钛极 BOHEMIA/波西米亚 measures 1970x770x1150. It...  
1  The 钛极 BOHEMIA/波西米亚 can reach a top speed of 1...  
2  The fuel tank on the 钛极 BOHEMIA/波西米亚 holds 8.0...  
3  T