Available columns: ['Model Name', 'Reference Price (Yuan)', 'Brand', 'Model', 'Production Method', 'Origin', 'Engine', 'Max Horsepower (Ps)', 'Max Power/Speed (kW/rpm)', 'Transmission', 'ABS', 'CBS', 'Dimensions (LxWxH mm)', 'Seat Height (mm)', 'Curb Weight (kg)', 'Fuel Tank Capacity (L)', 'Max Speed (km/h)', 'Official Average Fuel Consumption (L/100km)', 'Range (km)', 'Available Colors', 'Max Horsepower (Ps)_is_imputed', 'Seat Height (mm)_is_imputed', 'Curb Weight (kg)_is_imputed', 'Fuel Tank Capacity (L)_is_imputed', 'Max Speed (km/h)_is_imputed']

Sample data:
{'Model Name': '钛极 NEXY+/未界', 'Reference Price (Yuan)': 'Currently no quotation available', 'Brand': 'titanium', 'Model': 'Pedal', 'Production Method': 'domestic', 'Origin': 'Not Specified', 'Engine': 'single cylinder four stroke water-cooled 150cc', 'Max Horsepower (Ps)': 15.8, 'Max Power/Speed (kW/rpm)': '11.6/8500', 'Transmission': 'Not Specified', 'ABS': 'CBS', 'CBS': 'Not specified', 'Dimensions (LxWxH mm)': '1970x770x115

In [12]:
import pandas as pd
import numpy as np
import json
import random
import os

# Part 1: Preprocessing (following notebook 2's pattern)
# Loads the second motorcycle sheet, removes near-empty columns and rows
# without a price, normalises the Brand/Model text, imputes missing values and
# writes the result to a checkpoint CSV.
df = pd.read_csv('../data/formatted/motorcycle2_specs.csv')

# Share of missing values per column, in percent. This is measured on the raw
# sheet, i.e. before any rows are removed further down.
nan_percentages = (df.isna().sum() / len(df)) * 100

# Drop columns whose missing share exceeds `threshold`. The comparison is made
# on fractions so that `threshold` is the value actually applied (it used to be
# shadowed by a hard-coded 90 on the percentage series).
threshold = 0.9  # 90% threshold
nan_fractions = df.isna().mean()
columns_to_drop = nan_fractions[nan_fractions > threshold].index
df = df.drop(columns=columns_to_drop)

# Remove rows where price is NaN
price_col = 'Reference Price (Yuan)'
initial_rows = len(df)
df = df.dropna(subset=[price_col])
rows_removed = initial_rows - len(df)

# Replace brand values shorter than 3 characters with 'Not Specified'.
# Missing brands count as length 0 and are therefore replaced as well.
df.loc[df['Brand'].fillna('').str.len() < 3, 'Brand'] = 'Not Specified'

# Capitalize model values
df['Model'] = df['Model'].str.capitalize()

# Column groups are captured before any flag columns are added, so the boolean
# flags created below never end up in either group.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Numeric columns with more than this percentage of missing values get an
# "<col>_is_imputed" flag and are filled with the column median. Numeric
# columns at or below the mark are left untouched and keep their NaNs.
flag_threshold_pct = 10
for col in numeric_columns:
    if nan_percentages[col] > flag_threshold_pct:
        df[f"{col}_is_imputed"] = df[col].isna()
        df[col] = df[col].fillna(df[col].median())

# Fill categorical columns with 'Not Specified'
for col in categorical_columns:
    df[col] = df[col].fillna('Not Specified')

# Save preprocessed data, creating the checkpoint directory on a fresh checkout
checkpoint_path = '../data/checkpoints/imputed_motorcycle_2_data.csv'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
df.to_csv(checkpoint_path, index=False)

# Part 2: Generate Training Data (following notebook 3's pattern)
# Accumulator for the generated {"instruction", "input", "output"} records.
training_data = list()

# Question templates grouped by topic (same wording as notebook 3). Every
# template carries a "{model}" placeholder that is filled in per motorcycle.
instruction_variations = dict(
    model_info=[
        "What can you tell me about the {model}?",
        "Give me information about the {model}.",
        "What are the specifications of the {model}?",
        "What are the main features of the {model}?",
        "Tell me about the {model}.",
    ],
    top_speed=[
        "What is the top speed of the {model}?",
        "How fast can the {model} go?",
        "Tell me the maximum speed of the {model}.",
        "What's the highest speed of the {model}?",
        "What speed can the {model} achieve?",
    ],
    fuel_capacity=[
        "What is the fuel capacity of the {model}?",
        "How big is the fuel tank on the {model}?",
        "Tell me about the fuel tank capacity of the {model}.",
        "How much fuel can the {model} hold?",
        "What's the fuel tank size of the {model}?",
    ],
    engine_info=[
        "What type of engine does the {model} have?",
        "Tell me about the engine in the {model}.",
        "What are the engine specifications of the {model}?",
        "What engine is in the {model}?",
        "Can you describe the engine of the {model}?",
    ],
)

# Optional context strings for the "input" field; the leading empty string is
# the standard case with no extra context.
input_variations = [
    "",
    "Customer is interested in motorcycle performance.",
    "Customer wants to know the main features and performance.",
    "Customer asked about the motorcycle's speed and handling.",
    "Customer is looking for an overview of the motorcycle.",
    "Customer requested details on fuel and engine specs.",
]

# Generate training examples
# A dedicated, seeded generator keeps the sampled phrasings reproducible across
# kernel restarts without touching the global `random` state.
rng = random.Random(42)


def _make_example(topic, model, output):
    """Build one training record for `topic` with a randomly chosen phrasing.

    The instruction template is drawn from instruction_variations[topic] and
    formatted with `model`; the input context is drawn from input_variations.
    """
    return {
        "instruction": rng.choice(instruction_variations[topic]).format(model=model),
        "input": rng.choice(input_variations),
        "output": output,
    }


for _, row in df.iterrows():
    model = row.get('Model Name', 'Unknown Model')
    dimensions = row.get('Dimensions (LxWxH mm)', 'N/A')
    top_speed = row.get('Max Speed (km/h)', 'N/A')
    fuel_capacity = row.get('Fuel Tank Capacity (L)', 'N/A')
    engine_type = row.get('Engine', 'N/A')

    # Generate examples following the same pattern as notebook 3.
    # Note: the overview example also requires a top speed, although its
    # answer only mentions the dimensions; that gating is kept as it was.
    if not any(pd.isna([dimensions, top_speed])):
        training_data.append(
            _make_example("model_info", model, f"The {model} measures {dimensions}.")
        )

    if not pd.isna(top_speed):
        # Only append the unit to real values, not to the placeholder text.
        speed_unit = ' km/h' if top_speed != 'Not Specified' else ''
        training_data.append(
            _make_example(
                "top_speed",
                model,
                # Ends with a period like the other three answers.
                f"The {model} has a top speed of {top_speed}{speed_unit}.",
            )
        )

    if not pd.isna(fuel_capacity):
        # The source column is 'Fuel Tank Capacity (L)', so the value is in
        # litres; state the unit the same way the top-speed answer does.
        fuel_unit = ' L' if fuel_capacity != 'Not Specified' else ''
        training_data.append(
            _make_example(
                "fuel_capacity",
                model,
                f"The fuel tank on the {model} holds {fuel_capacity}{fuel_unit}.",
            )
        )

    if not pd.isna(engine_type):
        training_data.append(
            _make_example(
                "engine_info",
                model,
                f"The {model} comes with a {engine_type} engine.",
            )
        )

# Save training data as JSON. The target directory is created if it is missing
# so the cell also works on a fresh checkout.
output_file = '../data/training/motorcycle_2_training_data.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# json.dump escapes non-ASCII characters by default; the explicit encoding just
# makes the written bytes independent of the platform's default encoding.
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(training_data, f, indent=4)

print("Preprocessing and training data generation complete. Files saved to:")
print("1. Preprocessed data: ../data/checkpoints/imputed_motorcycle_2_data.csv")
print(f"2. Training data: {output_file}")

# Convert the list of training records to a DataFrame and save it as CSV too.
# (pandas is already imported at the top of the cell.)
df_training = pd.DataFrame(training_data)

csv_output_file = '../data/training_checkpoints/motorcycle_2_training_data.csv'
os.makedirs(os.path.dirname(csv_output_file), exist_ok=True)
df_training.to_csv(csv_output_file, index=False)

print(f"3. Training data CSV: {csv_output_file}")


Preprocessing and training data generation complete. Files saved to:
1. Preprocessed data: ../data/checkpoints/imputed_motorcycle_2_data.csv
2. Training data: ../data/training/motorcycle_2_training_data.json
3. Training data CSV: ../data/training_checkpoints/motorcycle_2_training_data.csv
