In [1]:
import pandas as pd
import numpy as np
import random

# 1. Data-generation rules. Every brand maps to its own list of models so
#    that the generated (Brand, Model) pairs look realistic.
# NOTE(review): "Mazada" looks like a misspelling of "Mazda". It is left
# unchanged because the value is written to the generated files -- confirm
# before renaming it.
brands = ["Toyota", "Mazada", "VW", "BMW", "Honda", "Volvo", "Ford", "Tesla", "GMC", "Nissan"]
models = {
    "Toyota": ["Corolla", "Camry"], "Mazada": ["CX5", "3"], "VW": ["Jetta", "Golf"],
    "BMW": ["X3", "320i"], "Honda": ["Civic", "Accord"], "Volvo": ["XC60", "S60"],
    "Ford": ["Focus", "Mustang"], "Tesla": ["Model 3", "Model Y"], "GMC": ["Sierra", "Yukon"],
    "Nissan": ["Sentra", "Altima"]
}
cats = ["A", "B"]

# Tunables for the generated dataset.
N_ROWS = 100        # number of rows to generate
MISSING_FRAC = 0.1  # fraction of rows blanked out in each column
SEED = None         # set to an int for a reproducible dataset; None = random

# Private generators so that seeding does not disturb the global `random` /
# `numpy.random` state that other code in the session may rely on.
_py_rng = random.Random(SEED)
_np_rng = np.random.default_rng(SEED)


def _random_car(rng):
    """Return one ``[brand, model, year, coe, cat, price]`` row drawn from *rng*.

    The model is always picked from the list belonging to the chosen brand.
    """
    brand = rng.choice(brands)
    return [
        brand,
        rng.choice(models[brand]),
        rng.randint(2000, 2023),      # Year (inclusive bounds)
        rng.randint(20000, 100000),   # COE
        rng.choice(cats),             # Cat
        rng.randint(40000, 150000),   # Price
    ]


# 2. Generate the base rows.
data = [_random_car(_py_rng) for _ in range(N_ROWS)]

df = pd.DataFrame(data, columns=["Brand", "Model", "Year", "COE", "Cat", "Price"])

# 3. Inject missing values: for every column, an independently sampled
#    MISSING_FRAC of the rows is set to NaN.
for col in df.columns:
    missing_idx = df.sample(frac=MISSING_FRAC, random_state=_np_rng).index
    # Build a new column with Series.mask instead of assigning NaN through
    # df.loc: writing NaN into an int64 column via setitem is an implicit
    # upcast that pandas has deprecated (PDEP-6) and will reject. The
    # non-inplace mask yields the same float64/NaN result without it.
    df[col] = df[col].mask(df.index.isin(missing_idx))

# 4. Save under the expected file names.
df.to_csv('Cars.csv', index=False)
df.to_json('Cars.json', orient='records')

print(f"成功生成 'Cars.csv' 和 'Cars.json' (包含 {N_ROWS} 行数据及随机缺失值)。")

# 5. Verify by reading the CSV back: show the first 5 rows and the number of
#    missing values per column.
df_check = pd.read_csv('Cars.csv')
print(df_check.head().to_string())
print("\n缺失值统计：")
print(df_check.isna().sum())

成功生成 'Cars.csv' 和 'Cars.json' (包含 100 行数据及随机缺失值)。
    Brand    Model    Year      COE  Cat     Price
0   Volvo     XC60  2018.0  34258.0    B  114587.0
1     BMW       X3  2018.0      NaN    A   46888.0
2    Ford  Mustang  2007.0  95463.0    A   45126.0
3  Toyota  Corolla  2010.0      NaN    A  120827.0
4     NaN    Camry  2009.0  52119.0  NaN   76092.0

缺失值统计：
Brand    10
Model    10
Year     10
COE      10
Cat      10
Price    10
dtype: int64
