In [None]:
import pandas as pd

In [None]:
# Load the raw listings CSV from an absolute path and show the frame as the
# cell output. NOTE(review): the parsing cell below reads "cars_kolesa.csv"
# relative to the working directory - confirm both refer to the same file.
data = pd.read_csv("/content/cars_kolesa.csv")
data

Unnamed: 0,name,price,price_raw,desc,link
0,Kia K5,7300000,7 300 000 ₸,"2015 г., Б/у седан, 2 л, газ, КПП автомат, с п...",https://kolesa.kz/a/show/206857918?search_id=6...
1,Lexus RX 300 F Sport Design,17750000,17 750 000 ₸,"2018 г., Б/у кроссовер, 2 л, бензин, КПП автом...",https://kolesa.kz/a/show/205587148?search_id=6...
2,Li L9 Ultra,31500000,31 500 000 ₸,"На заказ, 2025 г., новый кроссовер, 1.5 л, бен...",https://kolesa.kz/a/show/184418014?search_id=6...
3,Hyundai Bayon,8400000,8 400 000 ₸,"2023 г., Б/у кроссовер, 1.4 л, бензин, КПП авт...",https://kolesa.kz/a/show/198131286?search_id=6...
4,Mercedes-Benz GLC Coupe 300 4MATIC,60500000,60 500 000 ₸,"2025 г., новый кроссовер, 2 л, бензин, КПП авт...",https://kolesa.kz/a/show/203740036?search_id=6...
...,...,...,...,...,...
3994,Kia K5,6850000,6 850 000 ₸,"2010 г., Б/у седан, 1.8 л, бензин, КПП автомат...",https://kolesa.kz/a/show/206384703?fromUserSea...
3995,Hyundai Tucson,11500000,11 500 000 ₸,"2018 г., Б/у кроссовер, 2 л, бензин, КПП автом...",https://kolesa.kz/a/show/205513395?search_id=8...
3996,Kia K5,7800000,7 800 000 ₸,"2011 г., Б/у седан, 2 л, бензин, КПП автомат, ...",https://kolesa.kz/a/show/206384561?search_id=8...
3997,Kia Rio,5400000,5 400 000 ₸,"2013 г., Б/у седан, 1.6 л, бензин, КПП автомат...",https://kolesa.kz/a/show/206384373?fromUserSea...


2014 г., Б/у седан, 1.5 л, бензин, КПП механика, Комплектация LUX Гур и электрпакет 4 двери есть Зимние шины с хорошим...


In [None]:
import pandas as pd
import re

# Raw listings file read by main(); resolved relative to the working directory.
INPUT_CSV = "cars_kolesa.csv"
# Destination written by main(): the input rows plus the parsed columns.
OUTPUT_CSV = "cars_kolesa_parsed.csv"


# ---- multi-word brands ----
# Lowercase spellings; each is matched as a prefix of the lowercased listing
# name so that e.g. "Land Rover Discovery" is not split after "Land".
MULTI_WORD_BRANDS = [
    "mercedes-benz",
    "land rover",
    "alfa romeo",
    "rolls royce",
    "aston martin",
    "great wall",
    "dongfeng motor",
    "lynk & co",
]


def parse_name(name: str):
    """Split a listing title into ``(brand, model)``.

    Args:
        name: Listing title such as ``"Kia K5"``. Non-string values (for
            example NaN from a missing CSV cell) are accepted.

    Returns:
        A ``(brand, model)`` tuple. ``model`` is ``None`` when the title holds
        only a brand. Both are ``None`` when ``name`` is not a string or is
        empty / whitespace-only.
    """
    if not isinstance(name, str):
        return None, None

    text = name.strip()
    if not text:
        # "".split() is empty, so indexing the first word would raise IndexError.
        return None, None
    text_low = text.lower()

    # 1) multi-word brands: the brand must be followed by whitespace or the end
    #    of the title, so a longer word sharing the prefix is not cut in half.
    for brand in MULTI_WORD_BRANDS:
        if not text_low.startswith(brand):
            continue
        rest = text[len(brand):]
        if rest and not rest[0].isspace():
            continue
        model = rest.strip()
        # str.title() restores the capitalisation ("mercedes-benz" -> "Mercedes-Benz").
        return brand.title(), model or None

    # 2) common case: the first word is the brand, the remainder is the model.
    parts = text.split()
    brand = parts[0]
    model = " ".join(parts[1:])
    return brand, model or None


def parse_desc(desc: str) -> dict:
    """Extract structured fields from a free-text listing description.

    Matching is done on the lowercased text. Fuel, transmission and drive are
    plain substring checks evaluated in a fixed priority order (first hit wins).

    Args:
        desc: Description text such as
            ``"2015 г., Б/у седан, 2 л, газ, КПП автомат, ..."``.
            Non-string values (for example NaN) are accepted.

    Returns:
        A dict with keys ``condition``, ``year``, ``mileage_km``,
        ``engine_liters``, ``fuel``, ``transmission`` and ``drive``. Fields that
        cannot be found are ``None``; ``condition`` falls back to ``"unknown"``
        for string input. For non-string input every value is ``None``.
    """
    if not isinstance(desc, str):
        return {
            "condition": None,
            "year": None,
            "mileage_km": None,
            "engine_liters": None,
            "fuel": None,
            "transmission": None,
            "drive": None,
        }

    text = desc.lower()

    # ---- condition ----
    # Match whole tokens only: a bare substring test for "бу" / "нов" also hits
    # unrelated words such as "требует" or "установлен".
    condition = "unknown"
    if re.search(r"\bб[/.]?у\b", text):
        condition = "used"
    elif re.search(r"\bнов", text):
        condition = "new"

    # ---- year ----
    year = None
    m = re.search(r"(19\d{2}|20\d{2})\s*г", text)
    if m:
        year = int(m.group(1))

    # ---- mileage ----
    # The capture must start with a digit (a whitespace-only capture would make
    # int() fail), and "км/ч" is a speed, not a mileage.
    mileage = None
    m = re.search(r"(\d[\d\s]*)\s*км(?!\s*/\s*ч)", text)
    if m:
        # Drop every kind of whitespace, including non-breaking spaces used as
        # thousands separators.
        mileage = int(re.sub(r"\s+", "", m.group(1)))

    # ---- engine displacement ----
    # Accept whole litres ("2 л") and a decimal comma ("1,6 л"); skip
    # horsepower written as "л.с." / "л/с".
    engine = None
    m = re.search(r"(\d+(?:[.,]\d+)?)\s*л\b(?![./]\s*с\b)", text)
    if m:
        engine = float(m.group(1).replace(",", "."))

    # ---- fuel ----
    fuel = None
    for keyword, label in (
        ("бензин", "petrol"),
        ("дизел", "diesel"),
        ("газ", "gas"),
        ("гибрид", "hybrid"),
        ("электро", "electric"),
    ):
        if keyword in text:
            fuel = label
            break

    # ---- transmission ----
    transmission = None
    for keyword, label in (
        ("автомат", "automatic"),
        ("механ", "manual"),
        ("вариатор", "cvt"),
        ("робот", "robot"),
    ):
        if keyword in text:
            transmission = label
            break

    # ---- drive ----
    drive = None
    if "передний" in text:
        drive = "fwd"
    elif "задний" in text:
        drive = "rwd"
    elif "полный" in text or "4wd" in text:
        drive = "awd"

    return {
        "condition": condition,
        "year": year,
        "mileage_km": mileage,
        "engine_liters": engine,
        "fuel": fuel,
        "transmission": transmission,
        "drive": drive,
    }


def main():
    """Read INPUT_CSV, append the parsed name/description columns, write OUTPUT_CSV.

    Prints the output path and a ten-row preview of selected columns.
    """
    listings = pd.read_csv(INPUT_CSV)

    # Free-text description -> one column per key of the dict from parse_desc.
    desc_fields = listings["desc"].apply(parse_desc).apply(pd.Series)

    # Listing title -> (brand, model) tuple, expanded into two named columns.
    name_fields = listings["name"].apply(parse_name).apply(pd.Series)
    name_fields.columns = ["brand", "model"]

    # Column order: original columns, then brand/model, then description fields.
    enriched = pd.concat([listings, name_fields, desc_fields], axis=1)
    enriched.to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")

    print("Saved:", OUTPUT_CSV)
    preview_columns = ["name", "brand", "model", "condition", "year"]
    print(enriched[preview_columns].head(10))


if __name__ == "__main__":
    main()


Saved: cars_kolesa_parsed.csv
                                 name          brand                  model  \
0                              Kia K5            Kia                     K5   
1         Lexus RX 300 F Sport Design          Lexus  RX 300 F Sport Design   
2                         Li L9 Ultra             Li               L9 Ultra   
3                       Hyundai Bayon        Hyundai                  Bayon   
4  Mercedes-Benz GLC Coupe 300 4MATIC  Mercedes-Benz   GLC Coupe 300 4MATIC   
5                     Volkswagen Polo     Volkswagen                   Polo   
6                      Mitsubishi RVR     Mitsubishi                    RVR   
7                       Changan UNI-V        Changan                  UNI-V   
8                          ГАЗ ГАЗель            ГАЗ                 ГАЗель   
9           Deepal S07 Max ADS SE 215         Deepal     S07 Max ADS SE 215   

  condition  year  
0      used  2015  
1      used  2018  
2       new  2025  
3      used  2023  


In [None]:
# Reload the parsed CSV written by main() and show it as the cell output.
# Note: this rebinds `data`, replacing the raw frame loaded in the first cell.
data = pd.read_csv("cars_kolesa_parsed.csv")
data

Unnamed: 0,name,price,price_raw,desc,link,brand,model,condition,year,mileage_km,engine_liters,fuel,transmission,drive
0,Kia K5,7300000,7 300 000 ₸,"2015 г., Б/у седан, 2 л, газ, КПП автомат, с п...",https://kolesa.kz/a/show/206857918?search_id=6...,Kia,K5,used,2015,94000.0,,gas,automatic,
1,Lexus RX 300 F Sport Design,17750000,17 750 000 ₸,"2018 г., Б/у кроссовер, 2 л, бензин, КПП автом...",https://kolesa.kz/a/show/205587148?search_id=6...,Lexus,RX 300 F Sport Design,used,2018,148057.0,,petrol,automatic,
2,Li L9 Ultra,31500000,31 500 000 ₸,"На заказ, 2025 г., новый кроссовер, 1.5 л, бен...",https://kolesa.kz/a/show/184418014?search_id=6...,Li,L9 Ultra,new,2025,50.0,1.5,petrol,automatic,
3,Hyundai Bayon,8400000,8 400 000 ₸,"2023 г., Б/у кроссовер, 1.4 л, бензин, КПП авт...",https://kolesa.kz/a/show/198131286?search_id=6...,Hyundai,Bayon,used,2023,38212.0,1.4,petrol,automatic,
4,Mercedes-Benz GLC Coupe 300 4MATIC,60500000,60 500 000 ₸,"2025 г., новый кроссовер, 2 л, бензин, КПП авт...",https://kolesa.kz/a/show/203740036?search_id=6...,Mercedes-Benz,GLC Coupe 300 4MATIC,new,2025,,,petrol,automatic,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3994,Kia K5,6850000,6 850 000 ₸,"2010 г., Б/у седан, 1.8 л, бензин, КПП автомат...",https://kolesa.kz/a/show/206384703?fromUserSea...,Kia,K5,used,2010,301341.0,1.8,petrol,automatic,
3995,Hyundai Tucson,11500000,11 500 000 ₸,"2018 г., Б/у кроссовер, 2 л, бензин, КПП автом...",https://kolesa.kz/a/show/205513395?search_id=8...,Hyundai,Tucson,used,2018,136000.0,,petrol,automatic,
3996,Kia K5,7800000,7 800 000 ₸,"2011 г., Б/у седан, 2 л, бензин, КПП автомат, ...",https://kolesa.kz/a/show/206384561?search_id=8...,Kia,K5,used,2011,147592.0,,petrol,automatic,
3997,Kia Rio,5400000,5 400 000 ₸,"2013 г., Б/у седан, 1.6 л, бензин, КПП автомат...",https://kolesa.kz/a/show/206384373?fromUserSea...,Kia,Rio,used,2013,221640.0,1.6,petrol,automatic,


In [None]:

# Sanity check on the brand parsing: print the distinct values of `brand`
# so mis-split names stand out.
unique_values = data['brand'].unique()
print(unique_values)

['Kia' 'Lexus' 'Li' 'Hyundai' 'Mercedes-Benz' 'Volkswagen' 'Mitsubishi'
 'Changan' 'ГАЗ' 'Deepal' 'ВАЗ' 'Toyota' 'BMW' 'Opel' 'Porsche' 'Honda'
 'Skoda' 'Jeep' 'Land Rover' 'Nissan' 'Chevrolet' 'BYD' 'FAW' 'Audi'
 'Suzuki' 'Geely' 'Infiniti' 'Daewoo' 'Renault' 'Subaru' 'Great Wall'
 'Mazda' 'Volvo' 'SsangYong' 'EXEED' 'Fiat' 'Chery' 'Haval' 'Baojun'
 'Zeekr' 'Rox' 'Genesis' 'Voyah' 'Lynk' 'OMODA' 'Jaecoo' 'Denza' 'УАЗ'
 'Chrysler' 'JAC' 'Ravon' 'Tesla' 'Jetour' 'Ford' 'Cadillac' 'Mini' 'MG'
 'ЗАЗ' 'Lifan' 'Hummer' 'Tank' 'GAC' 'Leapmotor' 'AITO' 'Hongqi' 'Jaguar'
 'Foton' 'Peugeot' 'Dodge' 'Kaiyi' 'Niutron' 'Roewe' 'Mercedes-Maybach'
 'BAIC' 'Bentley']
