In [1]:
import re
import json
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Scraping car listings from Avito.ma

In [2]:

BASE_URL = "https://www.avito.ma"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

def clean_string(text):
    """Strip characters that cannot be stored in the exported spreadsheet.

    Args:
        text: Any value. Non-string values are returned unchanged.

    Returns:
        For strings: the text with control characters, lone surrogates and
        the U+FFFE/U+FFFF noncharacters removed, U+FFFD replacement
        characters turned into spaces, every code point above U+FFFF
        (emoji and other non-BMP characters) dropped, and surrounding
        whitespace trimmed. For anything else: the input itself.
    """
    if not isinstance(text, str):
        return text

    # Remove XML-invalid characters
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uD800-\uDFFF\uFFFE\uFFFF]', '', text)
    text = text.replace('\ufffd', ' ')
    text = ''.join(c for c in text if ord(c) <= 0xFFFF)
    # Trim last: removing a leading/trailing emoji above can expose whitespace
    # that an earlier strip() would have missed.
    return text.strip()

def clean_car_data(car_info):
    """Return a copy of ``car_info`` with its string content sanitised.

    Strings go through ``clean_string``, nested dicts are processed
    recursively, and lists get their string members cleaned (non-string
    members, including dicts inside lists, are passed through untouched).
    Every other value is copied across as-is.
    """
    def _scrub(entry):
        # Order of the checks mirrors the value kinds handled: str, dict, list.
        if isinstance(entry, str):
            return clean_string(entry)
        if isinstance(entry, dict):
            return clean_car_data(entry)
        if isinstance(entry, list):
            return [clean_string(member) if isinstance(member, str) else member
                    for member in entry]
        return entry

    return {field: _scrub(entry) for field, entry in car_info.items()}

# Output column -> key inside the ad's "primary" parameter group.
_PRIMARY_FIELDS = (
    ("type", "SELL"),
    ("sector", "205"),
    ("mileage", "mileage"),
    ("brand", "brand"),
    ("model", "model"),
    ("doors", "doors"),
    ("origin", "v_origin"),
    ("first_owner", "first_owner"),
    ("fiscal_power", "pfiscale"),
    ("condition", "auto_condition"),
)

# Output column -> key inside the ad's "secondary" parameter group.
_SECONDARY_FIELDS = (
    ("year", "regdate"),
    ("gearbox", "bv"),
    ("fuel_type", "fuel"),
)

# Output column -> key inside the ad's "extra" parameter group.
_EXTRA_FIELDS = (
    ("abs", "car_abs"),
    ("airbags", "car_airbags"),
    ("audio_system", "cd_mp3_bt"),
    ("rear_camera", "car_reverse_camera"),
    ("ac", "car_ac"),
    ("esp", "car_esp"),
    ("alloy_wheels", "car_rims"),
    ("speed_limiter", "car_speed_limiter"),
    ("onboard_computer", "car_onboard_computer"),
    ("rear_radar", "car_reverse_radar"),
    ("cruise_control", "car_cruise_control"),
    ("leather_seats", "car_leather"),
    ("navigation", "car_navigation"),
    ("sunroof", "car_roof"),
    ("central_locking", "car_central_locking"),
    ("electric_windows", "car_electric_windows"),
)


def _find_next_data_script(session, url, timeout):
    """Fetch ``url`` and return its ``<script id="__NEXT_DATA__">`` tag, or None if absent."""
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.find('script', id='__NEXT_DATA__')


def _params_to_dict(items):
    """Flatten a list of ``{"key": ..., "value": ...}`` entries into one dict.

    Missing or falsy keys/values are stored as the string ``"null"``; a
    missing or null list yields an empty dict.
    """
    flattened = {}
    for item in items or []:
        key = clean_string(item.get("key")) if item.get("key") else "null"
        value = clean_string(item.get("value")) if item.get("value") else "null"
        flattened[key] = value
    return flattened


def _nested(mapping, *path, default="null"):
    """Follow ``path`` through nested dicts; return ``default`` if a level is missing or not a dict."""
    current = mapping
    for step in path:
        if not isinstance(current, dict) or step not in current:
            return default
        current = current[step]
    return current


def _build_car_info(car_data):
    """Turn one ad's ``adInfo`` payload into the flat record stored per car."""
    # `or {}`: tolerate an explicit JSON null as well as a missing key.
    params = car_data.get("params") or {}

    car_info = {
        "id": clean_string(str(car_data.get("id", "null"))),
        "list_id": clean_string(str(car_data.get("listId", "null"))),
        "title": clean_string(car_data.get("subject", "null")),
        "description": clean_string(car_data.get("description", "null")),
        "price": clean_string(str(_nested(car_data, "price", "value"))),
        "old_price": clean_string(str(_nested(car_data, "old_price", "value"))),
        "location": clean_string(_nested(car_data, "location", "city", "name")),
        "ad_type": clean_string(_nested(car_data, "type", "label")),
        "category": clean_string(_nested(car_data, "category", "name")),
        "phone": clean_string(str(car_data.get("phone", "null"))),
        "seller_name": clean_string(_nested(car_data, "seller", "name")),
        "seller_type": clean_string(_nested(car_data, "seller", "type")),
        "seller_verified": _nested(car_data, "seller", "isVerifiedSeller", default=False),
    }

    # Group order (primary, secondary, extra) fixes the column order of the export.
    for group, fields in (
        ("primary", _PRIMARY_FIELDS),
        ("secondary", _SECONDARY_FIELDS),
        ("extra", _EXTRA_FIELDS),
    ):
        values = _params_to_dict(params.get(group))
        for column, source_key in fields:
            car_info[column] = clean_string(values.get(source_key, "null"))

    return car_info


def scrape_cars(start_page=1, end_page=1, timeout=30, page_delay=2):
    """Scrape car ads from the listing pages ``start_page``..``end_page`` (inclusive).

    Each listing page's ``__NEXT_DATA__`` JSON is read to find the ads, then
    every ad page is fetched and flattened into one dict per car. Failures
    are reported with ``print`` and skipped: a failing ad does not abort its
    page, and a failing page does not abort the run.

    Args:
        start_page: First listing page to fetch (page 1 is requested without
            the ``o`` query parameter).
        end_page: Last listing page to fetch.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot block the run indefinitely.
        page_delay: Seconds to sleep after each successfully processed
            listing page.

    Returns:
        list[dict]: One record per successfully scraped ad.
    """
    print("start scraping")
    all_cars = []

    # One session for the whole run: connections are reused across requests.
    with requests.Session() as session:
        session.headers.update(HEADERS)

        for page in range(start_page, end_page + 1):
            url = f"{BASE_URL}/fr/maroc/voiture?o={page}" if page > 1 else f"{BASE_URL}/fr/maroc/voiture"

            try:
                script = _find_next_data_script(session, url, timeout)
                if not script:
                    print(f"No script tag found on page {page}")
                    continue

                data = json.loads(script.string)
                cars_url = data["props"]["pageProps"]["initialReduxState"]["ad"]["search"]["ads"]

                for car in cars_url:
                    car_url = urljoin(BASE_URL, car.get("href", "")) if car.get("href") else None
                    if not car_url:
                        print("No car URL found.")
                        continue

                    try:
                        car_script = _find_next_data_script(session, car_url, timeout)
                        if not car_script:
                            print(f"No data found for car at {car_url}")
                            continue

                        car_data_script = json.loads(car_script.string)
                        car_data = car_data_script["props"]["pageProps"]["initialReduxState"]["ad"]["view"]["adInfo"]
                        all_cars.append(_build_car_info(car_data))

                    except Exception as e:
                        # Best effort: report the ad and carry on with the next one.
                        print(f"Error processing car at {car_url}: {str(e)}")

                print(f"Page {page} done: {len(cars_url)} cars found.")
                time.sleep(page_delay)

            except Exception as e:
                # Best effort: report the page and carry on with the next one.
                print(f"Error on page {page}: {str(e)}")

    return all_cars

# Run scraper
cars_data = scrape_cars(start_page=1, end_page=600)

# Save to Excel
output_file = "avito_cars.xlsx"

# Bind all_data before the try block so that code further down can still
# inspect the fresh scrape even if reading or writing the workbook fails.
all_data = pd.DataFrame(cars_data)

try:
    if os.path.exists(output_file):
        # Append the new scrape to whatever an earlier run already saved.
        existing_data = pd.read_excel(output_file)
        all_data = pd.concat([existing_data, all_data], ignore_index=True)

    # Clean all string fields in one pass. Values are stringified first so
    # that ids read back from Excel and freshly scraped ids compare equal.
    for column in all_data.select_dtypes(include='object').columns:
        all_data[column] = all_data[column].apply(lambda x: clean_string(str(x)) if pd.notnull(x) else x)

    # Optionally remove duplicates (first occurrence wins). Skipped when there
    # is no "id" column, i.e. nothing was scraped and no previous file exists.
    if "id" in all_data.columns:
        all_data = all_data.drop_duplicates(subset=["id"])

    all_data.to_excel(output_file, index=False)
    print(f"Saved {len(all_data)} cars to Excel!")

except Exception as e:
    print(f"Error saving to Excel: {str(e)}")

# Detect rows with invalid characters (optional)
def find_invalid_excel_chars(s):
    """Report whether ``s`` is a string containing a disallowed character.

    Disallowed here means: C0 control characters other than tab, line feed
    and carriage return, lone surrogates, or U+FFFE/U+FFFF. Non-string
    values always yield ``False``.
    """
    if isinstance(s, str):
        hit = re.search(r'[\x00-\x08\x0B\x0C\x0E-\x1F\uD800-\uDFFF\uFFFE\uFFFF]', s)
        return hit is not None
    return False

# Check every cell, then mark each row that has at least one flagged cell.
cell_flags = all_data.map(find_invalid_excel_chars)
bad_rows = cell_flags.any(axis=1)
print("Rows with bad characters:", all_data[bad_rows])

start scraping
Page 501 done: 35 cars found.
Page 502 done: 35 cars found.
Page 503 done: 35 cars found.
Page 504 done: 35 cars found.
Page 505 done: 35 cars found.
Page 506 done: 35 cars found.
Page 507 done: 35 cars found.
Page 508 done: 35 cars found.
Page 509 done: 35 cars found.
Page 510 done: 35 cars found.
Page 511 done: 35 cars found.
Page 512 done: 35 cars found.
Page 513 done: 35 cars found.
Page 514 done: 35 cars found.
Page 515 done: 35 cars found.
Page 516 done: 35 cars found.
Page 517 done: 35 cars found.
Page 518 done: 35 cars found.
Page 519 done: 35 cars found.
Page 520 done: 35 cars found.
Page 521 done: 35 cars found.
Page 522 done: 35 cars found.
Page 523 done: 35 cars found.
Page 524 done: 35 cars found.
Page 525 done: 35 cars found.
Page 526 done: 35 cars found.
Page 527 done: 35 cars found.
Page 528 done: 35 cars found.
Page 529 done: 35 cars found.
Page 530 done: 35 cars found.
Page 531 done: 35 cars found.
Page 532 done: 35 cars found.
Page 533 done: 35 cars fo