In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
import time
import os
from openpyxl.utils.exceptions import IllegalCharacterError
from zipfile import BadZipFile

# Scraping car listings from Avito.ma

In [2]:
# Root of the site: page URLs are built from it and ad links are joined onto it.
BASE_URL = "https://www.avito.ma"

# HTTP headers sent with every page request (a desktop Chrome User-Agent).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0.0.0 Safari/537.36'
    ),
}

def clean_string(value):
    """Return ``value`` as text that is safe to store in a spreadsheet cell.

    Args:
        value: Any scraped value (string, number, bool, None/NaN, or a
            container such as a list or dict).

    Returns:
        ``""`` for missing scalars (None, NaN, NaT, pd.NA); the value itself,
        unchanged, for ``int``/``float``/``bool``; otherwise ``str(value)``
        with control and other non-printable characters removed, every
        whitespace character except tab replaced by a single space, and
        leading/trailing whitespace stripped.
    """
    # pd.isna() returns an element-wise array for list-like input, whose truth
    # value is ambiguous and raises ValueError; only scalars can be "missing".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    if isinstance(value, (int, float)):
        return value

    cleaned = []
    for char in str(value):
        if char == '\t' or char.isprintable():
            cleaned.append(char)
        elif char.isspace():
            # Newlines, NBSP and other separators are not "printable"; turn
            # them into a plain space so the words around them stay apart.
            cleaned.append(' ')
        # Anything else (null bytes, other control/format characters) is dropped.
    return ''.join(cleaned).strip()

def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _extract_car_info(car):
    """Flatten one raw ad dict from the page payload into a flat record."""
    # Secondary params are read as a list of {"key": ..., "value": ...} entries;
    # index them by key so individual attributes can be looked up below.
    params = {}
    for param in _as_dict(car.get('params')).get('secondary') or []:
        if isinstance(param, dict):
            params[param.get('key')] = param.get('value', 'null')

    # dict.get's default only applies to *missing* keys; a key that is present
    # but null (or not a dict) must be normalised before chaining another .get().
    ad_type = _as_dict(car.get("adType"))
    category = _as_dict(car.get("category"))
    price = _as_dict(car.get("price"))
    seller = _as_dict(car.get("seller"))
    location = car.get("location")
    href = car.get("href")

    if isinstance(location, str) and location:
        # The city is taken as the text before the first comma of the location.
        city = clean_string(location.split(",")[0].strip())
    else:
        city = "null"

    return {
        "id": clean_string(car.get("id", "null")),
        "list_id": clean_string(car.get("listId", "null")),
        "title": clean_string(car.get("subject", "null")),
        "description": clean_string(car.get("description", "null")),
        "ad_type": clean_string(ad_type.get("label", "null")),
        "category": clean_string(category.get("formatted", "null")),
        "price": clean_string(price.get("value", "null")),
        "currency": clean_string(price.get("currency", "null")),
        "location": clean_string(car.get("location", "null")),
        "city": city,
        "date_posted": clean_string(car.get("date", "null")),
        "seller_name": clean_string(seller.get("name", "null")),
        "seller_type": clean_string(seller.get("type", "null")),
        "seller_verified": seller.get("isVerifiedSeller", False),
        "brand": clean_string(params.get("brand", "null")),
        "model": clean_string(params.get("model", "null")),
        "year": clean_string(params.get("regdate", "null")),
        "mileage": clean_string(params.get("mileage", "null")),
        "fuel_type": clean_string(params.get("fuel", "null")),
        "transmission": clean_string(params.get("transmission", "null")),
        "url": clean_string(urljoin(BASE_URL, href)) if href else "null",
        "is_premium": car.get("isPremium", False),
        "is_urgent": car.get("isUrgent", False),
    }


def scrape_cars(start_page=1, end_page=1, delay=2):
    """Scrape car ads from the listing pages ``start_page``..``end_page``.

    Each page is fetched with ``requests``, the JSON embedded in the
    ``__NEXT_DATA__`` script tag is parsed, and every ad in it is flattened
    into a dict of cleaned fields.

    Args:
        start_page: First page number to fetch (inclusive).
        end_page: Last page number to fetch (inclusive).
        delay: Seconds to wait between two consecutive page requests.

    Returns:
        A list with one dict per ad. Pages that fail to download or parse,
        and individual ads that cannot be flattened, are reported with
        ``print`` and skipped, so the list may be shorter than expected.
    """
    all_cars = []

    for page in range(start_page, end_page + 1):
        # Throttle before every request except the first, so pages that fail
        # or are skipped are rate-limited too and no sleep follows the last page.
        if page > start_page:
            time.sleep(delay)

        url = f"{BASE_URL}/fr/maroc/voiture?o={page}" if page > 1 else f"{BASE_URL}/fr/maroc/voiture"

        try:
            response = requests.get(url, headers=HEADERS, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            script = soup.find('script', id='__NEXT_DATA__')

            if not script:
                print(f"No script tag found on page {page}")
                continue

            data = json.loads(script.string)
            cars = data["props"]["pageProps"]["initialReduxState"]["ad"]["search"]["ads"]

            for car in cars:
                # Isolate failures per ad: one malformed entry must not throw
                # away the remaining ads of the page.
                try:
                    all_cars.append(_extract_car_info(car))
                except Exception as e:
                    print(f"Skipping malformed ad on page {page}: {e}")

            print(f"Page {page} done: {len(cars)} cars found")

        except Exception as e:
            # Best-effort scrape: report the failing page and move on.
            print(f"Error on page {page}: {e}")

    return all_cars

# Scrape data
cars_data = scrape_cars(start_page=501, end_page=700)

# Save to JSON first, so the raw scrape is on disk before any export is attempted
json_file = "avito_voitures_2024.json"
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(cars_data, f, ensure_ascii=False, indent=2)
print(f"Data saved to {json_file}")

# Load from JSON and clean
df_from_json = pd.read_json(json_file)

# Apply cleaning to all string columns
for col in df_from_json.select_dtypes(include=['object']).columns:
    df_from_json[col] = df_from_json[col].apply(clean_string)

output_file = "avito_cars.xlsx"
csv_file = "avito_cars_failed.csv"

# Code points openpyxl refuses to write into a cell (0-8, 11-12 and 14-31),
# mapped to None so str.translate() deletes them.
_EXCEL_ILLEGAL_CHARS = dict.fromkeys([*range(0, 9), 11, 12, *range(14, 32)])


def _strip_excel_illegal_chars(value):
    """Remove the control characters openpyxl rejects; pass non-strings through."""
    if isinstance(value, str):
        return value.translate(_EXCEL_ILLEGAL_CHARS)
    return value


def _save_csv_fallback(frame, path):
    """Write ``frame`` to ``path`` as UTF-8 (with BOM) CSV and report it."""
    frame.to_csv(path, index=False, encoding='utf-8-sig')
    print(f"Saved data to {path} instead")


try:
    # Try to save to Excel first
    df_from_json.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Successfully saved {len(df_from_json)} cars to Excel!")

except IllegalCharacterError:
    print("Encountered illegal characters - trying alternative cleaning method")
    # Remove exactly the characters Excel rejects. Encoding to ASCII would keep
    # them (they are ASCII control characters) while dropping every accented or
    # Arabic character, and the .str accessor would turn non-string cells into NaN.
    for col in df_from_json.select_dtypes(include=['object']).columns:
        df_from_json[col] = df_from_json[col].apply(_strip_excel_illegal_chars)

    try:
        df_from_json.to_excel(output_file, index=False, engine='openpyxl')
        print(f"Successfully saved {len(df_from_json)} cars to Excel after aggressive cleaning!")
    except Exception as e:
        print(f"Failed to save Excel file after cleaning: {e}")
        _save_csv_fallback(df_from_json, csv_file)

except Exception as e:
    print(f"Failed to save Excel file: {e}")
    _save_csv_fallback(df_from_json, csv_file)

Page 501 done: 35 cars found


KeyboardInterrupt: 