# In [None]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
import pandas as pd
import time
import os

class AvitoScraper:
    """Scrape car listings from avito.ma and append them to an Excel workbook.

    Each listing page carries its data as JSON inside the ``<script>`` tag
    whose id is ``__NEXT_DATA__``; the scraper reads that JSON instead of
    walking the rendered HTML. Fields that are absent from an ad are stored
    as the literal string ``"null"``.
    """

    BASE_URL = "https://www.avito.ma"
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    # Seconds to wait on the server before abandoning a page; without a
    # timeout a single stalled connection would block the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, output_file="avito_cars.xlsx"):
        """Remember the path of the Excel workbook that results are saved to."""
        self.output_file = output_file

    def _get_page_url(self, page_number):
        """Return the listing URL for ``page_number``.

        Page 1 (and anything below it) maps to the bare listing URL; later
        pages add the ``o`` query parameter.
        """
        listing_url = f"{self.BASE_URL}/fr/maroc/voiture"
        if page_number > 1:
            return f"{listing_url}?o={page_number}"
        return listing_url

    @staticmethod
    def _nested(car, key, field, default="null"):
        """Return ``car[key][field]``, falling back to ``default``.

        The fallback applies when ``key`` is missing, when ``car[key]`` is
        ``None``/empty, or when ``field`` is missing from the nested mapping.
        """
        return (car.get(key) or {}).get(field, default)

    def _extract_car_data(self, car):
        """Flatten one ad dictionary into a single-level record.

        Args:
            car: One entry of the ``ads`` list from the page JSON.

        Returns:
            A dict with one key per output column. Missing values are the
            string ``"null"`` (``False`` for ``seller_verified``).
        """
        # ``params`` or ``secondary`` may be missing or explicitly null.
        secondary = (car.get("params") or {}).get("secondary") or []
        params = {
            param.get("key"): param.get("value", "null")
            for param in secondary
        }

        location = car.get("location", "null")
        if location is None:
            # An explicit null would otherwise break the ``split`` below.
            location = "null"
        # The city is the text before the first comma of the location.
        city = location.split(",")[0].strip() if location != "null" else "null"

        return {
            "id": car.get("id", "null"),
            "list_id": car.get("listId", "null"),
            "title": car.get("subject", "null"),
            "description": car.get("description", "null"),
            "ad_type": self._nested(car, "adType", "label"),
            "category": self._nested(car, "category", "formatted"),
            "price": self._nested(car, "price", "value"),
            "currency": self._nested(car, "price", "currency"),
            "location": location,
            "city": city,
            "date_posted": car.get("date", "null"),
            "seller_name": self._nested(car, "seller", "name"),
            "seller_type": self._nested(car, "seller", "type"),
            "seller_verified": self._nested(car, "seller", "isVerifiedSeller", False),
            "brand": params.get("brand", "null"),
            "model": params.get("model", "null"),
            "year": params.get("regdate", "null"),
            "mileage": params.get("mileage", "null"),
            "fuel_type": params.get("fuel", "null"),
        }

    def _process_page(self, page_number):
        """Download one listing page and return its ads as flat records.

        Returns an empty list (after printing a message) when the page cannot
        be fetched, has no ``__NEXT_DATA__`` script tag, or cannot be parsed.
        """
        url = self._get_page_url(page_number)
        try:
            response = requests.get(
                url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            script = soup.find('script', id='__NEXT_DATA__')

            if not script:
                print(f"No script tag found on page {page_number}")
                return []

            data = json.loads(script.string)
            # NOTE(review): the key was previously the garbled "adù*", which
            # always raised KeyError; "ad" is assumed to be the intended
            # key - confirm against a live page.
            cars = data["props"]["pageProps"]["initialReduxState"]["ad"]["search"]["ads"]
            return [self._extract_car_data(car) for car in cars]

        except Exception as e:
            # Deliberately broad: this is the per-page boundary, and one bad
            # page must not abort a multi-page run. The failure is reported.
            print(f"Error on page {page_number}: {str(e)}")
            return []

    def scrape(self, start_page=1, end_page=1, delay=2):
        """Scrape pages ``start_page``..``end_page`` (inclusive) and save them.

        Args:
            start_page: First listing page to fetch.
            end_page: Last listing page to fetch.
            delay: Seconds to pause between consecutive page requests.

        Returns:
            The list of records collected during this call.
        """
        all_cars = []

        for page in range(start_page, end_page + 1):
            page_cars = self._process_page(page)
            all_cars.extend(page_cars)
            print(f"Page {page} done: {len(page_cars)} cars found")
            # No need to wait after the final page.
            if page < end_page:
                time.sleep(delay)

        self._save_to_excel(all_cars)
        return all_cars

    def _save_to_excel(self, new_data):
        """Append ``new_data`` to the workbook, creating it if necessary.

        Rows already in the workbook are kept and the new rows are added
        after them. When ``new_data`` is empty the workbook is not touched.
        """
        if not new_data:
            # Avoid creating an empty workbook or rewriting an unchanged one.
            print("No new records to save; Excel file left unchanged.")
            return

        new_frame = pd.DataFrame(new_data)
        if os.path.exists(self.output_file):
            existing_data = pd.read_excel(self.output_file)
            all_data = pd.concat([existing_data, new_frame], ignore_index=True)
        else:
            all_data = new_frame

        all_data.to_excel(self.output_file, index=False)
        print(f"Saved {len(all_data)} cars to Excel (added {len(new_data)} new records)!")


# Usage: crawl listing pages 1 through 200 with a scraper that writes to its
# default workbook, keeping the collected records in ``cars_data``.
scraper = AvitoScraper()
cars_data = scraper.scrape(1, 200)