In [1]:
import json
import random
from urllib.parse import urlencode, urljoin

import pandas as pd
import requests

# --------------------------
# Fake User Agents
# --------------------------
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]

def get_headers():
    """Build the HTTP headers for one request.

    Returns:
        dict: A single "User-Agent" entry whose value is drawn at random
        from the module-level USER_AGENTS list.
    """
    chosen_agent = random.choice(USER_AGENTS)
    headers = {"User-Agent": chosen_agent}
    return headers

# --------------------------
# Persian digits → English digits
# --------------------------
# Built once instead of on every call. Maps the Persian digits (U+06F0-U+06F9)
# and the Arabic-Indic digits (U+0660-U+0669) onto ASCII 0-9.
_DIGIT_TRANSLATION = {
    zero + offset: str(offset)
    for zero in (0x06F0, 0x0660)
    for offset in range(10)
}


def convert_persian_digits(text):
    """Return `text` as a string with Persian/Arabic-Indic digits replaced by ASCII digits.

    Args:
        text: Value to normalise. Non-string values are converted with str().

    Returns:
        str: "N/A" when `text` is None, an empty string, or already "N/A";
        otherwise the stringified value with every Persian or Arabic-Indic
        digit translated to its ASCII equivalent. All other characters are
        left untouched.
    """
    if text is None or text == "N/A":
        return "N/A"
    # Stringify before the emptiness test so a numeric 0 is kept as "0"
    # rather than being mistaken for a missing value.
    text = str(text)
    if not text:
        return "N/A"
    return text.translate(_DIGIT_TRANSLATION)

# --------------------------
# Extract ad info from API JSON response
# --------------------------
def extract_ad_from_api(ad_data):
    """Flatten one ad object from the search API response into a row dict.

    Args:
        ad_data (dict): A single ad entry. The fields read are the "detail"
            sub-object (code, title, year, mileage, location, transmission,
            fuel, body_color, inside_color, url, description) and the
            "price" sub-object (price).

    Returns:
        dict: Keys "code", "title", "production_year", "Mileage", "Color",
        "inside_color", "Transmission", "Fuel", "price", "location",
        "Description" and "link". Any field that is missing, None or an
        empty string is reported as "N/A". Year and mileage are passed
        through convert_persian_digits; price is stringified; link is the
        ad URL resolved against https://bama.ir.
    """
    # `or {}` also covers keys that are present but null; the default
    # argument of dict.get() only applies when the key is absent.
    detail = ad_data.get("detail") or {}
    price_data = ad_data.get("price") or {}

    def safe_get(value, default="N/A"):
        """Return `default` for None or an empty string, else `value` unchanged."""
        if value is None or value == "":
            return default
        return value

    # Resolve the ad path against the site root. urljoin copes with paths
    # that lack a leading slash and with URLs that are already absolute.
    url_path = safe_get(detail.get("url"))
    if url_path == "N/A" or not url_path:
        full_link = "N/A"
    else:
        full_link = urljoin("https://bama.ir", str(url_path))

    price = price_data.get("price")
    price = "N/A" if price is None or price == "" else str(price)

    return {
        "code": safe_get(detail.get("code")),
        "title": safe_get(detail.get("title")),
        "production_year": convert_persian_digits(safe_get(detail.get("year"))),
        "Mileage": convert_persian_digits(safe_get(detail.get("mileage"))),
        "Color": safe_get(detail.get("body_color")),
        "inside_color": safe_get(detail.get("inside_color")),
        "Transmission": safe_get(detail.get("transmission")),
        "Fuel": safe_get(detail.get("fuel")),
        "price": price,
        "location": safe_get(detail.get("location")),
        "Description": safe_get(detail.get("description")),
        "link": full_link,
    }

# --------------------------
# MAIN SCRAPER - Using API
# --------------------------
def _collect_page_ads(ads_data, ads_final, seen_codes, target_ads):
    """Append the not-yet-seen ads of one page to `ads_final`; return how many were added."""
    added = 0
    for ad_data in ads_data:
        if len(ads_final) >= target_ads:
            break

        # "detail" may be absent or null; both mean "no code to de-duplicate on".
        ad_code = (ad_data.get("detail") or {}).get("code") or ""
        if ad_code and ad_code in seen_codes:
            continue

        ads_final.append(extract_ad_from_api(ad_data))
        if ad_code:
            seen_codes.add(ad_code)
        added += 1
    return added


def scrape_bama(target_ads=50, vehicle="samand", year_from="1385-2006", year_to=""):
    """
    Scrape bama.ir using their API endpoint

    Pages through https://bama.ir/cad/api/search, converting every ad with
    extract_ad_from_api and skipping ads whose code was already collected.
    Scraping stops when `target_ads` ads were gathered, the API reports no
    further page, a page yields no ads or no new ads, or any error occurs.
    Errors are printed, never raised, so whatever was collected up to that
    point is still returned.

    Args:
        target_ads: Number of ads to collect (default: 50)
        vehicle: Value sent as the "vehicle" query parameter (default: "samand")
        year_from: Value sent as the "yearFrom" query parameter (default: "1385-2006")
        year_to: Value sent as the "yearTo" query parameter (default: "")

    Returns:
        list[dict]: One row dict per collected ad, in the order received.
    """
    api_base_url = "https://bama.ir/cad/api/search"
    ads_final = []
    seen_codes = set()  # Track unique ad codes to avoid duplicates

    print(f"\n--- Scraping API for {target_ads} ads ---")

    page = 1
    # The context manager releases the session's pooled connections on every
    # exit path, including the early `break`s below.
    with requests.Session() as session:
        while len(ads_final) < target_ads:
            params = {
                "yearFrom": year_from,
                "yearTo": year_to,
                "vehicle": vehicle,
                "pageIndex": page,
            }
            # Logged for the reader only; the request itself is built from `params`.
            print(f"Page {page}: {api_base_url}?{urlencode(params)}")

            try:
                r = session.get(api_base_url, params=params, headers=get_headers(), timeout=10)
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}. Stopping.")
                break

            # Parsed in its own try block: recent requests versions raise a
            # JSON error that is also a RequestException, so a shared handler
            # chain would misreport it as a failed request.
            try:
                data = r.json()
            except ValueError as e:
                print(f"Failed to parse JSON: {e}. Stopping.")
                break

            # Deliberate best-effort: an unexpected payload shape ends the
            # scrape but keeps the ads gathered so far.
            try:
                if not data.get("status", False):
                    print("API returned error status. Stopping.")
                    break

                metadata = data.get("metadata") or {}
                total_pages = metadata.get("total_pages") or 0
                has_next = metadata.get("has_next", False)
                total_count = metadata.get("total_count", 0)

                if page == 1:
                    print(f"API reports {total_count} total ads across {total_pages} pages")

                ads_data = (data.get("data") or {}).get("ads") or []
                if not ads_data:
                    print("No more ads found. Stopping.")
                    break

                print(f"Found {len(ads_data)} ads on page {page}")

                new_ads_count = _collect_page_ads(ads_data, ads_final, seen_codes, target_ads)
                print(f"Added {new_ads_count} new ads. Total unique ads collected: {len(ads_final)}")

                # Apply the page bound only when the API reported one; a
                # missing total must not end pagination while has_next is set.
                if not has_next or (total_pages and page >= total_pages):
                    print("Reached last page or no more pages available.")
                    break

                # If we didn't get any new ads, stop to avoid infinite loop
                if new_ads_count == 0:
                    print("No new ads found on this page. Stopping.")
                    break
            except Exception as e:
                print(f"Unexpected error: {e}. Stopping.")
                break

            page += 1

    print(f"\nTotal unique ads collected: {len(ads_final)}")
    return ads_final

# --------------------------
# EXECUTE SCRAPER
# --------------------------
# Collect the ads and load them into a DataFrame with a fixed column order.
ads = scrape_bama(target_ads=50)

df = pd.DataFrame(ads)

# Define column order
ordered_cols = [
    "code", "title", "production_year", "Mileage", "Color", "inside_color",
    "Transmission", "Fuel", "price", "location", "Description", "link"
]

# Reindex to ensure all columns are present (also when no ads were collected)
df = df.reindex(columns=ordered_cols)

print("\n--- FINAL DATAFRAME INFO ---")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print("\n--- FIRST 10 ROWS ---")
print(df.head(10))
print("\n--- DATAFRAME SUMMARY ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() appended a stray "None" line to the output.
df.info()



--- Scraping API for 50 ads ---
Page 1: https://bama.ir/cad/api/search?yearFrom=1385-2006&yearTo=&vehicle=samand&pageIndex=1
API reports 41 total ads across 3 pages
Found 30 ads on page 1
Added 30 new ads. Total unique ads collected: 30
Page 2: https://bama.ir/cad/api/search?yearFrom=1385-2006&yearTo=&vehicle=samand&pageIndex=2
Found 30 ads on page 2
Added 20 new ads. Total unique ads collected: 50

Total unique ads collected: 50

--- FINAL DATAFRAME INFO ---
Total rows: 50
Total columns: 12

--- FIRST 10 ROWS ---
       code       title production_year      Mileage    Color inside_color  \
0  oqjeairs    سمند، LX            1385   240,000 km  نقره ای      خاکستری   
1  2tubbohx    سمند، LX            1394   230,000 km     سفید          کرم   
2  cvwihfxt  سمند، سورن            1404  صفر کیلومتر     سفید         مشکی   
3  1srb7g5i    سمند، LX            1396   249,000 km     سفید      قهوه ای   
4  svk7lnw5  سمند، سورن            1404  صفر کیلومتر     مشکی      خاکستری   
5  azhr3kmf

In [3]:
# Save the ads DataFrame as an Excel workbook, leaving out the index column.
excel_filename = "bama_samand_ads.xlsx"
df.to_excel(
    excel_writer=excel_filename,
    engine="openpyxl",
    index=False,
)