# AckoDrive Scraper (Visual Educational Version)

This is the **Refined Reference Notebook**.
The code structure has been optimized for **Readability** to clearly distinguish between 'Comments' and 'Code'.

### Key Improvements:
1.  **Broken-Down F-Strings**: Instead of one giant text block, the HTML is built line-by-line.
2.  **Syntax Highlighting Friendly**: Comments are now real Python comments (`#`), so your editor will color them differently from the string text.

In [None]:
# --- STEP 1: IMPORTING LIBRARIES ---

# requests: Lets us send HTTP requests to AckoDrive's servers and fetch the raw data (JSON).
# On its own it does not look like a browser; the browser-like User-Agent header is added in Step 4.
import requests

# BeautifulSoup: This is our 'Parser'. It takes raw HTML/XML text and turns it
# into a tree of objects (tags) that we can easily search (e.g., "find all divs with class 'card'").
from bs4 import BeautifulSoup

# pandas: The industry standard for Data Analysis in Python.
# We use it here to organize our scraped data into a Table (DataFrame) and save it as CSV.
import pandas as pd

# time: We use 'time.sleep()' to pause the script.
# THEORY: If we hit the server too fast (e.g., 100 requests/second), they might ban our IP.
# Adding a small delay mimics human behavior.
import time

# tqdm: Displays a progress bar. Helpful when processing lots of States/Cities.
from tqdm.notebook import tqdm

import os

print("Libraries Imported Successfully.")

In [None]:
# --- STEP 2: CITY DATA MANAGEMENT ---

# THEORY: Instead of hardcoding cities like ['Delhi', 'Mumbai'], which might change,
# we fetch the 'Master List' from AckoDrive. This ensures our scraper is always up-to-date.
# If they add a new city next week, this function will automatically find it.
def fetch_master_city_list():
    """Download AckoDrive's master list of cities.

    Hits the 'city-list' endpoint and returns the decoded JSON payload.
    This is a best-effort call: on a network failure, a timeout, a non-2xx
    HTTP status or a body that is not valid JSON, a short message is printed
    and an empty list is returned so the rest of the notebook still runs.

    Returns:
        The decoded JSON payload, or [] when the list could not be fetched.
    """
    url = "https://workspace.ackodrive.com/workspace/dealer/city-list/"
    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them as data.
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException: connection/timeout/HTTP errors.
        # ValueError: the body was not valid JSON.
        # Deliberately NOT a bare `except:` so Ctrl-C / kernel interrupt still works.
        print(f"Could not fetch the master city list: {exc}")
        return []

# We group cities by State because we want to save our data cleanly:
# One CSV per State (e.g., 'acko_cars_Karnataka.csv') containing all its cities.
def group_cities_by_state(cities_data):
    """Group city names by the name of the state they belong to.

    Each usable entry looks like
    ``{"name": <city>, "city_config": {"state": {"name": <state>}}}``.
    Entries that are not dicts, or whose 'city_config' / 'state' values are
    missing or not dicts (e.g. null in the JSON), or whose city/state name is
    empty, are skipped instead of crashing the notebook.

    Args:
        cities_data: Iterable of city entries (typically the parsed JSON list
            from the city-list endpoint). ``None`` is treated as empty.

    Returns:
        A dict mapping state name -> list of city names, in input order.
    """
    state_map = {}
    for item in cities_data or []:
        if not isinstance(item, dict):
            continue

        # A key can be present but null, so check the value's type rather
        # than only testing for the key's presence.
        city_config = item.get('city_config')
        if not isinstance(city_config, dict):
            continue
        state_info = city_config.get('state')
        if not isinstance(state_info, dict):
            continue

        state_name = state_info.get('name')
        city = item.get('name')
        if state_name and city:
            state_map.setdefault(state_name, []).append(city)
    return state_map

# Fetch the master list once (an empty list comes back if the request fails).
all_cities_data = fetch_master_city_list()
# Build the {state name: [city names]} mapping that the scraping loop iterates over.
state_city_map = group_cities_by_state(all_cities_data)
# Quick sanity check: a count of 0 means nothing was loaded and the scraper will do no work.
print(f"Loaded {len(state_city_map)} States.")

In [None]:
# --- STEP 3: THE 'DOM SIMULATOR' (VISUALLY ENHANCED) ---

def _car_status_text(car):
    """Return the status label for one car record, or "" when no label applies."""
    status_raw = car.get('status')
    if status_raw == 'out_of_production':
        return "Discontinued"
    if status_raw != 'sellable':
        return "Unavailable on ACKO Drive"
    if car.get('min_waiting_period') is not None or car.get('is_express_delivery'):
        return "Express Delivery"
    return ""


def _car_badge_names(car):
    """Return the trust-marker badge names we track ('Best Seller', 'Newly Launched')."""
    trust_markers = car.get('trust_markers')
    if not trust_markers or 'dynamic_trustmarker_tags' not in trust_markers:
        return []
    tags = trust_markers.get('dynamic_trustmarker_tags') or []
    names = [t.get('tag_name') for t in tags]
    return [name for name in names if name in ('Best Seller', 'Newly Launched')]


def _render_car_card(car, body_type):
    """Render one car record (a dict from the JSON response) as a 'card' <div>."""
    # Local import: keeps this cell self-contained without touching the import cell.
    from html import escape

    def text(value):
        # Escape &, < and > so that data coming from the API can never be
        # mistaken for markup. BeautifulSoup turns the entities back into the
        # original characters when we read `.text` later.
        return escape(str(value), quote=False)

    status_text = _car_status_text(car)
    status_html = f'<div class="BuyCarCard_featureTag__Status">{status_text}</div>' if status_text else ""

    badges_html = "".join(
        f'<div class="CollectionTrustMarkerBadge_wrapper"><span class="CollectionTrustMarkerBadge_label__wNGdO">{name}</span></div>'
        for name in _car_badge_names(car)
    )

    # `or []` also covers the case where the key exists but its value is null.
    fuel_types = " ".join(car.get("fuel_types") or [])
    transmission_types = " ".join(car.get("transmission_types") or [])

    # Prices are converted to whole numbers here; a missing/zero price becomes 0.
    min_price = int(car.get("min_price")) if car.get("min_price") else 0
    max_price = int(car.get("max_price")) if car.get("max_price") else 0

    # The card is built from adjacent f-strings inside parentheses, which
    # Python joins into one string. That lets us put real `#` comments
    # between the pieces.
    return (
        # 1. THE CONTAINER DIV
        # The scraper loop later finds cards by this class name.
        f'<div class="BuyCarCard_cardContainer">'

        # 2. PRE-CALCULATED HTML (status label and badges built above)
        f'    {status_html}'
        f'    {badges_html}'

        # 3. TEXT DATA
        # A single pair of braces {...} inside an f-string inserts the value.
        f'    <div class="BuyCarCard_carName__SAJVh">{text(car.get("brand_name"))} {text(car.get("model_name"))}</div>'
        f'    <div class="BuyCarCard_carVariants__uju0j">{text(car.get("variant_count"))} Variants</div>'

        # 4. THE SPECIFICATIONS BLOCK
        f'    <div class="BuyCarCard_contentSpecifications__4ELYd">'
        f'        <p class="BuyCarCard_specificationItem__yn5cu">{text(fuel_types)}</p>'
        f'        <p class="BuyCarCard_specificationItem__yn5cu">{text(transmission_types)}</p>'

        # 5. BODY TYPE
        # This is the body type we *queried* for, so catch-all queries are
        # labelled "Other".
        # NOTE(review): the JSON may carry the car's real body type, which
        # would be more useful for "Other" - confirm the field name first.
        f'        <p class="BuyCarCard_specificationItem__yn5cu">{text(body_type)}</p>'

        # 6. HIDDEN DATA INJECTION
        # Clean integer prices, stored under our own class names so they are
        # trivial to extract later.
        f'        <p class="HIDDEN_MIN_PRICE">{min_price}</p>'
        f'        <p class="HIDDEN_MAX_PRICE">{max_price}</p>'
        f'    </div>'

        f'    <div class="BuyCarCard_ratingVal__4zoYr">{text(car.get("experts_rating", "N/A"))}</div>'
        f'</div>'
    )


def get_acko_page_html_simulated(city, page, body_type, headers):
    """Fetch one page of cars as JSON and re-render it as simple HTML cards.

    Args:
        city: City name sent as the 'city' query parameter.
        page: Page number sent as the 'page' query parameter.
        body_type: Body type filter sent as 'bodyTypes'. The special value
            "Other" sends no filter at all (catch-all query).
        headers: HTTP headers to send with the request.

    Returns:
        A tuple ``(html, count)`` where ``html`` is the generated page and
        ``count`` is the number of cars in the JSON 'result' list. On a
        non-200 response or any error, ``("<html></html>", 0)`` is returned.
    """
    url = "https://ackodrive-catalog-service.ackodrive.com/rest/v1/search/cars"
    params = {"city": city, "page": page, "size": 24}
    if body_type != "Other":
        params["bodyTypes"] = body_type

    try:
        # Without a timeout a stalled connection would freeze the whole scrape.
        resp = requests.get(url, params=params, headers=headers, timeout=30)
        if resp.status_code != 200:
            return "<html></html>", 0

        result = resp.json().get('result', [])
        cards = [_render_car_card(car, body_type) for car in result]
        return "<html><body>" + "".join(cards) + "</body></html>", len(result)
    except Exception as exc:
        # Best effort: one bad page must not kill the whole run. We catch
        # `Exception` rather than using a bare `except:` so that Ctrl-C /
        # "interrupt kernel" still stops the scraper.
        print(f"Skipping page (city={city!r}, body_type={body_type!r}, page={page}): {exc}")
        return "<html></html>", 0

In [None]:
# --- STEP 4: MAIN EXECUTION & LOOP ARCHITECTURE ---
print("--- Starting Full Scraper ---")
headers = {"User-Agent": "Mozilla/5.0"}

# 1. DEFINE BODY TYPES:
BODY_TYPES = ["Sedan", "SUV", "MPV", "Hatchback", "Hyper car", "Luxury Sedan", "Luxury SUV", "Luxury MPV", "Sports car"]

# 2. PAGINATION / THROTTLING SETTINGS
MAX_PAGES = 20                # Upper bound on pages fetched per (city, body type).
PAGE_SIZE = 24                # Must match the "size" sent by the Step 3 helper.
REQUEST_DELAY_SECONDS = 0.05  # Pause after every request so we do not hammer the server.

# THE "CATCH-ALL" STRATEGY: [Sedan, SUV...] + [Other (no body-type filter)]
# Built once here because it is the same for every city.
params_list = BODY_TYPES + ["Other"]


def _node_text(card, tag, css_class, default=""):
    """Return the text of the first matching tag inside `card`, or `default` if absent."""
    node = card.find(tag, class_=css_class)
    return node.text if node is not None else default


def _card_to_record(card, car_name, state, city, b_type):
    """Turn one parsed card <div> into a flat dict (one CSV row)."""
    # BADGES (from our injected spans)
    badge_labels = {b.text for b in card.find_all('span', class_='CollectionTrustMarkerBadge_label__wNGdO')}

    # SPECS: the helper writes them in a fixed order - fuel, transmission, body type.
    specs = card.find_all('p', class_='BuyCarCard_specificationItem__yn5cu')
    fuel = specs[0].text if len(specs) > 0 else ""
    trans = specs[1].text if len(specs) > 1 else ""

    # BODY TYPE: for catch-all queries, use whatever the card itself says.
    body = b_type
    if body == "Other" and len(specs) > 2:
        body = specs[2].text

    return {
        'State': state, 'City': city, 'Car Name': car_name,
        # HIDDEN PRICES: the classes we injected in Step 3 pay off here.
        'Min Price': _node_text(card, 'p', 'HIDDEN_MIN_PRICE', "0"),
        'Max Price': _node_text(card, 'p', 'HIDDEN_MAX_PRICE', "0"),
        'Rating': _node_text(card, 'div', 'BuyCarCard_ratingVal__4zoYr', "N/A"),
        'Variants': _node_text(card, 'div', 'BuyCarCard_carVariants__uju0j', ""),
        'Fuel': fuel, 'Transmission': trans, 'Body Type': body,
        # STATUS (from our injected div); "None" means no status label.
        'Car Status': _node_text(card, 'div', 'BuyCarCard_featureTag__Status', "None"),
        'Best Seller': "Yes" if "Best Seller" in badge_labels else "No",
        'Newly Launched': "Yes" if "Newly Launched" in badge_labels else "No",
    }


# OUTER LOOP 1: STATES (one CSV file per state)
for state, cities in tqdm(state_city_map.items(), desc="Processing States"):
    state_cars = []  # This list will hold data for *this* state only.

    # INNER LOOP 2: CITIES
    for city in cities:
        # DEDUPLICATION SET: each car name is recorded once per city. Because
        # the specific body types are queried before "Other", a car found
        # under e.g. "SUV" keeps that label.
        seen_cars = set()

        # INNER LOOP 3: QUERY TYPES
        for b_type in params_list:

            # INNER LOOP 4: PAGINATION (pages 1..MAX_PAGES inclusive)
            for page in range(1, MAX_PAGES + 1):

                # CALLING THE HELPER (Step 3)
                page_html, count = get_acko_page_html_simulated(city, page, b_type, headers)

                # Throttle after *every* request, including the last page of
                # a category, so consecutive categories/cities are spaced out too.
                time.sleep(REQUEST_DELAY_SECONDS)

                # BREAK CONDITION 1: Empty page (or failed request). End of category.
                if count == 0:
                    break

                # PARSING
                soup = BeautifulSoup(page_html, 'html.parser')
                cards = soup.find_all('div', attrs={'class': 'BuyCarCard_cardContainer'})

                # INNER LOOP 5: CARDS (Extraction Logic)
                for card in cards:
                    car_name = _node_text(card, 'div', 'BuyCarCard_carName__SAJVh', "Unknown")

                    # DUPLICATE LOGIC APPLIED
                    if car_name in seen_cars:
                        continue
                    seen_cars.add(car_name)

                    state_cars.append(_card_to_record(card, car_name, state, city, b_type))

                # BREAK CONDITION 2: Last page (fewer cars than a full page)
                if count < PAGE_SIZE:
                    break

    if state_cars:
        df = pd.DataFrame(state_cars)
        # Keep only letters, digits and spaces, then turn spaces into underscores,
        # so the state name is safe to use in a file name.
        safe_state = "".join(c for c in state if c.isalpha() or c.isdigit() or c == ' ').strip().replace(' ', '_')
        filename = f"acko_cars_{safe_state}.csv"
        df.to_csv(filename, index=False)
        print(f"Saved {filename}")