שיראל אלימי 318968369

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
def get_car_links(base_url, max_pages, brand='מאזדה', timeout=30):
    """Collect ad-page URLs for one car brand from the paginated listing.

    Requests ``{base_url}?pageindex=N`` for N = 1..max_pages. On each page it
    looks inside the ``div.cards-wrap`` container for ``div.card-block``
    cards, and keeps a card when its ``h2.card-title`` text contains
    ``brand`` and the first ``<a href>`` inside the card has ``/ad/`` in
    its href.

    Args:
        base_url: Listing URL without the page query string.
        max_pages: Number of listing pages to fetch, starting at page 1.
        brand: Substring that must appear in the card title. Defaults to the
            Hebrew spelling of "Mazda".
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List of ad URLs (site prefix + href) in the order encountered.
        Duplicates are not removed.

    Raises:
        requests.HTTPError: A listing page answered with an error status.
        requests.Timeout: A request exceeded ``timeout``.
    """
    car_links = []

    for page_num in range(1, max_pages + 1):
        url = f"{base_url}?pageindex={page_num}"
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        cards_wrap = soup.find('div', class_='cards-wrap')
        if cards_wrap is None:
            # No card container on this page; move on to the next page.
            continue

        for card in cards_wrap.find_all('div', class_='card-block'):
            title = card.find('h2', class_='card-title')
            # Only keep cards whose title mentions the requested brand.
            if title is None or brand not in title.text.strip():
                continue

            anchor = card.find('a', href=True)
            if anchor is None:
                continue

            link = anchor.get('href')
            if link and '/ad/' in link:
                car_links.append('https://www.ad.co.il' + link)

    return car_links

# Root of the car listing; get_car_links appends ?pageindex=N to it.
base_url = 'https://www.ad.co.il/car'
# max_pages controls how much data is extracted (each page has 48 ads).
car_links = get_car_links(base_url, max_pages=15)

# Debugging: number of ad URLs that passed the brand filter.
print(len(car_links))

45


In [None]:
# dd/mm/yyyy date; compiled once instead of on every call.
_DATE_PATTERN = re.compile(r'\d{2}/\d{2}/\d{4}')


def scrape_car_details(car_url, timeout=30):
    """Scrape one ad page into a flat ``{label: value}`` dict.

    The dict is filled in this order (which becomes the DataFrame column
    order downstream):

    1. ``'דגם'`` and ``'מחיר'`` from the first and second ``h2.card-title``
       (stored by the code as the car name and the price).
    2. ``'תאריך_יצירה'`` and ``'תאריך_הקפצה'`` from the first two date
       containers, each only when a dd/mm/yyyy date is found in its text.
    3. ``'כמות_תמונות'``: the number of ``div.justify-content-center``
       descendants of the gallery-like container (0 when that container is
       missing). NOTE(review): presumably one such div per photo -- confirm
       against the page markup.
    4. Every two-cell row of the details table, keyed by the first cell.

    Args:
        car_url: Absolute URL of the ad page.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        ``None`` when the details table is missing (a message is printed),
        an empty dict when the page has fewer than two ``h2.card-title``
        elements, otherwise the populated dict.

    Raises:
        requests.HTTPError: The page answered with an error status.
        requests.Timeout: The request exceeded ``timeout``.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(car_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', class_='table table-sm mb-4')
    if table is None:
        print(f"No table found on page: {car_url}")
        return None

    details = {}

    card_titles = soup.find_all('h2', class_='card-title')
    if len(card_titles) < 2:
        # Name and price titles are required; without them nothing is
        # extracted and the (falsy) empty dict is returned.
        return details

    details['דגם'] = card_titles[0].text.strip()
    details['מחיר'] = card_titles[1].text.strip()

    date_divs = soup.select(
        'div.d-flex.flex-row.align-items-center.justify-content-center.flex-wrap > div.px-3'
    )
    if len(date_divs) >= 2:
        # First container holds the creation date, second the last-updated
        # date; zip stops after the two keys.
        for key, div in zip(('תאריך_יצירה', 'תאריך_הקפצה'), date_divs):
            match = _DATE_PATTERN.search(div.text.strip())
            if match:
                details[key] = match.group()

    parent_div = soup.find(
        'div', class_='col-12 d-flex mt-3 justify-content-center flex-wrap'
    )
    if parent_div is not None:
        count = len(parent_div.find_all('div', class_='justify-content-center'))
    else:
        count = 0
    details['כמות_תמונות'] = count

    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Only label/value rows are kept; other row shapes are ignored.
        if len(cols) == 2:
            details[cols[0].text.strip()] = cols[1].text.strip()

    return details

# Scrape every collected ad page and gather the per-ad dicts.
all_car_details = []

for car_url in car_links:
    try:
        car_details = scrape_car_details(car_url)
    except requests.RequestException as exc:
        # One failed request must not discard the rows already scraped:
        # report the URL and move on to the next ad.
        print(f"Skipping {car_url}: {exc}")
        continue
    # None (no details table) and {} (missing titles) are both skipped.
    if car_details:
        all_car_details.append(car_details)

# One row per ad; columns are the union of the keys seen across all ads.
car_df = pd.DataFrame(all_car_details)
print(car_df.shape)
car_df.head()


(45, 17)


Unnamed: 0,דגם,מחיר,תאריך_יצירה,תאריך_הקפצה,כמות_תמונות,שנה,יד,ת. הילוכים,נפח,סוג מנוע,"ק""מ",טסט עד,צבע,בעלות קודמת,בעלות נוכחית,אזור,עיר
0,מאזדה 3,"8,000 ₪",05/06/2024,05/06/2024,3,2007,4,אוטומטית,1600,בנזין,189000,10/2024,אפור,פרטית,פרטית,באר שבע והסביבה,אופקים
1,מאזדה 6,"16,000 ₪",16/05/2024,04/06/2024,4,2008,3,אוטומטית,2000,בנזין,195000,10/2024,שחור,פרטית,פרטית,בית שמש והסביבה,בית שמש
2,מאזדה 3,"43,000 ₪",13/05/2024,21/05/2024,8,2015,4,אוטומטית,1500,בנזין,152000,,שחור,פרטית,פרטית,עכו - נהריה,נהריה
3,מאזדה 3,"133,000 ₪",19/05/2024,28/05/2024,1,2022,2,אוטומטית,2000,בנזין,30000,04/2025,שחור,פרטית,פרטית,נתניה והסביבה,כפר יונה
4,מאזדה 3,"29,900 ₪",06/04/2024,07/04/2024,5,2012,3,אוטומטית,2000,בנזין,209,08/2024,שחור,פרטית,פרטית,קריות,קרית אתא


In [None]:
# Save the scraped table to car_Mazda.csv, leaving out the DataFrame index.
car_df.to_csv('car_Mazda.csv', index=False)