### Part 1 - Web scraping
#### Team Volvo
#### Nadav Yedid - 208271007
#### Roee Brown - 208993006

In [65]:
import csv
from datetime import datetime
from bs4 import BeautifulSoup
import requests

In [66]:
# Default record for a single scraped car ad. The scrape loop copies this
# mapping for every ad, so each record starts with the same keys in the same
# order. Key spellings (e.g. "manufactor") are runtime identifiers used by the
# rest of the notebook and are kept as they are.
car_dict_template = dict(
    manufactor="",
    year=0,
    model="",
    hand=0,
    gear="",
    engine_capacity=0,
    engine_type="",
    prev_ownership="",
    curr_ownership="",
    area="",
    city="",
    price=0.0,
    pic_num=0,
    # Placeholder timestamps taken when this cell runs; the scrape loop
    # replaces them with the dates read from the ad page.
    cre_date=datetime.now(),
    repub_date=datetime.now(),
    description="",
    color="",
    km=0,
    # Raw "test valid until" text; the scrape loop converts it into the
    # "test" day count and then deletes this key from the record.
    test_date="",
    test=0,
    supply_score=0,
)

In [67]:
def get_html(url, timeout=30):
    """Download *url* and return its body parsed with BeautifulSoup.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection can block the scrape indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, a timeout, or a
            4xx/5xx response status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

In [68]:
 def get_links_and_filter(html, filter):
     """Collect the hrefs inside the product list that contain *filter*.

     Args:
         html: Parsed page exposing ``find_all`` (a ``BeautifulSoup`` tree).
         filter: Substring an href must contain to be kept. The name shadows
             the ``filter`` builtin but is part of the existing signature.

     Returns:
         The matching href strings in document order. The list is empty when
         the page has no ``product-list d-flex`` container or no anchor
         matches.
     """
     containers = html.find_all("div", {"class": "product-list d-flex"})
     if not containers:
         # No product list on the page: nothing to collect.
         return []
     hrefs = (anchor.get("href") for anchor in containers[0].find_all("a"))
     # Anchors without an href yield None, which cannot be searched with "in".
     return [href for href in hrefs if href is not None and filter in href]

In [69]:
def map_table(index):
    """Translate a spec-table label into a record key and a converter.

    Args:
        index: Label text of a table row (Hebrew), already stripped.

    Returns:
        A ``(key, converter)`` tuple: the record key to store the value under
        and the type used to convert the raw cell text. Labels that are not
        recognised yield ``("error", str)``.
    """
    label_to_field = {
        "שנה": ("year", int),
        "יד": ("hand", int),
        "ת. הילוכים": ("gear", str),
        "נפח": ("engine_capacity", int),
        "סוג מנוע": ("engine_type", str),
        'ק"מ': ("km", int),
        "טסט עד": ("test_date", str),
        "צבע": ("color", str),
        "בעלות קודמת": ("prev_ownership", str),
        "בעלות נוכחית": ("curr_ownership", str),
        "אזור": ("area", str),
        "עיר": ("city", str),
    }
    return label_to_field.get(index, ("error", str))


In [70]:
def get_description(bs_data: BeautifulSoup):
    """Return the ad's description text, or a placeholder when there is none.

    The first ``<p class="text-word-break">`` element on the page is used and
    its text is returned with surrounding whitespace removed. When no such
    element exists the string ``"No description"`` is returned.
    """
    paragraph = bs_data.find("p", {"class": "text-word-break"})
    if paragraph is None:
        return "No description"
    return paragraph.get_text().strip()


In [71]:
def get_manufactor(bs_data: BeautifulSoup):
    """Return the stripped text of the third ``<li>`` in the page's first ``<ol>``.

    That list is presumably the breadcrumb trail, with the manufacturer in
    third position (verify against the live page layout). An ``IndexError``
    propagates when the page has no ``<ol>`` or fewer than three items.
    """
    first_list = bs_data.find_all("ol")[0]
    crumbs = first_list.find_all("li")
    manufacturer_crumb = crumbs[2]
    return manufacturer_crumb.get_text().strip()



In [72]:
def get_model(bs_data: BeautifulSoup):
    """Return the stripped text of the fourth ``<li>`` in the page's first ``<ol>``.

    That list is presumably the breadcrumb trail, with the model in fourth
    position (verify against the live page layout). An ``IndexError``
    propagates when the page has no ``<ol>`` or fewer than four items.
    """
    first_list = bs_data.find_all("ol")[0]
    crumbs = first_list.find_all("li")
    model_crumb = crumbs[3]
    return model_crumb.get_text().strip()

In [73]:
def get_price(bs_data: BeautifulSoup):
    """Return the asking price as a float, or ``0.0`` when none is listed.

    The price is read from the second ``<h2>`` inside the first
    ``d-flex justify-content-between`` div. When that div holds fewer than two
    ``<h2>`` elements the price is treated as missing. The shekel sign and
    thousands separators are removed before conversion.
    """
    header_row = bs_data.find_all("div", {"class": "d-flex justify-content-between"})[0]
    headings = header_row.find_all("h2")
    if len(headings) < 2:
        return 0.0
    price_text = headings[1].get_text().strip()
    for unwanted in ("₪", ","):
        price_text = price_text.replace(unwanted, "")
    return float(price_text)

In [74]:
def get_pic_num(bs_data: BeautifulSoup):
    """Return how many ``<img>`` tags the first picture container holds.

    The container is the first div whose class is
    ``col-12 d-flex mt-3 justify-content-center flex-wrap``; when the page has
    no such div the count is 0.
    """
    gallery = bs_data.find(
        "div", {"class": "col-12 d-flex mt-3 justify-content-center flex-wrap"}
    )
    return 0 if gallery is None else len(gallery.find_all("img"))

In [75]:
def get_days_2_test(test_date, today=None):
    """Return the number of days from *today* until the test date.

    Args:
        test_date: Date text in ``"MM/YYYY"`` form. It is parsed as midnight
            on the first day of that month.
        today: Reference ``datetime`` to measure from. Defaults to
            ``datetime.now()``; pass a fixed value to use one reference time
            for a whole scrape or to get reproducible results.

    Returns:
        The ``days`` component of ``test_date - today`` as an int. It is
        negative when the test date lies in the past.

    Raises:
        ValueError: If *test_date* does not match ``"%m/%Y"``.
    """
    if today is None:
        today = datetime.now()
    expiry = datetime.strptime(test_date, "%m/%Y")
    return (expiry - today).days

In [76]:
def creation_date(bs_data: BeautifulSoup):
    """Return the ad's creation date parsed from the page.

    The text is taken from the first ``<div>`` nested in the first div whose
    class is ``d-flex flex-row align-items-center justify-content-center
    flex-wrap``. The Hebrew "creation date:" label is removed and the rest is
    parsed as ``DD/MM/YYYY``. ``IndexError`` propagates when the elements are
    missing and ``ValueError`` when the text is not a valid date.
    """
    row_class = "d-flex flex-row align-items-center justify-content-center flex-wrap"
    dates_row = bs_data.find_all("div", {"class": row_class})[0]
    created_cell = dates_row.find_all("div")[0]
    raw_text = created_cell.get_text()
    date_string = raw_text.replace("תאריך יצירה:", "").strip()
    return datetime.strptime(date_string, "%d/%m/%Y")

In [77]:
def pop_date(bs_data: BeautifulSoup):
    """Return the date the ad was last bumped, parsed from the page.

    The text is taken from the second ``<div>`` nested in the first div whose
    class is ``d-flex flex-row align-items-center justify-content-center
    flex-wrap``. The Hebrew "last bump date:" label is removed and the rest is
    parsed as ``DD/MM/YYYY``. ``IndexError`` propagates when the elements are
    missing and ``ValueError`` when the text is not a valid date.
    """
    row_class = "d-flex flex-row align-items-center justify-content-center flex-wrap"
    dates_row = bs_data.find_all("div", {"class": row_class})[0]
    bumped_cell = dates_row.find_all("div")[1]
    raw_text = bumped_cell.get_text()
    date_string = raw_text.replace("תאריך הקפצה אחרון:", "").strip()
    return datetime.strptime(date_string, "%d/%m/%Y")


In [78]:
def get_score(manufactor, model, year, timeout=30):
    """Query data.gov.il and return the ``total`` reported for this car.

    The datastore is filtered by manufacturer, commercial model name and
    production year. This lookup is best-effort: any failure is printed and
    reported as 0 so that one bad lookup does not abort the whole scrape.

    Args:
        manufactor: Manufacturer name as it appears on ad.co.il.
        model: Model name, sent as the ``kinuy_mishari`` filter.
        year: Production year, sent as the ``shnat_yitzur`` filter.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        ``result["total"]`` from the API response (the match count according
        to the CKAN datastore API), or 0 when the request fails or the API
        reports an error.
    """
    # Manufacturer names on ad.co.il and in the gov data differ, so known
    # spellings are mapped before querying.
    if manufactor == "וולוו":
        manufactor = "וולבו"

    url = "https://data.gov.il/api/3/action/datastore_search"
    payload = {
        "resource_id": "5e87a7a1-2f6f-41c1-8aec-7216d52a6cf6",
        "q": "",
        "filters": {
            "tozar": manufactor,
            "kinuy_mishari": model,
            "shnat_yitzur": year,
        },
        "limit": 100,
        "offset": 0,
        "sort": "mispar_rechavim_le_pailim asc",
    }
    try:
        data = requests.post(url, json=payload, timeout=timeout).json()
    except (requests.RequestException, ValueError) as err:
        # Covers network failures, timeouts and a body that is not JSON.
        print(f"Error in getting score: {err}")
        return 0
    if not isinstance(data, dict) or not data.get("success"):
        # Show the API's own error payload so the cause can be diagnosed.
        detail = data.get("error") if isinstance(data, dict) else data
        print(f"Error in getting score: {detail}")
        return 0
    return data["result"]["total"]

In [79]:
# Site root; the relative hrefs collected below are appended to it when each
# ad page is fetched.
BASE_URL = "https://www.ad.co.il"
# Relative URL (path and query string) of the listing page to scrape.
cars_url = "/car?sp261=13912"
# Fetch the listing page and keep only the hrefs that contain "ad/".
car_links = get_links_and_filter(
    html=get_html(f"{BASE_URL}{cars_url}"),
    filter="ad/",
)

In [91]:
# Scrape every collected ad page into one record per car.
cars_data = []

for car_link in car_links:
    bs_data = get_html(BASE_URL + car_link)
    # Start from the template so every record has the same keys and order.
    car_dict = car_dict_template.copy()
    car_dict["manufactor"] = get_manufactor(bs_data)
    car_dict["model"] = get_model(bs_data)
    car_dict["price"] = get_price(bs_data)
    car_dict["pic_num"] = get_pic_num(bs_data)
    car_dict["description"] = get_description(bs_data)
    car_dict["cre_date"] = creation_date(bs_data)
    car_dict["repub_date"] = pop_date(bs_data)

    table1 = bs_data.find_all("table", {"class": "table table-sm mb-4"})[0]
    for row in table1.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            # Not a label/value row (e.g. a header row); nothing to read.
            continue
        key, convert = map_table(cells[0].get_text().strip())
        if key == "error":
            # Unrecognised label: storing it would add an "error" key that is
            # not in the template and gives records inconsistent columns.
            continue

        row_val = cells[1].get_text().strip()
        # Numeric cells may contain thousands separators.
        if convert is int:
            row_val = row_val.replace(",", "")

        try:
            car_dict[key] = convert(row_val)
        except ValueError:
            # Blank or non-numeric cell: keep the template default.
            continue
    if car_dict.get("test_date"):
        car_dict["test"] = get_days_2_test(car_dict["test_date"])
    car_dict["supply_score"] = get_score(
        car_dict["manufactor"], car_dict["model"], car_dict["year"]
    )
    del car_dict["test_date"]  # only needed to compute "test"

    cars_data.append(car_dict)
    print(car_dict)

Error in getting score
{'manufactor': 'וולוו', 'year': 2021, 'model': 'XC70', 'hand': 2, 'gear': 'אוטומטית', 'engine_capacity': 2000, 'engine_type': 'בנזין', 'prev_ownership': 'פרטית', 'curr_ownership': 'פרטית', 'area': '', 'city': 'בסמת טבעון', 'price': 160000.0, 'pic_num': 5, 'cre_date': datetime.datetime(2024, 2, 19, 0, 0), 'repub_date': datetime.datetime(2024, 2, 19, 0, 0), 'description': 'למכירה וולו Xc40 \nשנת 2021 \nנסע 42000', 'color': 'אפור', 'km': 24000, 'test': 0, 'supply_score': 0}
Error in getting score
{'manufactor': 'וולוו', 'year': 2015, 'model': 'V40', 'hand': 3, 'gear': 'אוטומטית', 'engine_capacity': 1600, 'engine_type': 'בנזין', 'prev_ownership': 'פרטית', 'curr_ownership': 'פרטית', 'area': 'פתח תקוה והסביבה', 'city': 'פתח תקווה', 'price': 47000.0, 'pic_num': 6, 'cre_date': datetime.datetime(2024, 1, 24, 0, 0), 'repub_date': datetime.datetime(2024, 1, 24, 0, 0), 'description': 'רכב שמור מטופל כהלכה\r\nרק 92000 קמ', 'color': 'אפור מטאלי', 'km': 92000, 'test': 59, 'supp

Error in getting score
{'manufactor': 'וולוו', 'year': 2005, 'model': 'S80', 'hand': 4, 'gear': 'אוטומטית', 'engine_capacity': 2000, 'engine_type': 'בנזין', 'prev_ownership': '', 'curr_ownership': '', 'area': 'קריות', 'city': 'קרית ביאליק', 'price': 3000.0, 'pic_num': 1, 'cre_date': datetime.datetime(2022, 9, 8, 0, 0), 'repub_date': datetime.datetime(2022, 9, 8, 0, 0), 'description': 'רכב במצב שמור מאד , למכירה בחלקים', 'color': '', 'km': 275000, 'test': 0, 'supply_score': 0}
Error in getting score
{'manufactor': 'וולוו', 'year': 2008, 'model': 'S60', 'hand': 2, 'gear': 'אוטומטית', 'engine_capacity': 2400, 'engine_type': 'בנזין', 'prev_ownership': '', 'curr_ownership': '', 'area': 'רמת השרון - הרצליה', 'city': 'הרצליה', 'price': 33500.0, 'pic_num': 1, 'cre_date': datetime.datetime(2022, 9, 6, 0, 0), 'repub_date': datetime.datetime(2022, 9, 6, 0, 0), 'description': 'למבינים ,, הרכב נרכש מהתצוגה בוולוו , נסעה סך הכל 133,000 ע"י אשתי בנהיגה עדינה ,,רכב נקי מתאונות מטופל במוסך שרות וולוו ב

Error in getting score
{'manufactor': 'וולוו', 'year': 2009, 'model': 'S60', 'hand': 2, 'gear': 'אוטומטית', 'engine_capacity': 2435, 'engine_type': 'בנזין', 'prev_ownership': '', 'curr_ownership': '', 'area': 'רמת השרון - הרצליה', 'city': 'רמת השרון', 'price': 21000.0, 'pic_num': 0, 'cre_date': datetime.datetime(2022, 7, 27, 0, 0), 'repub_date': datetime.datetime(2022, 7, 27, 0, 0), 'description': 'רכב מדהים...חיצוני , פנימי ומנוע במצב מצוין !', 'color': '', 'km': 205000, 'test': 0, 'supply_score': 0}
Error in getting score
{'manufactor': 'וולוו', 'year': 2020, 'model': 'S60', 'hand': 1, 'gear': 'אוטומטית', 'engine_capacity': 1996, 'engine_type': 'בנזין', 'prev_ownership': '', 'curr_ownership': '', 'area': 'רמת השרון - הרצליה', 'city': 'הרצליה', 'price': 215000.0, 'pic_num': 0, 'cre_date': datetime.datetime(2022, 7, 15, 0, 0), 'repub_date': datetime.datetime(2022, 7, 15, 0, 0), 'description': 'כמו חדש ,\r\nהכל מעור איבזור מלא', 'color': '', 'km': 17000, 'test': 0, 'supply_score': 0}


In [90]:
# Export cars_data to CSV, one row per scraped car.
csv_file = "cars_data1.csv"
if not cars_data:
    # Without at least one record there is no header to derive.
    print("No cars scraped; nothing to export")
else:
    csv_columns = list(cars_data[0].keys())
    try:
        # The records contain Hebrew text, so the encoding is set explicitly
        # instead of depending on the platform default.
        with open(csv_file, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            writer.writerows(cars_data)
    except OSError as err:
        # Include the reason (e.g. permission denied, file locked).
        print(f"I/O error: {err}")

I/O error
