# FinalProject_DataMining-py
**Oriya Shapira**
- The data set contains information about Subaru cars posted in the years 1980-2024 (the entire database).
- link to the project page in GitHub - https://github.com/OriyaShapira/FinalProject_DataMining.git

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import calendar
import re
from concurrent.futures import ThreadPoolExecutor

In [2]:
def gat_all_pages(keywords):
    """Collect the URLs of the ad.co.il result pages for a search keyword.

    The first entry is the search URL itself; the remaining entries are taken
    from the pagination links found on that first page.

    Args:
        keywords: Search term appended to the ``keyword`` query parameter.

    Returns:
        A list of result-page URLs, or an empty list when the first page does
        not answer with HTTP 200 or when any error occurs along the way.
    """
    try:
        first_page = "https://www.ad.co.il/car?keyword=" + keywords
        reply = requests.get(first_page)
        if reply.status_code != 200:
            print (f"Could not fetch data from website. Got status code: {reply.status_code}")
            return []
        soup = BeautifulSoup(reply.content, 'html.parser')
        pager_items = soup.find_all('li', {'class': 'page-item mx-1 d-none d-sm-block'})
        # Pagination hrefs are site-relative, so prefix them with the host.
        following_pages = [
            "https://www.ad.co.il" + item.find('a').get('href')
            for item in pager_items
        ]
    except Exception as err:
        print(f"Could not find links for {keywords} pages. Got error: {str(err)}")
        return []
    return [first_page, *following_pages]

In [3]:
def get_ad_url_and_price(url_list):
    """Collect the link and asking price of every ad on the given result pages.

    Args:
        url_list: Iterable of result-page URLs (as produced by gat_all_pages).

    Returns:
        A list of ``(ad_url, price)`` tuples where ``price`` is a float.
        Ads without a link or without a parsable price are skipped.
        An empty list is returned when any page does not answer with HTTP 200
        or when an unexpected error occurs.
    """
    ads_list = []
    try:
        for url in url_list:
            response = requests.get(url)
            if response.status_code != 200:
                print(f"Could not fetch data from website. Got status code: {response.status_code}")
                return []
            results_page = BeautifulSoup(response.content, 'html.parser')
            for card in results_page.find_all('div', {'class': 'card-body p-md-3'}):
                try:
                    car_ad_url = "https://www.ad.co.il" + card.find('a').get('href')
                    price_text = card.find('div', {'class': 'price ms-1'}).text
                    price = float(price_text.strip().replace(",", "").replace("₪", ""))
                except (AttributeError, TypeError, ValueError):
                    # Missing link/price element or a non-numeric price:
                    # skip this ad only, keep everything collected so far.
                    continue
                ads_list.append((car_ad_url, price))
        return ads_list
    except Exception as e:
        # The message must not reference names outside this function's scope,
        # otherwise the handler itself fails with NameError.
        print(f"Could not get ad links. Got error: {e}")
        return []

In [4]:
def get_sys_dates(results_page):
    """Extract the creation and last-bump dates from a parsed ad page.

    Args:
        results_page: Parsed ad page (BeautifulSoup object).

    Returns:
        A dict mapping the Hebrew label ('תאריך יצירה' and/or
        'תאריך הקפצה אחרון') to a ``datetime.datetime``. Labels that are
        absent or whose date cannot be parsed are left out; an empty dict is
        returned when the dates container is not on the page.
    """
    wanted_labels = ('תאריך יצירה', 'תאריך הקפצה אחרון')
    car_dict = {}
    container = results_page.find('div',{'class':'d-flex flex-row align-items-center justify-content-center flex-wrap'})
    if container is None:
        return car_dict
    for entry in container.find_all('div',{'class':'px-3'}):
        # Each entry reads "<label>: dd/mm/YYYY". partition() never raises,
        # unlike unpacking the result of split(":").
        label, separator, date_str = entry.text.strip().partition(":")
        label = label.strip()
        # The label is tested explicitly: `'a' or 'b' in x` is always true.
        if not separator or label not in wanted_labels:
            continue
        try:
            car_dict[label] = datetime.datetime.strptime(date_str.strip(), "%d/%m/%Y")
        except ValueError:
            # Unparsable date text: leave this label out of the result.
            continue
    return car_dict

In [5]:
def get_car_details(car_link):
    """Download one ad page and return every detail published in it.

    Args:
        car_link: Absolute URL of the ad page.

    Returns:
        A dict with 'manufactor' and 'model' (from the first <h2>, when one
        exists), 'Pic_num' (number of <img> tags on the page), 'Description',
        one entry per two-cell row of the details table (keyed by the row's
        label, commas removed), and the dates returned by get_sys_dates.
        An empty dict is returned when the page does not answer with HTTP 200.
    """
    response = requests.get(car_link)
    if response.status_code != 200:
        # Do not try to parse an error page as if it were an ad.
        print(f"Could not fetch data from website. Got status code: {response.status_code}")
        return {}
    car_dict = {}
    results_page = BeautifulSoup(response.content, 'html.parser')

    car_h2 = results_page.find_all('h2')
    if car_h2:
        split_header = car_h2[0].get_text(strip = True).split(' ')
        car_dict['manufactor'] = split_header[0]
        # Model is "<manufacturer> <first model word>" when a second word exists.
        car_dict['model'] = ' '.join(split_header[:2])

    car_dict['Pic_num'] = len(results_page.find_all('img'))

    description_tag = results_page.find('p',{'class':'text-word-break'})
    car_dict['Description'] = description_tag.get_text(strip = True) if description_tag is not None else ""

    table = results_page.find('table',{'class':'table table-sm mb-4'})
    if table:
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) != 2:
                # Only label/value rows are meaningful; skip anything else
                # instead of failing on the tuple unpacking below.
                continue
            key, value = [cell.get_text(strip = True).replace(",","") for cell in cells]
            car_dict[key] = value

    car_dict.update(get_sys_dates(results_page))
    return car_dict

In [6]:
def get_test_date_in_days(date_str):
    """Return the number of days from now until the car test expires.

    The test is treated as valid through the last day of the given month.

    Args:
        date_str: Test expiry in ``"mm/YYYY"`` format, or None.

    Returns:
        The whole-day part of (last day of that month at 00:00 minus the
        current local time); negative when that moment has passed. None when
        ``date_str`` is None or is not in ``"mm/YYYY"`` format.
    """
    if date_str is None:
        return None
    try:
        date_obj = datetime.datetime.strptime(date_str, "%m/%Y")
    except ValueError:
        # A malformed value in one ad must not abort the whole scrape.
        return None
    year, month = date_obj.year, date_obj.month
    # monthrange()[1] is the number of days in the month, i.e. its last day.
    last_day_of_month = datetime.datetime(year, month, calendar.monthrange(year, month)[1])
    return (last_day_of_month - datetime.datetime.now()).days

In [7]:
def parse_car_details(car_link_and_price):
    """Build the flat record for a single ad.

    Args:
        car_link_and_price: Sequence whose first item is the ad URL and whose
            second item is the price taken from the listing page.

    Returns:
        A dict with English column names. Every value except 'Price' and
        'Test' is looked up in the scraped ad details and is None when
        missing; 'Price' is the listing-page price; 'Test' is the result of
        get_test_date_in_days for the ad's test-expiry field.
    """
    scraped = get_car_details(car_link_and_price[0])
    field = scraped.get  # returns None for missing keys
    return {
        'Manufactor': field('manufactor'),
        'Year': field('שנה'),
        'Model': field('model'),
        'Hand': field('יד'),
        'Gear': field('ת. הילוכים'),
        'Engine_capacity': field('נפח'),
        'Engine_type': field('סוג מנוע'),
        'Prev_ownership': field('בעלות קודמת'),
        'Curr_ownership': field('בעלות נוכחית'),
        'Area': field('אזור'),
        'City': field('עיר'),
        # Price comes from the listing page, not from the ad page itself.
        'Price': car_link_and_price[1],
        'Pic_num': field('Pic_num'),
        'Cre_date': field('תאריך יצירה'),
        'Repub_date': field('תאריך הקפצה אחרון'),
        'Description': field('Description'),
        'Color': field('צבע'),
        'Km': field('ק"מ'),
        'Test': get_test_date_in_days(field('טסט עד')),
    }

In [8]:
# Scrape all Subaru ads: first the result pages for the search term, then the
# (ad URL, price) pair of every ad listed on those pages.
keywords = 'סובארו'
all_pages_links = gat_all_pages(keywords)
all_car_links = get_ad_url_and_price(all_pages_links)

# Download and parse the individual ad pages concurrently. These are threads
# (each task performs an HTTP request); max_workers=None leaves the pool size
# to the library default rather than tying it to the number of CPU cores.
with ThreadPoolExecutor(max_workers=None) as executor:  # default-sized thread pool
    results = executor.map(parse_car_details, all_car_links)  # one task per (url, price) tuple
    
# Despite its name, cars_dict is a DataFrame: one row per ad, one column per
# key of the dict returned by parse_car_details.
cars_dict = pd.DataFrame(results)

# Assigning types to columns
categoricals = ['Prev_ownership', 'Curr_ownership', 'Engine_type', 'Gear']
strings = ['Area','City','Color','Description','Manufactor','Model']
integers = ['Year','Hand', 'Engine_capacity', 'Pic_num', 'Km', 'Test']
cars_dict[categoricals] = cars_dict[categoricals].astype('category')
cars_dict[strings] = cars_dict[strings].astype(pd.StringDtype())
# 'Int64' (capital I) is pandas' nullable integer dtype, so missing values stay <NA>.
cars_dict[integers] = cars_dict[integers].astype('Int64')

# Persist the table; utf-8-sig writes a BOM at the start of the file.
cars_dict.to_csv('cars_dict.csv', index=False, encoding='utf-8-sig')
# NOTE(review): display is not imported here; presumably the notebook
# (IPython) built-in, so this line only works inside Jupyter. Confirm.
display(cars_dict)

Unnamed: 0,Manufactor,Year,Model,Hand,Gear,Engine_capacity,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test
0,סובארו,2009,סובארו B3,4,אוטומטית,1500,בנזין,פרטית,פרטית,בית שמש והסביבה,בית שמש,11000.0,13,2024-05-12,2024-05-12,רכב שמור קילומטראז נמוך נעילה מרכזית מערכת שמע...,שחור,155000,238
1,סובארו,2014,סובארו אימפרזה,3,אוטומטית,1600,בנזין,פרטית,פרטית,באר שבע והסביבה,באר שבע,25000.0,25,2023-11-30,2024-04-25,סובארו אימפרזה מודל 2014 יד 3 השכרה לשעבר 187...,לבן,187000,176
2,סובארו,2015,סובארו B4,2,אוטומטית,2500,בנזין,פרטית,פרטית,,טייבה,35000.0,22,2024-04-04,2024-04-03,סובארו ב4 2015 דגם פרימיום החדש פנסי xenon 4...,לבן,251000,85
3,סובארו,2012,סובארו B4,3,אוטומטית,5500,בנזין,פרטית,פרטית,ירושלים והסביבה,מעלה אדומים,24.0,19,2024-02-03,2024-02-03,,אפור,140,
4,סובארו,2009,סובארו אימפרזה,3,ידנית,2400,בנזין,פרטית,פרטית,,ירכא,95000.0,16,2022-08-04,2023-08-03,Sti שחור מאט ללא תאונות שמורה מאד. יד שלישית ט...,שחור,125000,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,סובארו,2016,סובארו XV,3,אוטומטית,1600,בנזין,,,חדרה וישובי עמק חפר,אליכין,68000.0,11,2022-06-27,2022-06-27,הרכב שמור כחדש &lt;br/&gt;&lt;br/&gt;פירו...,,107000,
95,סובארו,2013,סובארו XV,3,אוטומטית,2000,בנזין,,,נס ציונה - רחובות,נס ציונה,58000.0,11,2022-06-23,2022-06-23,"שמור ומטופל בזמן, יפה מבחוץ ומבפנים, זריז יעיל...",,138000,
96,סובארו,2019,סובארו Outback,1,אוטומטית,2500,בנזין,,,מושבים במרכז,בית נחמיה,155000.0,3,2022-05-29,2022-05-29,רכב סובארו פורסטר החדשה 2019 שמורה כמו חדשה,,51400,
97,סובארו,2008,סובארו B3,5,אוטומטית,1500,בנזין,,,ירושלים והסביבה,גבע בנימין,10000.0,8,2022-05-26,2022-05-26,,,170000,
