# Data Mining and Price Forecasting in the Automobile Market.

## imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime

## Web Scraping Car Data from ad.co.il

In [2]:
# coding: utf-8

stop_con = True
all_ids = []
all_pics = []
page_num = 1

while stop_con:
    url = "https://www.ad.co.il/car?sp261=13911&pageindex=" + str(page_num)
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'html.parser')

    # Extract data-id and data-images attributes
    data_ids = []
    data_images = []

    for card_block in soup.find_all(class_='card-block'):
        # Check if the 'data-id' and 'data-images' attributes exist
        if 'data-id' in card_block.attrs and 'data-images' in card_block.attrs:
            data_ids.append(card_block['data-id'])
            data_images.append(card_block['data-images'])
            
    if len(data_ids) == 0:
        stop_con = False
    else:
        page_num += 1
        all_ids.extend(data_ids)
        all_pics.extend(data_images)

# Print lengths of the final lists
print(len(all_ids))
print(len(all_pics))

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'} 

response = requests.get(url, headers= headers)
if response.status_code == 200:
    print("Success")
else:
    print("Failure")



118
118
Success


## Function to Extract Car Details from Web Pages

In [3]:
def get_all_details(base_url: str, posts):
    cars = []

    for car in posts:
        json = dict()
        url = base_url + car

        response = requests.get(url)
        resulte_page = BeautifulSoup(response.content, 'html.parser')
        
        try:
            dates = resulte_page.find('div', class_='d-flex flex-row align-items-center justify-content-center flex-wrap').get_text().split('\n')
        except AttributeError:
            dates = []

        try:
            price_model = resulte_page.find('div', class_='d-flex justify-content-between').get_text().split(' ')
        except AttributeError:
            price_model = []

        try:
            description = resulte_page.find('p', class_='text-word-break').get_text()
        except AttributeError:
            description = ''

        try:
            car_detail = resulte_page.find_all('td')
        except AttributeError:
            car_detail = []

        car_dtail = str(car_detail).split('\n')
        
        # Clearing unnecessary tags and characters from the list
        cleaned_description = description.replace('\r', '').replace('\n', '').replace('\'', '')
        cleaned_list = [re.sub(r'<[^>]+>', '', item).strip() for item in car_dtail]
        
        # Removing characters and strings that are not required
        cleaned_list = [re.sub(r'[\r\n\t]', '', item) for item in cleaned_list if item and not re.match(r'^[\.,/"\s]+$', item)]
        cleaned_dates = [item for item in dates if item.strip()]
        
        # Create dictionary 
        json = {cleaned_list[i]: cleaned_list[i + 1] for i in range(1, len(cleaned_list) - 1, 2)}
        json['Description'] = cleaned_description
        json['manufactor'] = 'פורד' 
        
        if len(price_model) > 1:
            json['model'] = price_model[1].split('\n')[0] if '\n' in price_model[1] else price_model[1]
            json['Price'] = price_model[1].split('\n')[1] if '\n' in price_model[1] else ''

        for item in cleaned_dates:
            if ':' in item:
                key, value = item.split(':')
                key = key.strip()
                value = value.strip()
                json[key] = value

        # Changing the key names in the dictionary
        new_keys = {
            'שנה': 'Year',
            'יד': 'Hand',
            'ת. הילוכים': 'Gear',
            'נפח': 'capacity_Engine',
            'סוג מנוע': 'Engine_type',
            'בעלות קודמת': 'Prev_ownership',
            'בעלות נוכחית': 'Curr_ownership',
            'אזור': 'Area',
            'עיר': 'City',
            'תאריך יצירה': 'Cre_date',
            'תאריך הקפצה אחרון': 'Repub_date',
            'צבע': 'Color',
            'ק"מ': 'Km',
            'טסט עד': 'Test'
        }
        
        for old_key, new_key in new_keys.items():
            if old_key in json:
                json[new_key] = json.pop(old_key)
        cars.append(json)
    
    return cars

## call to get_all_details function

In [4]:
Posts = get_all_details('https://www.ad.co.il/ad/', all_ids)

## Rearranging and Modifying Columns in the DataFrame


In [5]:
Posts_df = pd.DataFrame(Posts)
Posts_df.insert(loc=20, column='Pic_num', value=all_pics)

Posts_df = Posts_df.iloc[:, 4:23]
new_order_col = ['manufactor', 'Year', 'model', 'Hand', 'Gear','capacity_Engine', 'Engine_type', 'Prev_ownership', 'Curr_ownership', 'Area', 'City', 'Price', 'Pic_num', 'Cre_date', 'Repub_date', 'Description', 'Color', 'Km', 'Test']
Posts_df = Posts_df.reindex(columns= new_order_col)

Posts_df.head()



Unnamed: 0,manufactor,Year,model,Hand,Gear,capacity_Engine,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test
0,,2012,,3,אוטומטית,1600,בנזין,פרטית,פרטית,חולון - בת ים,בת ים,,5,25/06/2024,25/06/2024,,לבן מטאלי,201281.0,
1,,2013,,3,אוטומטית,1600,בנזין,פרטית,פרטית,קיסריה והסביבה,אור עקיבא,,7,29/08/2022,18/06/2024,,אפור עכבר,178000.0,02/2025
2,,2020,,1,אוטומטית,1500,בנזין,ליסינג,ליסינג,גליל ועמקים,שפרעם,,8,21/06/2024,21/06/2024,,לבן,158000.0,
3,,2001,,6,אוטומטית,6000,דיזל,פרטית,פרטית,אילת והערבה,אילת,,19,20/04/2024,20/04/2024,,כחול,,
4,,2013,,4,אוטומטית,2000,בנזין,חברה,פרטית,ראש העין והסביבה,אלעד,,12,31/03/2024,31/03/2024,,אפור מטאלי,136000.0,09/2024


## Data Cleaning

In [6]:
Posts_df['capacity_Engine'] = Posts_df['capacity_Engine'].str.replace(',', '')
Posts_df['Km'] = Posts_df['Km'].str.replace(',', '')
Posts_df['Price'] = Posts_df['Price'].str.replace(',', '')
Posts_df['Price'] = Posts_df['Price'].replace('', np.nan)


AttributeError: Can only use .str accessor with string values!

## Calculating Days from Today to End of Month and Dropping Intermediate Column


In [None]:
Posts_df['Test'] = pd.to_datetime(Posts_df['Test'], format='%m/%Y')
Posts_df['Last Day of Month'] = Posts_df['Test'] + pd.offsets.MonthEnd(0)

today = pd.to_datetime(datetime.today().date())
Posts_df['Test'] = (Posts_df['Last Day of Month'] - today).dt.days

Posts_df = Posts_df.drop(['Last Day of Month'], axis=1)

## Converting Data Types in the DataFrame


In [None]:
Posts_df = Posts_df.astype({
    'manufactor': str, 
    'Year': int, 
    'model': str, 
    'Hand': int, 
    'Gear': 'category',
    'capacity_Engine': int, 
    'Engine_type': 'category', 
    'Prev_ownership': 'category',
    'Curr_ownership': 'category', 
    'Area': str, 
    'City': str, 
    'Price': float, 
    'Pic_num': int, 
    'Description': str, 
    'Color': str, 
    'Km': 'Int64',
    'Test': 'Int64'
 })

Posts_df.dtypes

## import to CSV file

In [None]:
Posts_df.to_csv('ford_1980_2024.csv', index=False, encoding='utf-8-sig')