## Summary assignment - part 1

https://github.com/RoniEpstein/Data-mining-and-machine-learning-assignment

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import datetime

import warnings
warnings.filterwarnings("ignore")

def days_until_month_start(year, month):
    """Return the number of whole days from now until the start of a month.

    Args:
        year: Calendar year of the target month.
        month: Month number, 1-12.

    Returns:
        int: ``timedelta.days`` of (midnight on the 1st of the month - now).
        The value is negative once that moment has passed.

    Raises:
        ValueError: If ``year``/``month`` do not form a valid date.
    """
    # The last datetime import in this notebook is `import datetime`, so the
    # name `datetime` is the module, not the class. Calling `datetime.today()`
    # or `datetime(...)` directly fails; go through `datetime.datetime`.
    now = datetime.datetime.today()
    first_day_of_month = datetime.datetime(year, month, 1)
    return (first_day_of_month - now).days

In [2]:
def days_until_end_of_month(year,month):
    """Return how many days remain from today until the last day of a month.

    Args:
        year: Calendar year of the target month.
        month: Month number, 1-12.

    Returns:
        int: Days between today and the month's last day, or 0 when that
        day is already in the past.
    """
    month_start = datetime.date(year, month, 1)

    # The last day of the month is one day before the first day of the
    # following month (which falls in the next year after December).
    if month == 12:
        next_month_start = month_start.replace(year=year + 1, month=1)
    else:
        next_month_start = month_start.replace(month=month + 1)
    month_end = next_month_start - datetime.timedelta(days=1)

    remaining = (month_end - datetime.date.today()).days
    return remaining if remaining > 0 else 0


In [3]:
# Scrape configuration: the listings page to start from and the name to look
# for in its quick-filter links (also stored as the manufacturer of every row)
url = "https://www.ad.co.il/car"
car_model = "שברולט"

In [4]:
# Send a request to the website and parse the content.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [5]:
# Find the section containing the car model: every <div> carrying this class string.
# NOTE(review): a multi-word class_ value is compared with the element's whole class
# attribute, so this presumably stops matching if the site reorders or adds classes.
car_shev = soup.find_all("div", class_="fast-filters-children d-flex flex-row align-items-center me-3 mb-1")

In [6]:
# Extract the query string ("?...") of the filter link whose text mentions the car model.
# Start from None so a failed lookup is reported here instead of surfacing as a
# NameError (or as a stale value left over from an earlier run of this cell).
keywords_model = None
for element in car_shev:
    for tag in element.find_all("a"):  # every anchor tag within this element
        if car_model in tag.text:
            tag_model = tag['href']
            # partition keeps the "?" and gives "" when the link has no query string;
            # slicing from find("?") == -1 would return the href's last character instead.
            _, question_mark, query = tag_model.partition("?")
            keywords_model = question_mark + query

if keywords_model is None:
    raise ValueError(f"No filter link found for car model {car_model!r}")

keywords_model

'?sp261=13891'

__________________________

In [7]:
# Send a request to the car model's URL and parse the content.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url + keywords_model, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
# Extract pagination URLs
pagination = soup.find('ul', class_="pagination justify-content-between justify-content-sm-center flex-wrap")
if pagination is None:
    # No pagination bar found - presumably a single page of results, so only the
    # model page itself is scraped (same path form as the links in the bar).
    page_items = []
    page_urls_all = ['/car' + keywords_model]
else:
    page_items = pagination.find_all('li', class_="page-item")
    anchors = (item.find('a') for item in page_items)
    page_urls_all = [anchor['href'] for anchor in anchors if anchor]

# dict.fromkeys removes duplicates while keeping first-seen order, so pages are
# scraped in the same order on every run (a set gives an arbitrary order).
page_urls = list(dict.fromkeys(page_urls_all))
page_urls

['/car?sp261=13891&pageindex=3',
 '/car?sp261=13891&pageindex=4',
 '/car?sp261=13891',
 '/car?sp261=13891&pageindex=2']

In [9]:
# Function to scrape data IDs from each page
def scrape_data_ids(url_model):
    """Return the ad-page URLs of all listing cards on one results page.

    Args:
        url_model: Absolute URL of a results page.

    Returns:
        list[str]: One "https://www.ad.co.il/ad/<data-id>" URL per card that
        carries a non-empty ``data-id`` attribute, in page order. Empty when
        the page has no cards container.

    Raises:
        requests.HTTPError: If the page responds with an error status.
    """
    base_url = "https://www.ad.co.il"
    # The timeout prevents a stalled connection from blocking the whole scrape
    response = requests.get(url_model, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    cards_container = soup.find('div', class_='cards-wrap s m l')
    if cards_container is None:
        # Page without a listings container: nothing to collect
        return []

    data_ids = []
    for card in cards_container.find_all('div', class_='card-block'):
        data_id = card.get('data-id')
        if data_id:  # skip cards without a data-id
            data_ids.append(f"{base_url}/ad/{data_id}")
    return data_ids

# Accumulator for the ad URLs of all results pages; extended by the page loop below
data_ids_pags = []

In [10]:
# Collect data IDs from all pages.
# Start from an empty list inside this cell so that re-running it does not
# append the same ad URLs a second time.
data_ids_pags = []
for page_url in page_urls:
    full_url = 'https://www.ad.co.il' + page_url
    data_ids = scrape_data_ids(full_url)
    data_ids_pags.extend(data_ids)

print(f"Total data IDs collected: {len(data_ids_pags)}")

Total data IDs collected: 183


_____________

In [11]:
def _parse_int(raw):
    """Convert a table value such as '150,640' to int; a missing or blank value gives None."""
    if not raw:
        return None
    return int(raw.replace(',', ''))


def _last_word(tag):
    """Return the last whitespace-separated token of a tag's text, or 'N/A' if the text is blank."""
    words = tag.get_text().split()
    return words[-1] if words else 'N/A'


# Function to scrape car data from each car's page
def scrape_car_data(url):
    """Scrape one ad page and return its fields as a flat list.

    Args:
        url: Absolute URL of a single ad page.

    Returns:
        list: [manufactor, year, model, hand, gear, engine_capacity,
        engine_type, prev_ownership, curr_ownership, area, city, price,
        pic_num, cre_date, repub_date, description, color, km, test].
        Missing text fields are 'N/A' ('No description' for the description);
        missing numeric fields are None. ``test`` is the number of days until
        the end of the month given in the page's test-date row.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        ValueError: If a numeric field holds non-numeric text.
    """
    # The timeout prevents a stalled connection from blocking the whole scrape
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    manufactor = car_model

    # Title block: the first <h2> holds the car name, an optional second one the price
    cars_card_div = soup.find('div', class_="d-flex justify-content-between")
    cars_card = cars_card_div.find_all('h2', class_="card-title") if cars_card_div is not None else []

    if len(cars_card) > 1:
        # [:-1] drops the trailing character of the price text (presumably the currency sign)
        price = float(cars_card[1].get_text()[:-1].replace(',', '').replace('₪', '').strip())
    else:
        price = None

    if cars_card:
        # The first word of the title is skipped; the remaining words form the model name
        model = ' '.join(cars_card[0].get_text().split()[1:])
    else:
        model = 'N/A'

    # Read the details table into a key -> value mapping (two-cell rows only)
    cars_table = soup.find('table', class_="table table-sm mb-4")
    rows = cars_table.find_all('tr') if cars_table is not None else []
    details = {}
    for row in rows:
        cells = row.find_all('td')
        if len(cells) == 2:
            details[cells[0].text.strip()] = cells[1].text.strip()

    year = _parse_int(details.get('שנה'))
    hand = _parse_int(details.get('יד'))
    gear = details.get('ת. הילוכים', 'N/A')
    engine_capacity = _parse_int(details.get('נפח'))
    engine_type = details.get('סוג מנוע', 'N/A')
    area = details.get('אזור', 'N/A')
    city = details.get('עיר', 'N/A')
    km = _parse_int(details.get('ק"מ'))
    color = details.get('צבע', 'N/A')
    prev_ownership = details.get('בעלות קודמת', 'N/A')
    curr_ownership = details.get('בעלות נוכחית', 'N/A')
    test_date = details.get('טסט עד')

    # The test date is read as "<month>/<year>"
    test = None
    if test_date:
        date_parts = test_date.split("/")
        test = int(days_until_end_of_month(int(date_parts[1]), int(date_parts[0])))

    # The first two "px-3" blocks end with the creation and republication dates.
    # Check the list length before indexing so a page without them yields 'N/A'.
    date_divs = soup.find_all('div', class_="px-3")
    cre_date = _last_word(date_divs[0]) if len(date_divs) > 0 else 'N/A'
    repub_date = _last_word(date_divs[1]) if len(date_divs) > 1 else 'N/A'

    description_tag = soup.find('p', class_="text-word-break")
    if description_tag is not None:
        description = description_tag.get_text().replace('\n', '').replace('\r', ' ')
    else:
        description = 'No description'

    # Number of picture containers; None (not 0) when there are none
    pic_num = len(soup.find_all('div', class_='justify-content-center px-1')) or None

    # Field order must match the DataFrame columns
    car_data = [manufactor, year, model, hand, gear, engine_capacity, engine_type, prev_ownership,
                curr_ownership, area, city, price, pic_num, cre_date, repub_date, description, color, km, test]
    return car_data

In [12]:
# Column layout of the scraped dataset, then an empty DataFrame with that layout
car_columns = [
    'manufactor', 'Year', 'model', 'Hand', 'Gear', 'capacity_Engine', 'Engine_type',
    'Prev_ownership', 'Curr_ownership', 'Area', 'City', 'Price', 'Pic_num',
    'Cre_date', 'Repub_date', 'Description', 'Color', 'Km', 'Test',
]
df_b_f = pd.DataFrame(columns=car_columns)

In [13]:
# Scrape car data for each car and build the DataFrame in one step.
# Collecting the rows first and constructing the frame once avoids growing it
# row by row, and re-running the cell rebuilds the table instead of appending
# the same cars again.
list_cars = [scrape_car_data(car_url) for car_url in data_ids_pags]
df_b_f = pd.DataFrame(list_cars, columns=df_b_f.columns)

In [51]:
# Continue under a shorter name; this binds a second name to the same DataFrame, it does not copy it
df = df_b_f 

In [52]:
# Keep model years 1980-2011 only. .copy() makes the result an independent
# frame, so the column assignments below do not write into a slice of the
# raw data (which pandas reports as SettingWithCopyWarning).
df = df[(df['Year'] >= 1980) & (df['Year'] <= 2011)].copy()

# Dates are dd/mm/YYYY strings; errors='coerce' turns the scraper's 'N/A'
# placeholder into NaT instead of aborting the cell.
df['Repub_date'] = pd.to_datetime(df['Repub_date'], format='%d/%m/%Y', errors='coerce')
df['Cre_date'] = pd.to_datetime(df['Cre_date'], format='%d/%m/%Y', errors='coerce')

# Nullable integers keep missing values as <NA>
df['Km'] = pd.to_numeric(df['Km'], errors='coerce').astype('Int64')
df['Test'] = pd.to_numeric(df['Test'], errors='coerce').astype('Int64')

df['Pic_num'] = df['Pic_num'].fillna(0).astype(int)
df['Price'] = df['Price'].astype(float)

for column in ['Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership']:
    df[column] = df[column].astype('category')

# Replace HTML entities with actual characters, then turn line-break tags into
# spaces (this order also catches tags that arrived escaped as &lt;br/&gt;)
df['Description'] = df['Description'].replace({'&lt;': '<', '&gt;': '>'}, regex=True)
df['Description'] = df['Description'].replace({'<br/>': ' '}, regex=True)

# Convert the free-text columns to string type
for column in ['Color', 'Area', 'Description', 'City']:
    df[column] = df[column].astype('string')

In [53]:
# Inspect the distinct description texts to check the entity/tag cleanup
df['Description'].unique()

<StringArray>
[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           'שברולט מאליבו, בצבע זהב מטלי, יד שלישית, 150640 קילומטרים מצב מכני מצוין ,4 צמיגים חדשים חווית נהיגה מושלמת ,שמור ',
                                                                                                                                                                                                                                                                                                                                                        

In [54]:
# Check the column dtypes after the conversions
df.dtypes 

manufactor                 object
Year                        int64
model                      object
Hand                        int64
Gear                     category
capacity_Engine             int64
Engine_type              category
Prev_ownership           category
Curr_ownership           category
Area               string[python]
City               string[python]
Price                     float64
Pic_num                     int32
Cre_date           datetime64[ns]
Repub_date         datetime64[ns]
Description        string[python]
Color              string[python]
Km                          Int64
Test                        Int64
dtype: object

________________________________________

## Supply_score

In [55]:
# List the scraped (Hebrew) model names that the translation dictionary below has to cover
df['model'].unique()

array(['מאליבו', 'אפלנדר', 'ספארק', 'אוואו', 'אלרו', 'קרוז', 'אופטרה',
       'קורבט', 'אבאו', 'טראקס', 'קורסיקה', 'אפיקה', 'סוניק'],
      dtype=object)

In [56]:
# Dictionary to map Hebrew model names to English model names.
# Keys must equal the scraped 'model' values exactly; names without a key are
# left untranslated by convert_model_name.
# NOTE(review): the English spellings are later compared with the API's
# 'kinuy_mishari' values - confirm spelling and letter case against the API.
hebrew_to_english = {
    'קרוז': 'Cruz',
    'אורלנדו': 'Orlando',
    'מליבו': 'Malibu',
    'מאליבו': 'Malibu',  # spelling that actually appears in the scraped data
    'ספארק': 'Spark',
    'אקווינוקס': 'Equinox',
    'סוניק': 'Sonic',
    'קורבט': 'Corvette',
    'אבאו': 'Avo',
    'אימפלה': 'Impala',
    'אופטרה': 'Optera',
    'קורבט Z06': 'Corvette Z06',
    'אוואו': 'Aww',
    'קורסיקה': 'Corsica',
    'קאמרו': 'Camaro',
    'טראקס': 'Trax',
    'אפיקה': 'Epica',
    'קוואליר': 'Cavalier',
    'אפלנדר': 'Uplander',
    'קרוז החדשה': 'New Cruz',
    'קאמארו': 'Camaro',
    'סלבריטי': 'Celebrity',
    'אלרו': 'Alero'
}

In [57]:
# Function to convert Hebrew model names to English
def convert_model_name(model_name):
    """Translate a Hebrew model name to English; names without a translation are returned unchanged."""
    if model_name in hebrew_to_english:
        return hebrew_to_english[model_name]
    return model_name

In [58]:
# Translate every model name and keep the result in a helper column
df['english-model'] = df['model'].map(convert_model_name)

In [59]:
# URL to the API
api_url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=5e87a7a1-2f6f-41c1-8aec-7216d52a6cf6'

# Fetch data from the API.
# datastore_search returns only the first 100 records unless `limit` is passed,
# so page through the resource with limit/offset until every record is read.
# NOTE(review): for a large resource this takes several requests; the loop also
# copes with a server that caps `limit` below the requested page size.
api_page_size = 32000
api_records = []
while True:
    response = requests.get(
        api_url,
        params={'limit': api_page_size, 'offset': len(api_records)},
        timeout=60,
    )
    response.raise_for_status()
    data = response.json()
    batch = data['result']['records']
    api_records.extend(batch)
    total = data['result'].get('total')
    # Stop on an empty page, or once the reported total has been reached
    if not batch or (total is not None and len(api_records) >= total):
        break

# Convert the data to a DataFrame
api_df = pd.DataFrame(api_records)

In [60]:
listing_keys = ['manufactor', 'english-model', 'Year']
api_keys = ['tozar', 'kinuy_mishari', 'shnat_yitzur']

# Row-level join of listings with API records (kept under its original name)
merged_df = pd.merge(df, api_df, left_on=listing_keys, right_on=api_keys, how='left')

# Supply score = number of API records with the same manufacturer, model and year.
# Count on the API side: counting rows of the left join above would give 1 to a
# listing with no API match, and would multiply the count by the number of
# listings that share the same key.
supply_index = (
    api_df.groupby(api_keys).size()
    .reset_index(name='Supply_score')
    .rename(columns=dict(zip(api_keys, listing_keys)))
)

# Merge the supply index back into the original DataFrame; no match means a score of 0
df = pd.merge(df, supply_index, on=listing_keys, how='left')
df['Supply_score'] = df['Supply_score'].fillna(0).astype('int64')

In [61]:
# The helper column was only needed for the API join; remove it again
df = df.drop(columns='english-model')

In [62]:
# Save the final table without the index column; the utf-8-sig codec writes a BOM
# at the start of the file (presumably so spreadsheet tools detect the Hebrew text's encoding)
df.to_csv('מטלה חלק 1.csv', index=False, encoding='utf-8-sig')

In [63]:
# Check the final column dtypes, including the added Supply_score column
df.dtypes 

manufactor                 object
Year                        int64
model                      object
Hand                        int64
Gear                     category
capacity_Engine             int64
Engine_type              category
Prev_ownership           category
Curr_ownership           category
Area               string[python]
City               string[python]
Price                     float64
Pic_num                     int32
Cre_date           datetime64[ns]
Repub_date         datetime64[ns]
Description        string[python]
Color              string[python]
Km                          Int64
Test                        Int64
Supply_score                int64
dtype: object

In [None]:
# Preview the final table; head() shows the first rows instead of rendering the whole frame
df.head()