In [104]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Root address of the per-listing detail pages; a listing id is appended to it.
cross_url = "https://www.immobiliare.it/annunci"

# Search-result entry points, keyed by operation: "compra" and "affitta".
BASE_URLS = dict(
    compra="https://www.immobiliare.it/vendita-case/torino/",
    affitta="https://www.immobiliare.it/affitto-case/torino/",
)

# Browser-like User-Agent header passed along with the HTTP requests.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
}


def get_location(url, timeout=30):
    """Fetch a listing detail page and return its embedded location record.

    The page carries its data as JSON inside the
    ``<script id="__NEXT_DATA__" type="application/json">`` tag; the location
    is read from
    ``props.pageProps.detailData.realEstate.properties[0].location``.

    Args:
        url: Address of the listing detail page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        The value stored under ``location`` in the decoded JSON.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the page has no ``__NEXT_DATA__`` payload (or the
            payload is not valid JSON).
        KeyError, IndexError: If the JSON lacks the expected path.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error rather than later on a missing tag.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    next_data = soup.find("script", type="application/json", id="__NEXT_DATA__")
    if next_data is None or not next_data.string:
        raise ValueError(f"No __NEXT_DATA__ JSON payload found at {url}")

    json_data = json.loads(next_data.string)
    real_estate = json_data["props"]["pageProps"]["detailData"]["realEstate"]
    return real_estate["properties"][0]["location"]

# Function to scrape a single page of listings
def get_single_page_results(url, timeout=30):
    """Download one search-results page and return its listing elements.

    Args:
        url: Address of the search-results page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        The ``<li>`` elements whose class attribute is
        ``nd-list__item in-searchLayoutListItem``, as returned by
        ``find_all`` (empty when the page contains none).

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup.find_all("li", class_="nd-list__item in-searchLayoutListItem")

def get_id_value(listing):
    """Return the ``id`` attribute of a listing's card element.

    Searches ``listing`` (an already-parsed element) for the first ``<div>``
    with the ``in-listingCard`` class and reads its ``id`` attribute.

    Returns:
        The ``id`` attribute value, or ``None`` when no such ``<div>`` is
        found or it carries no ``id``.
    """
    card = listing.find("div", attrs={"class": "in-listingCard"})
    return card.get("id") if card else None

def get_annuncio_url(annuncio_id):
    """Build a listing's detail-page URL: ``cross_url``, the id, and a trailing slash."""
    return "{}/{}/".format(cross_url, annuncio_id)

def get_annuncio_caratteristiche(url, timeout=30):
    """Fetch a listing detail page and return its feature table as a dict.

    Every ``<div class="re-featuresItem">`` contributes one entry: the text
    of its ``<dt class="re-featuresItem__title">`` becomes the key and the
    text of its ``<dd class="re-featuresItem__description">`` the value.
    Items missing either tag, or whose stripped text is empty, are skipped.

    Args:
        url: Address of the listing detail page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        dict mapping feature title to feature description (both stripped).

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    row_data = {}
    for item in soup.find_all("div", class_="re-featuresItem"):
        title_tag = item.find("dt", class_="re-featuresItem__title")
        description_tag = item.find("dd", class_="re-featuresItem__description")
        if title_tag is None or description_tag is None:
            continue

        title = title_tag.text.strip()
        description = description_tag.text.strip()
        # Only keep complete pairs; blank titles or values carry no data.
        if title and description:
            row_data[title] = description

    return row_data

def get_max_pages(url, timeout=30):
    """Return the number of result pages advertised by a search page.

    The count is read from the pagination elements whose class attribute is
    ``nd-button nd-button--ghost is-disabled in-pagination__item
    is-mobileHidden``; non-numeric items are ignored and the largest number
    found is returned.

    Args:
        url: Address of the first search-results page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        The highest page number found, or ``1`` when no numeric pagination
        item is present, so callers can always use the result in ``range``.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    pagination_items = soup.find_all(
        "div",
        class_="nd-button nd-button--ghost is-disabled in-pagination__item is-mobileHidden",
    )

    page_texts = (item.get_text(strip=True) for item in pagination_items)
    # isdecimal() only accepts characters that int() can parse.
    page_numbers = [int(text) for text in page_texts if text.isdecimal()]

    # No pagination found: fall back to a single page instead of None.
    return max(page_numbers, default=1)

def scrape_pages():
    """Scrape every result page of every entry in ``BASE_URLS``.

    For each operation the page count is read with ``get_max_pages``, each
    results page is fetched with ``?pag=<n>`` appended to the base URL, and
    every listing on it is turned into one row: the listing's features plus
    the keys ``"operation"``, ``"url"``, ``"latitude"`` and ``"longitude"``.

    Listing elements without an id are skipped, because no detail-page URL
    can be built for them.

    Returns:
        list of dicts, one per scraped listing.
    """
    rows = []
    for listing_type, base_url in BASE_URLS.items():
        # A missing page count (None) is treated as a single page.
        max_pages = get_max_pages(base_url) or 1
        for page in range(1, max_pages + 1):
            print(f"Scraping {listing_type} - Page {page}/{max_pages}")
            page_url = f"{base_url}?pag={page}"
            for listing in get_single_page_results(page_url):
                listing_id = get_id_value(listing)
                if not listing_id:
                    # Without an id the URL would end in "/None/".
                    continue

                url = get_annuncio_url(listing_id)
                row = get_annuncio_caratteristiche(url)
                location = get_location(url)
                row["operation"] = listing_type
                row["url"] = url
                row["latitude"] = location["latitude"]
                row["longitude"] = location["longitude"]
                rows.append(row)
    return rows

In [None]:
# Run the full scrape over every entry in BASE_URLS; each results page and
# each listing is fetched over the network, so this cell is slow to run.
res = scrape_pages()

Scraping compra - Page 1/80
Scraping compra - Page 2/80
Scraping compra - Page 3/80
Scraping compra - Page 4/80
Scraping compra - Page 5/80
Scraping compra - Page 6/80
Scraping compra - Page 7/80
Scraping compra - Page 8/80
Scraping compra - Page 9/80
Scraping compra - Page 10/80
Scraping compra - Page 11/80
Scraping compra - Page 12/80
Scraping compra - Page 13/80
Scraping compra - Page 14/80
Scraping compra - Page 15/80
Scraping compra - Page 16/80
Scraping compra - Page 17/80
Scraping compra - Page 18/80
Scraping compra - Page 19/80
Scraping compra - Page 20/80
Scraping compra - Page 21/80
Scraping compra - Page 22/80
Scraping compra - Page 23/80
Scraping compra - Page 24/80
Scraping compra - Page 25/80
Scraping compra - Page 26/80
Scraping compra - Page 27/80
Scraping compra - Page 28/80
Scraping compra - Page 29/80
Scraping compra - Page 30/80
Scraping compra - Page 31/80
Scraping compra - Page 32/80
Scraping compra - Page 33/80
Scraping compra - Page 34/80
Scraping compra - Page 

In [106]:
# Build one DataFrame row per scraped listing (each element of `res` is a dict)
# and save it to scraping.csv. Because index=False is not passed, to_csv also
# writes the DataFrame index as the first column of the file.
results_df = pd.DataFrame(res)
results_df.to_csv("scraping.csv")

In [107]:
# Preview the first three scraped rows.
results_df.head(3)

Unnamed: 0,Tipologia,Contratto,Piano,Piani edificio,Ascensore,Superficie,Locali,Camere da letto,Cucina,Bagni,...,Valore perizia,Deposito cauzionale,Rialzo minimo,Rialzo minimo in caso di gara,Luogo vendita,Termine presentazione,Procedura,Tribunale,Luogo presentazione,Cauzione
0,Appartamento | Intera proprietà | Classe immob...,Vendita,"Interrato (-2), 1",1,No,123 m² | commerciale 139 m²,3,2,Cucina abitabile,1,...,,,,,,,,,,
1,Appartamento | Intera proprietà | Classe immob...,Vendita - Scarica capitolato,1,4,Sì,115 m²,4,2,Cucina abitabile,2,...,,,,,,,,,,
2,Appartamento | Intera proprietà | Classe immob...,Vendita,"Piano terra, 2",5,Sì,90 m²,3,2,Cucina angolo cottura,1,...,,,,,,,,,,
