 **Scraping des donn√©es**

In [1]:
pip install requests beautifulsoup4



In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

base_url = "http://www.tunisie-annonce.com/AnnoncesImmobilier.asp"
annonces_data = []
MAX_PAGES = 5

# Fonction pour extraire les d√©tails depuis la page de l'annonce
def extraire_details(detail_url):
    try:
        response = requests.get(detail_url)
        response.encoding = "ISO-8859-1"
        soup = BeautifulSoup(response.text, "html.parser")

        cat, surface, description = "", "", ""
        date_insert, date_modif, localisation = "", "", ""

        # Cat√©gorie
        cat_td = soup.find("td", string="Cat√©gorie")
        if cat_td and cat_td.find_next_sibling("td"):
            cat = cat_td.find_next_sibling("td").get_text(strip=True)

        # Surface
        surface_td = soup.find("td", string="Surface")
        if surface_td and surface_td.find_next_sibling("td"):
            surface = surface_td.find_next_sibling("td").get_text(strip=True)

        # Description
        desc_td = soup.find("td", string="Texte")
        if desc_td and desc_td.find_next_sibling("td"):
            description = desc_td.find_next_sibling("td").get_text(strip=True)

        # Date ins√©r√©e
        insert_td = soup.find("td", string="Ins√©r√©e le")
        if insert_td and insert_td.find_next_sibling("td"):
            date_insert = insert_td.find_next_sibling("td").get_text(strip=True)

        # Date modifi√©e
        modif_td = soup.find("td", string="Modifi√©e le")
        if modif_td and modif_td.find_next_sibling("td"):
            date_modif = modif_td.find_next_sibling("td").get_text(strip=True)

        # Localisation
        localisation_td = soup.find("td", string="Localisation")
        if localisation_td and localisation_td.find_next_sibling("td"):
            localisation = localisation_td.find_next_sibling("td").get_text(" > ", strip=True)

        return cat, surface, description, date_insert, date_modif, localisation

    except Exception as e:
        print(f"‚ùå Erreur en acc√©dant √† {detail_url}")
        return "", "", "", "", "", ""

# Scraping avec pagination
page = 1
while page <= MAX_PAGES:
    print(f"üîé Scraping page {page}...")

    params = {"num": page}
    response = requests.get(base_url, params=params)
    response.encoding = "ISO-8859-1"

    if response.status_code != 200:
        print("‚ùå Erreur de chargement de la page.")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    annonces = soup.find_all("tr", class_="Tableau1")

    if not annonces:
        print("‚úÖ Fin des annonces.")
        break

    for annonce in annonces:
        cols = annonce.find_all("td")
        if len(cols) >= 12:
            region = cols[1].get_text(strip=True)
            nature = cols[3].get_text(strip=True)
            type_annonce = cols[5].get_text(strip=True)
            texte_annonce = cols[7].get_text(strip=True)  # Champ visible
            prix = cols[9].get_text(strip=True)

            # Nettoyage du prix
            prix = re.sub(r"Dinar Tunisien.*", "", prix).replace("\xa0", " ").strip()

            # Lien vers page de d√©tails
            link_tag = cols[7].find("a")
            detail_url = ""
            if link_tag and "href" in link_tag.attrs:
                detail_url = "http://www.tunisie-annonce.com/" + link_tag["href"]

                # Extraire les d√©tails
                cat, surface, desc, date_insert, date_modif, localisation = extraire_details(detail_url)

                # S√©parer la localisation : Pays / Ville uniquement
                pays, ville = "", ""
                if localisation:
                    parts = localisation.split(" > ")
                    if len(parts) >= 1:
                        pays = parts[0]
                    if len(parts) >= 3:
                        ville = parts[2]  # On saute gouvernorat

                # Stocker les donn√©es
                annonces_data.append({
                    "Lien": detail_url,
                    "Cat√©gorie": cat,
                    "Surface": surface,
                    "Texte complet": desc,
                    "Texte annonce": texte_annonce,
                    "Ins√©r√©e le": date_insert,
                    "Modifi√©e le": date_modif,
                    "Pays": pays,
                    "Ville": ville,
                    "R√©gion": region,
                    "Nature": nature,
                    "Type": type_annonce,
                    "Prix (TND)": prix
                })

    page += 1

# Export en Excel
df = pd.DataFrame(annonces_data)
df.to_excel("annoncesv2.xlsx", index=False, engine="openpyxl")
print(f"‚úÖ {len(df)} annonces sauvegard√©es dans 'annoncesv2.xlsx'")


üîé Scraping page 1...
üîé Scraping page 2...
üîé Scraping page 3...
üîé Scraping page 4...
üîé Scraping page 5...
‚úÖ 125 annonces sauvegard√©es dans 'annoncesv2.xlsx'


**Stockage des donn√©es dans SQLite**

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import sqlite3

base_url = "http://www.tunisie-annonce.com/AnnoncesImmobilier.asp"
MAX_PAGES = 5

# Connexion √† SQLite
conn = sqlite3.connect("annonces.db")
cursor = conn.cursor()

# Cr√©ation de la table sans la colonne 'gouvernorat'
cursor.execute("""
    CREATE TABLE IF NOT EXISTS annonces (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        lien TEXT,
        categorie TEXT,
        surface TEXT,
        texte_complet TEXT,
        texte_annonce TEXT,
        inseree_le TEXT,
        modifiee_le TEXT,
        pays TEXT,
        ville TEXT,
        region TEXT,
        nature TEXT,
        type TEXT,
        prix_tnd TEXT
    )
""")
conn.commit()

def extraire_details(detail_url):
    try:
        response = requests.get(detail_url)
        response.encoding = "ISO-8859-1"
        soup = BeautifulSoup(response.text, "html.parser")

        cat, surface, description = "", "", ""
        date_insert, date_modif, localisation = "", "", ""

        # Cat√©gorie
        cat_td = soup.find("td", string="Cat√©gorie")
        if cat_td and cat_td.find_next_sibling("td"):
            cat = cat_td.find_next_sibling("td").get_text(strip=True)

        # Surface
        surface_td = soup.find("td", string="Surface")
        if surface_td and surface_td.find_next_sibling("td"):
            surface = surface_td.find_next_sibling("td").get_text(strip=True)

        # Description
        desc_td = soup.find("td", string="Texte")
        if desc_td and desc_td.find_next_sibling("td"):
            description = desc_td.find_next_sibling("td").get_text(strip=True)

        # Dates
        insert_td = soup.find("td", string="Ins√©r√©e le")
        if insert_td and insert_td.find_next_sibling("td"):
            date_insert = insert_td.find_next_sibling("td").get_text(strip=True)

        modif_td = soup.find("td", string="Modifi√©e le")
        if modif_td and modif_td.find_next_sibling("td"):
            date_modif = modif_td.find_next_sibling("td").get_text(strip=True)

        # Localisation
        localisation_td = soup.find("td", string="Localisation")
        if localisation_td and localisation_td.find_next_sibling("td"):
            localisation = localisation_td.find_next_sibling("td").get_text(" > ", strip=True)

        return cat, surface, description, date_insert, date_modif, localisation

    except Exception as e:
        print(f"‚ùå Erreur en acc√©dant √† {detail_url}")
        return "", "", "", "", "", ""

# Scraping
page = 1
while page <= MAX_PAGES:
    print(f"üîé Scraping page {page}...")

    params = {"num": page}
    response = requests.get(base_url, params=params)
    response.encoding = "ISO-8859-1"

    if response.status_code != 200:
        print("‚ùå Erreur de chargement de la page.")
        break

    soup = BeautifulSoup(response.text, "html.parser")
    annonces = soup.find_all("tr", class_="Tableau1")

    if not annonces:
        break

    for annonce in annonces:
        cols = annonce.find_all("td")
        if len(cols) >= 12:
            region = cols[1].get_text(strip=True)
            nature = cols[3].get_text(strip=True)
            type_annonce = cols[5].get_text(strip=True)
            texte_annonce = cols[7].get_text(strip=True)
            prix = cols[9].get_text(strip=True)

            # Nettoyage du prix
            prix = re.sub(r"Dinar Tunisien.*", "", prix).replace("\xa0", " ").strip()

            # Lien vers page de d√©tails
            link_tag = cols[7].find("a")
            if link_tag and "href" in link_tag.attrs:
                detail_url = "http://www.tunisie-annonce.com/" + link_tag["href"]

                cat, surface, desc, date_insert, date_modif, localisation = extraire_details(detail_url)

                # Localisation : on garde seulement pays et ville
                pays, ville = "", ""
                if localisation:
                    parts = localisation.split(" > ")
                    if len(parts) >= 1:
                        pays = parts[0]
                    if len(parts) >= 3:
                        ville = parts[2]

                # Insertion dans SQLite
                cursor.execute("""
                    INSERT INTO annonces (
                        lien, categorie, surface, texte_complet, texte_annonce,
                        inseree_le, modifiee_le, pays, ville,
                        region, nature, type, prix_tnd
                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (
                    detail_url, cat, surface, desc, texte_annonce,
                    date_insert, date_modif, pays, ville,
                    region, nature, type_annonce, prix
                ))

    page += 1

# Sauvegarder
conn.commit()
conn.close()

print("‚úÖ Donn√©es enregistr√©es dans la base SQLite 'annonces.db'")


üîé Scraping page 1...
üîé Scraping page 2...
üîé Scraping page 3...
üîé Scraping page 4...
üîé Scraping page 5...
‚úÖ Donn√©es enregistr√©es dans la base SQLite 'annonces.db'


In [7]:
import sqlite3
pd.read_sql("SELECT * FROM annonces", sqlite3.connect("/content/annonces.db"))


Unnamed: 0,id,lien,categorie,surface,texte_complet,texte_annonce,inseree_le,modifiee_le,pays,ville,region,nature,type,prix_tnd
0,1,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Terrain>Terrain nu,340¬†m¬≤,ÿßÿ±ÿ∂ ÿµÿßŸÑÿ≠ÿ© ŸÑŸÑÿ®ŸÜÿßÿ° ŸÖÿ≥ÿßÿ≠ÿ© 340 ŸÖÿ™ÿ± ŸÖÿ±ÿ®ÿπ ŸÅŸâ ŸÖŸÜŸàÿ®ÿ© ŸÖ...,Terrain nu manouba,10/12/2024,22/03/2025,Tunisie,Manouba,La Mannouba,Terrain,Terrain nu,221 000
1,2,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Terrain>Terrain nu,1 000¬†m¬≤,ÿßÿ±ÿ∂ ŸÑŸÑÿ®Ÿäÿπ ŸÖÿ≥ÿßÿ≠ÿ© 1000 ŸÖÿ™ÿ± ŸÖÿ±ÿ®ÿπ ŸÅŸâ ÿ¥ÿ±ŸÅÿ¥ ŸÖÿπÿ™ŸÖÿØŸäÿ© ...,Terrain nu chorfech,20/04/2024,22/03/2025,Tunisie,Ariana,Chorfech,Terrain,Terrain nu,150 000
2,3,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Terrain>Terrain nu,1 080¬†m¬≤,ÿßÿ±ÿ∂ ŸÑŸÑÿ®Ÿäÿπ ŸÅŸâ ÿ¨ÿ®ÿßÿ≥ ŸÖÿπÿ™ŸÖÿØŸäÿ© ÿ≥ŸäÿØŸâ ÿ´ÿßÿ®ÿ™ ŸàŸÑÿßŸäÿ© ÿßÿ±Ÿäÿß...,Jabbes terrain nu,18/10/2022,22/03/2025,Tunisie,Ariana,Jabbes,Terrain,Terrain nu,248 400
3,4,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Terrain>Terrain nu,322¬†m¬≤,je dix bien constructible aussi zone vil...,Zone villas ter 321m2 cons,22/07/2024,22/03/2025,Tunisie,Ariana,Chotrana 1,Terrain,Terrain nu,240 000
4,5,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Terrain>Terrain nu,776¬†m¬≤,zone acceptable (( pas de zone populair...,Aux choix 776m2 507m2,22/07/2024,22/03/2025,Tunisie,Ariana,Ariana,Terrain,Terrain nu,271 000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,121,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Location>Maisons,1¬†m¬≤,a louer une villa s4 moderne situ√©e √† la soukr...,Villa s4 avec piscine √†,20/03/2025,22/03/2025,Tunisie,Ariana,Chotrana 3,Location,Maisons,7 000
121,122,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Location>Appart. 1 pi√®ce,70¬†m¬≤,un s+1 totalement meubl√© a louer a soukra dan...,S1 meubl√© la soukra,02/04/2021,22/03/2025,Tunisie,Ariana,La Soukra,Location,App. 1 pi√®c,600
122,123,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Location>Maisons,120¬†m¬≤,un luxueux s+2 enti√®rement meubl√© a la soukra...,S2 meubl√© a soukra 90 d,02/04/2021,22/03/2025,Tunisie,Ariana,La Soukra,Location,Maisons,600
123,124,http://www.tunisie-annonce.com/DetailsAnnonceI...,Offres>Location>Maisons,70¬†m¬≤,un s+1 enti√®rement meubl√© a louer a soukra da...,S1 meubl√© a soukra,24/06/2021,22/03/2025,Tunisie,Ariana,La Soukra,Location,Maisons,550


**D√©veloppement de l‚ÄôAPI REST**

1-Installation des d√©pendances

In [14]:
!pip install fastapi uvicorn





In [10]:
 !pip install nest_asyncio pyngrok



In [11]:
 !pip install  openpyxl bs4



2-Impl√©mentation de l‚ÄôAPI avec FastAPI

In [12]:
!ngrok config add-authtoken 2ub8U4p67DYslVunez1y39LwG3G_679fw6hYuxbWNQZy3QTbW

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


=> API REST op√©rationnelle permetant d‚Äôextraire et d‚Äôacc√©der aux donn√©es sur POSTMAN.  

In [22]:
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import re

app = FastAPI()
nest_asyncio.apply()

annonces_data = []
MAX_PAGES = 2

def extraire_details(detail_url):
    try:
        response = requests.get(detail_url)
        response.encoding = "ISO-8859-1"
        soup = BeautifulSoup(response.text, "html.parser")

        cat, surface, description = "", "", ""
        date_insert, date_modif, localisation = "", "", ""

        cat_td = soup.find("td", string="Cat√©gorie")
        if cat_td and cat_td.find_next_sibling("td"):
            cat = cat_td.find_next_sibling("td").get_text(strip=True)

        surface_td = soup.find("td", string="Surface")
        if surface_td and surface_td.find_next_sibling("td"):
            surface = surface_td.find_next_sibling("td").get_text(strip=True)

        desc_td = soup.find("td", string="Texte")
        if desc_td and desc_td.find_next_sibling("td"):
            description = desc_td.find_next_sibling("td").get_text(strip=True)

        insert_td = soup.find("td", string="Ins√©r√©e le")
        if insert_td and insert_td.find_next_sibling("td"):
            date_insert = insert_td.find_next_sibling("td").get_text(strip=True)

        modif_td = soup.find("td", string="Modifi√©e le")
        if modif_td and modif_td.find_next_sibling("td"):
            date_modif = modif_td.find_next_sibling("td").get_text(strip=True)

        localisation_td = soup.find("td", string="Localisation")
        if localisation_td and localisation_td.find_next_sibling("td"):
            localisation = localisation_td.find_next_sibling("td").get_text(" > ", strip=True)

        return cat, surface, description, date_insert, date_modif, localisation

    except Exception as e:
        print(f"‚ùå Erreur dans {detail_url}")
        return "", "", "", "", "", ""

# Fonction de scraping
def scraper():
    global annonces_data
    annonces_data.clear()
    base_url = "http://www.tunisie-annonce.com/AnnoncesImmobilier.asp"

    page = 1
    while page <= MAX_PAGES:
        print(f"üîé Scraping page {page}...")

        params = {"num": page}
        response = requests.get(base_url, params=params)
        response.encoding = "ISO-8859-1"

        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.text, "html.parser")
        annonces = soup.find_all("tr", class_="Tableau1")

        if not annonces:
            break

        for annonce in annonces:
            cols = annonce.find_all("td")
            if len(cols) >= 12:
                region = cols[1].get_text(strip=True)
                nature = cols[3].get_text(strip=True)
                type_annonce = cols[5].get_text(strip=True)
                texte_annonce = cols[7].get_text(strip=True)
                prix = cols[9].get_text(strip=True)

                prix = re.sub(r"Dinar Tunisien.*", "", prix).replace("\xa0", " ").strip()

                link_tag = cols[7].find("a")
                detail_url = ""
                if link_tag and "href" in link_tag.attrs:
                    detail_url = "http://www.tunisie-annonce.com/" + link_tag["href"]

                    cat, surface, desc, date_insert, date_modif, localisation = extraire_details(detail_url)

                    pays, ville = "", ""
                    if localisation:
                        parts = localisation.split(" > ")
                        if len(parts) >= 1:
                            pays = parts[0]
                        if len(parts) >= 3:
                            ville = parts[2]

                    annonces_data.append({
                        "Lien": detail_url,
                        "Cat√©gorie": cat,
                        "Surface": surface,
                        "Texte complet": desc,
                        "Texte annonce": texte_annonce,
                        "Ins√©r√©e le": date_insert,
                        "Modifi√©e le": date_modif,
                        "Pays": pays,
                        "Ville": ville,
                        "R√©gion": region,
                        "Nature": nature,
                        "Type": type_annonce,
                        "Prix (TND)": prix
                    })

        page += 1

    return annonces_data

# Endpoint GET
@app.get("/annonces")
def get_annonces():
    return {"total": len(annonces_data), "data": annonces_data}

# Endpoint POST
@app.post("/scrape")
def run_scraper():
    scraper()
    return {"message": "Scraping termin√©", "total": len(annonces_data)}

# Ngrok + Serveur
port = 8000
public_url = ngrok.connect(port)
print(f"üöÄ API accessible ici : {public_url}/docs")

uvicorn.run(app, host="0.0.0.0", port=port)


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-28' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    s

üöÄ API accessible ici : NgrokTunnel: "https://f949-35-201-170-126.ngrok-free.app" -> "http://localhost:8000"/docs


INFO:     Started server process [275]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     102.157.126.242:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     102.157.126.242:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     102.157.126.242:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     102.157.126.242:0 - "GET /openapi.json HTTP/1.1" 200 OK
üîé Scraping page 1...


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    se

üîé Scraping page 2...
INFO:     102.157.126.242:0 - "POST /scrape HTTP/1.1" 200 OK
INFO:     102.157.126.242:0 - "GET /annonces HTTP/1.1" 200 OK
üîé Scraping page 1...
üîé Scraping page 2...
INFO:     102.157.126.242:0 - "POST /scrape HTTP/1.1" 200 OK
INFO:     102.157.126.242:0 - "GET /annonces HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [275]
