In [4]:
import time
import pandas as pd
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
import os

In [5]:
# Configurer Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("start-maximized")

driver = webdriver.Chrome(options=options)

# URL cible
url = "https://www.mergermarket.com/deals/search/results?q=~%28sort~%28announcementDate~%27desc%29~pageSize~25~page~0~exitMultiple~%28~%27turnover%29~criteria~%28~%28entity~%27target~attribute~%27sector~operator~%27is~value~%28values~%28~%29~dominantOnly~false%29%29~%28entity~%27target~attribute~%27geography~operator~%27not~value~%28values~%28~%27asia~%27AUS~%27samerica%29~dominantOnly~false%29%29~%28entity~%27target~attribute~%27company~operator~%27is~value~%28values~%28~%29%29%29~%28entity~%27anyEntity~attribute~%27sector~operator~%27is~value~%28values~%28~%29~entities~%28target~true~bidder~true~vendor~true%29%29%29~%28entity~%27deal~attribute~%27announcementDate~operator~%27range~value~%28preset~%27all%29%29~%28entity~%27text~attribute~%27freeText~operator~%27is~value~%28operator~%27and~query~%27~fields~%28deal~true~target~true~bidder~true~vendor~true%29%29%29~%28entity~%27deal~attribute~%27value~operator~%27between~value~%28from~%27~to~%27~undisclosedValue~%27includeUndisclosed~currency~%27EUR%29%29%29%29"

# Fonction pour se connecter au site
def login():
    driver.get(url)
    time.sleep(5)  # Attendre que la page de connexion charge

    # Entrer l'email
    email_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "username"))
    )
    email_field.send_keys("namar@da-cf.fr")

    # Cliquer sur "Next"
    next_button = driver.find_element(By.ID, "submint-main-signin-from")
    next_button.click()

    time.sleep(2)  # Attendre la transition vers le champ de mot de passe

    # Entrer le mot de passe
    password_field = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "password"))
    )
    password_field.send_keys("Noeamar2209#")

    # Cliquer sur "Submit"
    password_field.submit()

    time.sleep(5)  # Attendre la redirection vers la page des résultats

# Fonction pour convertir une date en objet datetime
def convertir_date(date_str):
    try:
        return datetime.strptime(date_str, "%d/%m/%Y")
    except Exception as e:
        print(f"Erreur lors de la conversion de la date : {date_str} - {e}")
        return None

# Fonction pour initialiser le fichier CSV
def initialize_csv(file_path):
    if not os.path.exists(file_path):
        with open(file_path, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=[
                "Target", "Buyer", "Seller", "EUR Value (m)", "Announced", 
                "Completed", "Revenue", "EBITDA", "EBIT", "Earnings", "Summary"
            ])
            writer.writeheader()

# Fonction pour ajouter une ligne au fichier CSV
def append_to_csv(file_path, row_data):
    with open(file_path, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=[
            "Target", "Buyer", "Seller", "EUR Value (m)", "Announced", 
            "Completed", "Revenue", "EBITDA", "EBIT", "Earnings", "Summary"
        ])
        writer.writerow(row_data)

# Fonction pour scraper une page
def scrape_page(file_path):
    rows = driver.find_elements(By.CSS_SELECTOR, "tr[data-testid='deal-list-item']")
    for row in rows:
        # Extraire les informations principales
        target = row.find_element(By.CSS_SELECTOR, "td[data-testid='targetTd']").text.strip()
        buyer = row.find_element(By.CSS_SELECTOR, "td[data-testid='bidderTd']").text.strip()
        seller = row.find_element(By.CSS_SELECTOR, "td[data-testid='vendorTd']").text.strip()
        deal_value = row.find_element(By.CSS_SELECTOR, "td[data-testid='dealValueTd']").text.strip()
        announced = row.find_element(By.CSS_SELECTOR, "td[data-testid='announcementDateTd']").text.strip()
        completed = row.find_element(By.CSS_SELECTOR, "td[data-testid='completedDateTd']").text.strip()

        # Revenus, EBITDA, EBIT, Earnings
        multiples = row.find_elements(By.CSS_SELECTOR, "td.deal-value")
        revenue = multiples[0].text.strip() if len(multiples) > 0 else "N/A"
        ebitda = multiples[1].text.strip() if len(multiples) > 1 else "N/A"
        ebit = multiples[2].text.strip() if len(multiples) > 2 else "N/A"
        earnings = multiples[3].text.strip() if len(multiples) > 3 else "N/A"

        # Récupérer le contenu du bouton "Summary"
        try:
            summary_button = row.find_element(By.CSS_SELECTOR, "button[data-testid='expand-row-button']")
            summary_button.click()
            time.sleep(2)  # Attendre que le contenu s'affiche
            summary_container = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "tr.more-details div[data-testid='summary-container']"))
            )
            target_description = summary_container.find_element(By.CSS_SELECTOR, "div[data-testid='target-description-0']").text.strip()
            deal_description = summary_container.find_element(By.CSS_SELECTOR, "div[data-testid='summary-deal-description']").text.strip()
            summary_content = f"Target Description: {target_description} | Deal Description: {deal_description}"
        except Exception as e:
            print(f"Erreur lors de l'extraction du résumé : {e}")
            summary_content = "N/A"

        # Ajouter les données extraites
        row_data = {
            "Target": target,
            "Buyer": buyer,
            "Seller": seller,
            "EUR Value (m)": deal_value,
            "Announced": announced,
            "Completed": completed,
            "Revenue": revenue,
            "EBITDA": ebitda,
            "EBIT": ebit,
            "Earnings": earnings,
            "Summary": summary_content
        }

        append_to_csv(file_path, row_data)  # Ajouter la ligne directement au fichier
        print(f"Ligne ajoutée : {row_data}")  # Afficher chaque ligne ajoutée

# Fonction pour naviguer à la page suivante
def go_to_next_page():
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "li[data-test-change-to-next-page='true'] a"))
        )
        next_button.click()
        time.sleep(5)  # Attendre le chargement de la page suivante
        return True
    except Exception as e:
        print(f"Impossible d'aller à la page suivante : {e}")
        return False

# Fonction pour reprendre à partir du dernier deal récupéré
def resume_from_last_deal(file_path):
    if not os.path.exists(file_path):
        return None

    existing_data = pd.read_csv(file_path)
    if existing_data.empty:
        return None

    last_target = existing_data.iloc[-1]["Target"]
    last_date = existing_data.iloc[-1]["Announced"]

    while True:
        rows = driver.find_elements(By.CSS_SELECTOR, "tr[data-testid='deal-list-item']")
        for row in rows:
            target = row.find_element(By.CSS_SELECTOR, "td[data-testid='targetTd']").text.strip()
            announced = row.find_element(By.CSS_SELECTOR, "td[data-testid='announcementDateTd']").text.strip()

            if target == last_target and announced == last_date:
                print(f"Reprise trouvée : {target}, {announced}")
                return

        if not go_to_next_page():
            break

# Script principal
try:
    # Se connecter au site
    login()

    # Chemin du fichier CSV
    output_file = "mergermarket_deals.csv"
    initialize_csv(output_file)  # Créer le fichier CSV immédiatement

    # Reprendre le scraping à partir du dernier deal
    resume_from_last_deal(output_file)

    # Date de fin pour arrêter le scraping
    date_fin = convertir_date("01/01/2019")

    while True:
        # Scraper les données de la page actuelle
        scrape_page(output_file)

        # Vérifier si on doit arrêter le scraping
        rows = pd.read_csv(output_file)
        last_dates = [convertir_date(date) for date in rows["Announced"] if date != "N/A"]
        if last_dates and min(last_dates) <= date_fin:
            print("Condition d'arrêt atteinte : Date de fin atteinte.")
            break

        # Aller à la page suivante
        if not go_to_next_page():
            break

    print("Scraping terminé.")
finally:
    driver.quit()

Reprise trouvée : Mentimeter AB, 24/06/2024
Erreur lors de l'extraction du résumé : Message: element click intercepted: Element <button data-testid="expand-row-button" class="ExpandRowButton__StyledExpandRowButton-sc-1db4cc7-0 dpEoHU" type="button">...</button> is not clickable at point (1394, 7). Other element would receive the click: <th scope="col"></th>
  (Session info: chrome=131.0.6778.205)
Stacktrace:
	GetHandleVerifier [0x00007FF6F0D980D5+2992373]
	(No symbol) [0x00007FF6F0A2BFD0]
	(No symbol) [0x00007FF6F08C590A]
	(No symbol) [0x00007FF6F0920F2E]
	(No symbol) [0x00007FF6F091E9CC]
	(No symbol) [0x00007FF6F091BBA6]
	(No symbol) [0x00007FF6F091AB01]
	(No symbol) [0x00007FF6F090CD40]
	(No symbol) [0x00007FF6F093F36A]
	(No symbol) [0x00007FF6F090C596]
	(No symbol) [0x00007FF6F093F580]
	(No symbol) [0x00007FF6F095F584]
	(No symbol) [0x00007FF6F093F113]
	(No symbol) [0x00007FF6F090A918]
	(No symbol) [0x00007FF6F090BA81]
	GetHandleVerifier [0x00007FF6F0DF6A2D+3379789]
	GetHandleVerifi

KeyboardInterrupt: 

In [6]:
import pandas as pd

# Charger le fichier CSV
data_file = "C:\\Users\\namar\\Documents\\poc_RAG\\Projet_test\\RAG_MnA\\mergermarket_deals.csv"
data = pd.read_csv(data_file)


# Ajouter les noms de colonnes devant les valeurs pour chaque ligne
def prefix_columns(row):
    return ', '.join([f"{col}: {row[col]}" for col in row.index])

# Appliquer la transformation ligne par ligne
data['Formatted_Row'] = data.apply(prefix_columns, axis=1)

# Extraire uniquement les lignes formatées dans une nouvelle colonne
formatted_data = data[['Formatted_Row']]

# Sauvegarder le fichier formaté
output_file = "C:\\Users\\namar\\Documents\\poc_RAG\\Projet_test\\RAG_MnA\\Data\\deals_MergerMarket.csv"
formatted_data.to_csv(output_file, index=False, header=False, encoding='utf-8')

print(f"Fichier formaté sauvegardé sous : {output_file}")

Fichier formaté sauvegardé sous : C:\Users\namar\Documents\poc_RAG\Projet_test\RAG_MnA\Data\deals_MergerMarket.csv


In [7]:
# Chemin du fichier CSV
file_path = "C:\\Users\\namar\\Documents\\poc_RAG\\Projet_test\\RAG_MnA\\Data\\deals_MergerMarket.csv"

# Liste des noms de colonnes
columns = [
    "Target", "Buyer", "Seller", "EUR Value (m)", "Announced", "Completed",
    "Revenue", "EBITDA", "EBIT", "Earnings", "Summary"
]

# Charger le fichier CSV dans un DataFrame et ajouter les noms de colonnes
try:
    df = pd.read_csv(file_path, header=None, names=columns)
    print(f"Fichier chargé avec {len(df)} lignes.")
except Exception as e:
    print(f"Erreur lors du chargement du fichier : {e}")
    exit()

# Supprimer les doublons en fonction des colonnes "Target", "Buyer", et "Announced"
original_count = len(df)
df_deduplicated = df.drop_duplicates(subset=["Target", "Buyer", "Announced"], keep="first")
deduplicated_count = len(df_deduplicated)
duplicates_removed = original_count - deduplicated_count

# Enregistrer le fichier sans doublons
try:
    df_deduplicated.to_csv(file_path, index=False, encoding='utf-8')
    print(f"Doublons supprimés : {duplicates_removed}.")
    print(f"Le fichier final contient {deduplicated_count} lignes.")
except Exception as e:
    print(f"Erreur lors de la sauvegarde du fichier : {e}")

Fichier chargé avec 394 lignes.
Doublons supprimés : 0.
Le fichier final contient 394 lignes.
