In [25]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from random import uniform
from time import sleep
import pandas as pd
import sys
sys.path.append('../app')
from RandomFirefoxProfile import RandomFirefoxProfile

class GoogleSearchSelenium:
    
    def __init__(self, name: str = "Albert Einstein", headless: bool = True):
        self.BASE_URL = 'https://www.google.com/search?q='
        self.name = name
        self.headless = headless
        self.driver = None
    
    def get_driver(self):
        """Initialisiert den Firefox WebDriver."""
        if self.driver is not None:
            return self.driver
        
        options = webdriver.FirefoxOptions()
        if self.headless:
            options.add_argument("--headless")

        # Docker-spezifische Optionen
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        profile = RandomFirefoxProfile.create()
        options.profile = profile

        try:
            service = Service(executable_path="/usr/local/bin/geckodriver")
            self.driver = webdriver.Firefox(options=options, service=service)
            self.random_sleep(1, 2)  # Pause nach Driver-Start
        except Exception as e:
            print(f"‚ö†Ô∏è Fehler beim Starten des WebDrivers mit Service: {e}")
            try:
                self.driver = webdriver.Firefox(options=options)
                self.random_sleep(1, 2)  # Pause nach Driver-Start
            except Exception as e:
                print(f"‚ö†Ô∏è Fehler beim Starten des WebDrivers: {e}")

        return self.driver
    
    def close_driver(self):
        """Beendet den WebDriver sicher."""
        if self.driver is not None:
            try:
                self.random_sleep(0.5, 1)  # Kurze Pause vor dem Schlie√üen
                self.driver.quit()
                self.driver = None
            except Exception as e:
                print(f"‚ö†Ô∏è Fehler beim Schlie√üen: {e}")
            finally:
                self.driver = None
    
    def random_sleep(self, min_seconds=1.0, max_seconds=3.0):
        """Zuf√§llige Wartezeit."""
        sleep_time = uniform(min_seconds, max_seconds)
        sleep(sleep_time)
        return sleep_time
    
    def access_denied_check(self):
        """Pr√ºft auf Access Denied oder CAPTCHA."""
        try:
            self.random_sleep(0.5, 1)  # Kurze Pause vor dem Check
            page_html = self.driver.page_source if self.driver else ""
            if "<h1>Access denied</h1>" in page_html or "captcha" in page_html.lower():
                print("‚ö†Ô∏è Access denied oder CAPTCHA detected.")
                return True
            return False
        except Exception as e:
            print(f"‚ö†Ô∏è Fehler bei Access Check: {e}")
            return False
    
    def accept_cookies(self):
        """Akzeptiert Cookie-Banner."""
        try:
            self.random_sleep(1, 2)
            wait = WebDriverWait(self.driver, 5) if self.driver else None
            
            # Verschiedene Cookie-Button-Selektoren
            cookie_selectors = [
                (By.XPATH, "//button[contains(., 'Accept') or contains(., 'Akzeptieren') or contains(., 'Alle akzeptieren')]"),
                (By.ID, "L2AGLb"),  # Google's "Alle akzeptieren" Button
                (By.XPATH, "//button[@aria-label='Accept all']")
            ]
            
            for by, selector in cookie_selectors:
                try:
                    cookie_button = wait.until(
                        EC.element_to_be_clickable((by, selector))
                    ) if wait else None
                    if cookie_button:
                        cookie_button.click()
                        self.random_sleep(1, 2)  # Pause nach Cookie-Accept
                        print("‚úÖ Cookie-Banner akzeptiert")
                        return
                except:
                    continue
                    
        except Exception as e:
            print(f"‚ÑπÔ∏è Kein Cookie-Banner gefunden (bereits akzeptiert?)")
    
    def extract_search_links(self, num_links=3):
        """Extrahiert Links von der Google-Suchergebnisseite."""
        links = []
        
        try:
            # Warte auf Suchergebnisse
            self.random_sleep(1, 2)  # Pause vor der Suche
            wait = WebDriverWait(self.driver, 20)
            wait.until(EC.presence_of_element_located((By.ID, "search")))
            
            self.random_sleep(0.5, 1.5)  # Pause nach dem Laden der Ergebnisse
            
            # Verschiedene Selektoren f√ºr Suchergebnisse
            selectors = [
                (By.CSS_SELECTOR, "div.g a[href]"),
                (By.CSS_SELECTOR, "div#search a[href]"),
                (By.XPATH, "//div[@id='search']//a[@href]")
            ]
            
            search_results = []
            for by, selector in selectors:
                try:
                    search_results = self.driver.find_elements(by, selector)
                    if search_results:
                        print(f"‚úÖ Suchergebnisse gefunden mit: {selector[:50]}...")
                        break
                except:
                    continue
            
            for result in search_results:
                try:
                    href = result.get_attribute('href')
                    if href and href.startswith('http') and 'google.com' not in href and 'youtube.com' not in href:
                        links.append(href)
                        self.random_sleep(0.2, 0.5)  # Kurze Pause zwischen Link-Extraktionen
                        if len(links) >= num_links:
                            break
                except:
                    continue
            
            print(f"‚úÖ {len(links)} Links extrahiert")
            
        except Exception as e:
            print(f"‚ùå Fehler beim Extrahieren der Links: {e}")
        
        return links
    
    def extract_text_from_url(self, url):
        """Extrahiert Text von einer URL."""
        try:
            print(f"üåê √ñffne: {url}")
            self.driver.get(url)
            self.random_sleep(2, 4)  # Pause nach dem Laden der Seite
            
            # Warte auf Body-Element
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            
            self.random_sleep(1, 2)  # Zus√§tzliche Pause f√ºr dynamische Inhalte
            
            # Extrahiere Text
            body = self.driver.find_element(By.TAG_NAME, "body")
            text = body.text
            
            # Bereinige Text
            text = ' '.join(text.split())
            
            print(f"‚úÖ {len(text)} Zeichen extrahiert")
            return text
            
        except Exception as e:
            error_msg = f"Fehler beim Abrufen von {url}: {str(e)}"
            print(f"‚ùå {error_msg}")
            return error_msg
    
    def search_and_extract(self, num_links=3):
        """Hauptfunktion: Sucht und extrahiert Text von den ersten Links."""
        print(f"üîç Suche nach: {self.name}")
        
        # 1. Driver initialisieren
        self.get_driver()
        
        try:
            # 2. Google-Suche √∂ffnen
            search_url = f"{self.BASE_URL}{self.name}"
            print(f"üåê √ñffne: {search_url}")
            self.driver.get(search_url)
            self.random_sleep(2, 4)  # Pause nach dem Laden der Suchseite
            
            # 3. Cookie-Banner akzeptieren
            self.accept_cookies()
            self.random_sleep(1, 2)  # Pause nach Cookie-Accept
            
            # 4. Access Denied pr√ºfen
            if self.access_denied_check():
                self.close_driver()
                sleep_time = uniform(15, 20)
                print(f"‚ö†Ô∏è Zugriff verweigert. Warte {round(sleep_time, 1)} Sekunden...")
                sleep(sleep_time)
                return self.search_and_extract(num_links)  # Rekursiver Aufruf
            
            # 5. Links extrahieren
            links = self.extract_search_links(num_links)
            
            if not links:
                print("‚ùå Keine Links gefunden!")
                return []
            
            print(f"\n{'='*80}")
            print(f"Gefundene Links ({len(links)}):")
            for i, link in enumerate(links, 1):
                print(f"{i}. {link}")
            print('='*80)
            
            self.random_sleep(1, 2)  # Pause vor dem Extrahieren der Texte
            
            # 6. Text von jedem Link extrahieren
            results = []
            for i, link in enumerate(links, 1):
                print(f"\n{'='*80}")
                print(f"Link {i}/{len(links)}")
                print('='*80)
                
                text = self.extract_text_from_url(link)
                
                # Vorschau
                preview = text[:500] + "..." if len(text) > 500 else text
                print(f"\nVorschau:\n{preview}\n")
                
                results.append({
                    'url': link,
                    'text': text,
                    'text_length': len(text)
                })
                
                # Pause zwischen Anfragen
                if i < len(links):
                    self.random_sleep(3, 6)  # L√§ngere Pause zwischen verschiedenen Seiten
            
            return results
            
        except Exception as e:
            print(f"‚ùå Fehler beim Durchsuchen: {e}")
            import traceback
            traceback.print_exc()
            return []
        
        finally:
            # 7. Driver schlie√üen (immer!)
            self.close_driver()


# Beispielnutzung
person_name = "'Prof Dominique P. Pioletti'"
google_search = GoogleSearchSelenium(name=person_name, headless=False)
results = google_search.search_and_extract(num_links=3)

# Ergebnisse in DataFrame speichern
if results:
    df_results = pd.DataFrame(results)
    print("\n" + "="*80)
    print("Zusammenfassung:")
    print(df_results[['url', 'text_length']])
    display(df_results)
else:
    print("Keine Ergebnisse zum Anzeigen.")


üîç Suche nach: 'Prof Dominique P. Pioletti'
üé≤ Zuf√§lliges Profil erstellt:
   User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Ap...
   Sprache: fr-FR
   Aufl√∂sung: 1536x864
‚ö†Ô∏è Fehler beim Starten des WebDrivers mit Service: Message: Unable to obtain driver for firefox; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location

üåê √ñffne: https://www.google.com/search?q='Prof Dominique P. Pioletti'
üåê √ñffne: https://www.google.com/search?q='Prof Dominique P. Pioletti'
‚ö†Ô∏è Access denied oder CAPTCHA detected.
‚ö†Ô∏è Access denied oder CAPTCHA detected.
‚ö†Ô∏è Zugriff verweigert. Warte 16.4 Sekunden...
‚ö†Ô∏è Zugriff verweigert. Warte 16.4 Sekunden...


KeyboardInterrupt: 

In [19]:
texts = [res['text'] for res in results if 'Fehler' not in res['text']]
texts

["Please review our privacy policy and cookie usage before continuing and agree using the button provided. Read our policy now Agree People Contact us Search Excellence in plant and microbial science Research & Impact Publications Careers & Study News & Events Support Us About Us HOME PEOPLE PROFESSOR CRIST√ìBAL UAUY Professor Crist√≥bal Uauy Director Delivering Sustainable Wheat (DSW), Building Robustness in Crops (BRiC) The Uauy lab is focused on using genetics and genomics to improve both yield and quality components in wheat. Their research comes under three main areas; Increasing yields Improving crop quality Genomic enabled technologies The lab uses molecular genetic approaches to identify genes involved in wheat productivity traits and enhance the translation of this knowledge into improved varieties for industry and consumers. They aim to understand the mechanisms by which these genes function in order to develop the most rational strategies to deploy these genes into commercia

In [17]:
import ollama



prompt = f"""
You are an expert text extractor and data structurer.

Your task:
Extract the main research interests from the following texts about a researcher.
Write the research fields shortly and concisely in sentences.

Texts:
{texts}

Negative Promt:
Do not make Bullet points or lists.
"""

response = ollama.chat(
    model="deepseek-r1:1.5b",
    messages=[{"role": "user", "content": prompt}],
    stream=True,
)

# Antwort puffern und gleichzeitig anzeigen
response_text = ""
for chunk in response:
    content = chunk.get("message", {}).get("content", "")
    print(content, end="", flush=True)
    response_text += content

# Jetzt hast du alles gesammelt
print("\n\n‚úÖ Streaming beendet.")

### Final Final Answer Answer:: ** **QuestionsQuestions and and Answers Answers Based Based on on the the Original Original Query Query**

**

########  11.. ** **WhereWhere is is the the first first evidence evidence for for the the use use of of gold gold??****  
  
     - - ** **AnswerAnswer**:**: The The first first evidence evidence for for using using gold gold to to make make stone stone tools tools in in Africa Africa came came from from the the U U.S.S.. educational educational system system in in  22001144,, where where students students built built tools tools with with gold gold stones stones and and were were recognized recognized as as winners winners of of an an award award..  
  
     - - ** **KeyKey Reference Reference**:**: * *TheThe U U.S.S.. Students Students Built Built Tools Tools with with Gold Gold** by by An Analeale Tab Tabmermer et et al al.. ( (22001144).

).

########  22.. ** **WhatWhat are are the the best best quality quality slide slide scanners scanner