In [1]:
pip install google-search-results

Note: you may need to restart the kernel to use updated packages.


In [1]:
from serpapi import GoogleSearch

In [2]:
import os
import re
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urlparse
from bs4 import BeautifulSoup

In [3]:
def fetch_content(url):
    try:
        headers = {
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()

        if 'application/pdf' in response.headers.get('Content-Type', ''):
            return response.content, 'pdf'
        
        else:
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator='\n')  # Separate text with newlines
            return text, 'html'

    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch content from {url}. Error: {e}")
        return None, None

In [4]:
def clean_html_and_extract_text(html_text):
    # Parse the HTML document using BeautifulSoup
    soup = BeautifulSoup(html_text, 'html.parser')

    # Extract text content without HTML tags
    text = soup.get_text()

    # Remove special characters and extra whitespaces
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters

    # Normalize the text (convert to lowercase, for example)
    text = text.lower()

    return text  

In [5]:
def text_summarise(content):
    url = "https://www.semrush.com/goodcontent/api/summary-generator/generate-summary/"
    payload = {
        "text": content,
        "format": "paragraph",
        "length_penalty": 0
    }
    
    response = requests.post(url, json=payload)

    if response.status_code == 200:
        response_data = response.json()
        return response_data["summary"]
    
    else:
        return 'None'

In [6]:
def serp(question):
    params = {
        "q": question,
        "hl": "en",
        "google_domain": "google.com",
        "api_key": "5188c793499d254054685e70d1ed8befe3386f3ac21e1a52af718c7ec80ccc89"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    return results

In [7]:
directory_path = r"C:\Users\pkanv\OneDrive\Desktop\Acuration\probation task-2"
file_path = os.path.join(directory_path,'sunflare.txt')

In [8]:
questions = ['What is the official website of Sunflare solar company?'
'What is the bio of Sunflare solar company?',
'Where is the headquarters location of Sunflare solar company?',
'What are the products and services of Sunflare solar company?',
'What is the unique selling point (USP) of Sunflare solar company?',
'What is the value proposition of Sunflare solar company?',
'What is the target market of Sunflare solar company?',
'What is the market size of Sunflare solar company?',
'What is the competitive landscape of Sunflare solar company?',
'What are the business models of Sunflare solar company?',
'What are the revenue streams of Sunflare solar company?',
'What is the pricing model of Sunflare solar company?',
'What is the share price of Sunflare solar company?', 
'What is the profit margin of Sunflare solar company?',
'What is the total user base of Sunflare solar company?',
'How many paying customers do of Sunflare solar have?',
'What are the social media platforms of Sunflare solar company?',
'How many followers on social media platforms of Sunflare solar company',
'What is the funding info of Sunflare solar company?',
'What is the vision and mission of Sunflare solar company?',
'What are the key capabilities of Sunflare solar company?',
'What is the marketing strategy of Sunflare solar company?',
'What is the business strategy of Sunflare solar company?',
'Does Sunflare solar company have any collaborations or partnerships?',
'What is the cash flow of Sunflare solar company?']

In [9]:
for question in questions:
    results = serp(question)
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(f"\n\n\n")
        file.write(f"{question}\n")
        #results=serp(question)
        if "answer_box" in results:    
            answer=results["answer_box"].get('answer')
            file.write(f"\t{answer}\n")
            title=results["answer_box"].get('title')
            file.write(f"\t{title}\n")
            snippet=results["answer_box"].get('snippet')
            file.write(f"\t{snippet}\n")
            url=results["answer_box"].get('link')
            content,type = fetch_content(url)
            text = clean_html_and_extract_text(content)
            summarised_text = text_summarise(text)
            if summarised_text is not None:
                lines = summarised_text.split("\n")
                for line in lines:
                    file.write("\t" + line + "\n")
        if "organic_results" in results:
            for i in results["organic_results"]:
                title=i.get('title')
                file.write(f"\t{title}\n")
                snippet=i.get('snippet')
                file.write(f"\t{snippet}\n")                    
                url=i.get('link')
                content,type = fetch_content(url)
                text = clean_html_and_extract_text(content)
                summarised_text = text_summarise(text)
                if summarised_text is not None:
                    lines = summarised_text.split("\n")
                    for line in lines:
                        file.write("\t" + line + "\n")
        if "related_questions" in results:
            for i in results['related_questions']:
                question=i.get('question')
                file.write(f"\t{question}\n")
                title=i.get('title')
                file.write(f"\t\t{title}\n")
                snippet=i.get('snippet')
                file.write(f"\t\t{snippet}\n")
                url=i.get('link')
                content,type = fetch_content(url)
                text = clean_html_and_extract_text(content)
                summarised_text = text_summarise(text)
                if summarised_text is not None:
                    lines = summarised_text.split("\n")
                    for line in lines:
                        file.write("\t\t" + line + "\n")

Failed to fetch content from https://www.spaceweatherlive.com/en/solar-activity/solar-flares.html. Error: 403 Client Error: Forbidden for url: https://www.spaceweatherlive.com/en/solar-activity/solar-flares.html


TypeError: object of type 'NoneType' has no len()

In [10]:
results

{'search_metadata': {'id': '6592a8b90b5062fcbc863a76',
  'status': 'Success',
  'json_endpoint': 'https://serpapi.com/searches/7255da26a47fe65d/6592a8b90b5062fcbc863a76.json',
  'created_at': '2024-01-01 11:57:45 UTC',
  'processed_at': '2024-01-01 11:57:45 UTC',
  'google_url': 'https://www.google.com/search?q=What+is+the+official+website+of+Sunflare+solar+company%3FWhat+is+the+bio+of+Sunflare+solar+company%3F&oq=What+is+the+official+website+of+Sunflare+solar+company%3FWhat+is+the+bio+of+Sunflare+solar+company%3F&hl=en&sourceid=chrome&ie=UTF-8',
  'raw_html_file': 'https://serpapi.com/searches/7255da26a47fe65d/6592a8b90b5062fcbc863a76.html',
  'total_time_taken': 0.83},
 'search_parameters': {'engine': 'google',
  'q': 'What is the official website of Sunflare solar company?What is the bio of Sunflare solar company?',
  'google_domain': 'google.com',
  'hl': 'en',
  'device': 'desktop'},
 'search_information': {'query_displayed': 'What is the official website of Sunflare solar company