In [1]:
import os 
from pathlib import Path

In [2]:
# Move up to the project root so that relative paths such as
# config/config.yaml resolve. The guard keeps the cell idempotent: re-running
# it no longer climbs one directory higher each time.
if not Path("config/config.yaml").exists():
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\Asus\\vs_code\\Internship\\News_web_scraping'

In [4]:
import time
import requests
import datetime
from bs4 import BeautifulSoup

In [5]:
from pathlib import Path
from typing import Optional
from dataclasses import dataclass


@dataclass(frozen=True)
class WebScrapingConfig:
    """Immutable settings used by the web-scraping step."""
    # HTTP request headers as a name -> value mapping (not a single string);
    # the scraper hands this object to requests.get(headers=...).
    headers         : dict
    # Value of Web_scraping.extracted_path from the YAML config.
    extracted_path  : Path

In [6]:
import os
from dotenv import load_dotenv
from News_web_scraping import logger
from News_web_scraping.constants import *
from News_web_scraping.utils.common import *
from News_web_scraping.entity.config_entity import (ModelConfig,
                                                      WebScrapingConfig)



class ConfigurationManager:
    """Load the project's YAML files and build typed config objects from them.

    Args:
        config_filepath: Location of the main YAML config
            (defaults to CONFIG_FILE_PATH).
        params_filepath: Location of the parameters YAML
            (defaults to PARAMS_FILE_PATH).
    """

    def __init__(self,
                 config_filepath    = CONFIG_FILE_PATH,
                 params_filepath    = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)


    def get_model_config(self) -> ModelConfig:
        """Build the model settings from the params file and the environment.

        Returns:
            ModelConfig carrying MODEL_NAME and TEMPERATURE from the params
            file plus the GROQ_API_KEY environment variable. ``api_key`` is
            None when that variable is not set.
        """
        logger.info("Model config initialized")
        # Read a .env file (if any) so GROQ_API_KEY is visible to os.getenv.
        load_dotenv()
        return ModelConfig(Model_name   = self.params.MODEL_NAME,
                           temperature  = self.params.TEMPERATURE,
                           api_key      = os.getenv("GROQ_API_KEY"))


    def get_web_scraping_config(self) -> WebScrapingConfig:
        """Build the scraping settings from the main config file.

        Returns:
            WebScrapingConfig with the top-level ``headers`` entry and the
            ``Web_scraping.extracted_path`` entry converted to a Path.
        """
        scraping_section = self.config.Web_scraping

        logger.info("web scraping initialized")
        # The YAML value is coerced so the field matches its declared Path type.
        return WebScrapingConfig(headers         = self.config.headers,
                                 extracted_path  = Path(scraping_section.extracted_path))

In [7]:
# Quick check that the configuration loads: build the scraping config and
# display the request headers it carries.
manager             = ConfigurationManager()
web_scraping_config = manager.get_web_scraping_config() 
web_scraping_config.headers

[2024-10-04 13:32:03,123: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:32:03,123: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:32:03,125: INFO: 1393433061: web scraping initialized]


ConfigBox({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0', 'Accept-Encoding': 'gzip, deflate, br, zstd', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8', 'DNT': '1', 'Connection': 'close', 'Upgrade-Insecure-Requests': '1'})

In [8]:
from urllib.parse import urlparse

class WebPageScraper(ConfigurationManager):
    """Download a web page and pull the text out of its paragraph tags.

    Request headers are taken from the web-scraping config loaded by the
    parent ConfigurationManager.

    Args:
        URL: Address of the page to scrape.
        timeout: Seconds requests may wait on the server before raising a
            timeout error. Without a timeout, requests.get can block
            indefinitely.
    """

    def __init__(self, URL, timeout=10):
        super().__init__()
        self.URL            = URL
        self.timeout        = timeout
        self.soup           = None
        web_scraping_config = self.get_web_scraping_config()
        self.headers        = web_scraping_config.headers


    def fetch_and_parse(self):
        """Fetch self.URL and store the parsed document in self.soup.

        "http://" is prepended to the URL when it has no scheme.

        Raises:
            ValueError: requests rejected the URL (MissingSchema).
            TimeoutError: the request timed out.
            ConnectionError: the connection could not be established.
            RuntimeError: the server answered with an HTTP error status, or
                any other failure occurred.
        """
        parsed_url = urlparse(self.URL)
        if not parsed_url.scheme:
            self.URL = f"http://{self.URL}"

        # Drop any earlier result so a failed fetch cannot leave stale content.
        self.soup = None

        try:
            response    = requests.get(self.URL, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()  # Check for HTTP errors
            self.soup   = BeautifulSoup(response.text, "html.parser")

        except requests.exceptions.MissingSchema as err:
            logger.error(f"Invalid URL: {self.URL} - Missing schema.")
            raise ValueError(f"Invalid URL: {self.URL} - Please provide a valid URL.") from err

        # Timeout is tested before ConnectionError because ConnectTimeout
        # derives from both; the other order reports it as a connection error.
        except requests.exceptions.Timeout as err:
            logger.error(f"Timeout error while trying to reach {self.URL}.")
            raise TimeoutError(f"Request to {self.URL} timed out.") from err

        except requests.exceptions.ConnectionError as err:
            logger.error(f"Connection error while trying to reach {self.URL}.")
            raise ConnectionError(f"Failed to connect to {self.URL}.") from err

        except requests.exceptions.HTTPError as err:
            logger.error(f"HTTP error occurred: {err}")
            raise RuntimeError(f"HTTP error occurred: {err}") from err

        except Exception as e:
            logger.error(f"An unexpected error occurred: {e}")
            raise RuntimeError(f"An unexpected error occurred: {e}") from e

    def clean_and_extract_text(self):
        """Return the text of all <p> tags, joined with single spaces.

        Returns:
            None when nothing has been fetched yet, the literal string
            "No content found." when the page has no <p> tags, otherwise
            the joined paragraph text.
        """
        if not self.soup:
            return None

        # NOTE: this mutates self.soup. Removing <a> also removes link text
        # that sits inside paragraphs, so sentences may read with gaps.
        for unwanted in self.soup(["script", "style", "a"]):
            unwanted.extract()

        paragraphs = self.soup.find_all('p')
        if not paragraphs:
            logger.warning("No paragraph tags found in the response.")
            return "No content found."

        return ' '.join(para.get_text(separator=' ', strip=True) for para in paragraphs)

In [9]:
# Page to scrape. Alternative test pages are kept commented out below;
# uncomment exactly one assignment to switch.
#URL = "https://medium.com/@hamedkazemi/breaking-the-token-limits-a-journey-with-chatgpt-langchain-and-vectordb-embeddings-bonus-32352f97f5d4"
#URL = "https://indianexpress.com/article/explained/explained-global/why-chagos-islands-matter-why-uk-keeps-diego-garcia-base-9602682/?ref=newlist_hp"
URL = "https://www.gsmarena.com/detailed_specs_for_samsungs_galaxy_a16_4g_and_5g_emerge-news-64767.php"

In [10]:
# Scrape the selected page: download and parse it, then collect the text of
# its paragraph tags into one string.
scraper     = WebPageScraper(URL=URL)
scraper.fetch_and_parse()
page_text   = scraper.clean_and_extract_text()

[2024-10-04 13:32:03,172: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:32:03,172: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:32:03,172: INFO: 1393433061: web scraping initialized]


In [11]:
page_text

"Login 02 October 2024  Samsung is expected to announce its entry-level Galaxy A16 in 4G and 5G versions in and we now have detailed spec sheets for both. The latest round of leaks comes on the heels of the and leaks and confirms everything we can expect to see from the two upcoming A-series phones. Galaxy A16 5G key specs and render As their names suggest, and are nearly the exact same phone with the major difference being their chipsets. In Europe where the new listings are from, A16 5G will be powered by Samsung’s own Exynos 1330 SoC. Past suggest other regions like South East Asia will see the A16 5G with MediaTek’s Dimensity 6300 at the helm. The A15 4G on the other hand will bring MediaTek’s old Helio G99 just like its . Galaxy A16 4G key specs and render The rest of the specs for both Galaxy A16 variants are identical. A 6.7-inch Super AMOLED display (FHD+ 90Hz), 50MP main cam, joined by 5MP ultrawide and 2MP macro modules, and a 5,000 mAh battery with 25W charging. Both phones 

In [12]:
from News_web_scraping.pipeline.stage_02_prompt_and_chain import PromptAndChainPipeline

In [13]:
# Build the prompt pipeline. Despite the name `llm`, the object returned by
# main() is what the CombiningPipeline cell calls a chain; it is invoked
# with the page text in the next cell.
obj     = PromptAndChainPipeline()
llm     = obj.main()

[2024-10-04 13:32:04,715: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:32:04,718: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:32:04,718: INFO: common: created directory at: artifacts]
[2024-10-04 13:32:04,719: INFO: configuration: Model config initialized]
[2024-10-04 13:32:04,721: INFO: Model: Model setup initialized]
[2024-10-04 13:32:05,248: INFO: Model: model----llama3-8b-8192----created]
[2024-10-04 13:32:05,248: INFO: prompt_and_chain: Prompting and chain has started]


In [14]:
# Send the scraped text through the chain; the result is inspected further down.
response = llm.invoke(page_text)

[2024-10-04 13:32:05,948: INFO: _client: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"]


In [15]:
import requests
from langchain_core.exceptions import OutputParserException
from requests.exceptions import RequestException, MissingSchema, ConnectionError, Timeout, HTTPError

import json
from json.decoder import JSONDecodeError

In [16]:
class CombiningPipeline:
    """Scrape a web page and run its text through the prompt chain.

    Args:
        URL: Address of the page to process.
    """

    def __init__(self, URL):
        self.URL = URL

    @staticmethod
    def _failure(error, message, suggestion):
        """Build the error dictionary that main() returns when a stage fails."""
        return {"error": error, "message": message, "suggestion": suggestion}

    def main(self):
        """Run scraping and chain invocation end to end.

        Returns:
            The value returned by the chain's invoke() on success. On failure
            a dict with the keys "error", "message" and "suggestion" is
            returned; this method does not raise.
        """
        try:
            # Initialize the scraper and fetch the webpage content
            scraper = WebPageScraper(URL=self.URL)

            # fetch_and_parse() re-raises requests errors as ValueError,
            # ConnectionError, TimeoutError or RuntimeError. Handling them
            # here reports them as fetch failures instead of letting them
            # fall into the "no content" or "unexpected error" handlers.
            # OSError covers ConnectionError, TimeoutError and requests' own
            # exception classes.
            try:
                scraper.fetch_and_parse()
            except (ValueError, OSError, RuntimeError) as e:
                logger.error(f"Failed to fetch the webpage: {e}")
                return self._failure("Failed to fetch the webpage.",
                                     str(e),
                                     "Check the URL, internet connection, or server status.")

            page_text = scraper.clean_and_extract_text()

            # Handle the case where the content is 'No content found.'
            if page_text == "No content found.":
                logger.error("No content found on the webpage. Exiting process.")
                raise ValueError("The webpage did not contain any extractable content.")

            # Handle the case where extracted text is empty or only whitespace
            if not page_text or page_text.strip() == "":
                logger.error("The extracted content is empty or contains only whitespace.")
                raise OutputParserException("No data to extract from the provided webpage.")

            # Process the extracted text using the PromptAndChainPipeline
            prompt = PromptAndChainPipeline()
            chain = prompt.main()

            # Invoke the chain with the extracted page text
            return chain.invoke(page_text)

        # Handle output parsing errors if the output is invalid or empty
        except OutputParserException as e:
            logger.error(f"OutputParserException: {e}")
            return self._failure("Output parsing failed.",
                                 str(e),
                                 "Ensure the webpage contains valid content for extraction.")

        # Handle invalid JSON outputs during the parsing phase
        except JSONDecodeError as e:
            logger.error(f"JSONDecodeError: {e}")
            return self._failure("Invalid JSON output.",
                                 "The output from the model was not valid JSON.",
                                 "Check the model's output format and ensure it is correct.")

        # Kept for any requests error raised outside fetch_and_parse().
        except requests.exceptions.RequestException as e:
            logger.error(f"RequestException while fetching the webpage: {e}")
            return self._failure("Failed to fetch the webpage.",
                                 str(e),
                                 "Check the URL, internet connection, or server status.")

        # Handle cases where no content is found
        except ValueError as e:
            logger.error(f"ValueError: {e}")
            return self._failure("No content found.",
                                 str(e),
                                 "Ensure the webpage has relevant content.")

        # General exception handling for any unexpected errors
        except Exception as e:
            logger.error(f"An unexpected error occurred: {e}")
            return self._failure("An unexpected error occurred.",
                                 str(e),
                                 "Check the logs for more details or contact support.")


In [17]:
# Store the result under `response`, the name the following cells inspect
# (the misspelled `respone` left them looking at the earlier chain output).
response = CombiningPipeline(URL=URL).main()

[2024-10-04 13:32:06,086: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:32:06,088: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:32:06,089: INFO: 1393433061: web scraping initialized]
[2024-10-04 13:32:06,977: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:32:06,977: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:32:06,979: INFO: common: created directory at: artifacts]
[2024-10-04 13:32:06,979: INFO: configuration: Model config initialized]
[2024-10-04 13:32:06,982: INFO: Model: Model setup initialized]
[2024-10-04 13:32:07,352: INFO: Model: model----llama3-8b-8192----created]
[2024-10-04 13:32:07,352: INFO: prompt_and_chain: Prompting and chain has started]
[2024-10-04 13:32:08,021: INFO: _client: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"]


In [21]:
type(response)

dict

In [27]:
URL = "https://python.langchain.com/docs/integrations/chat/groq/"

In [28]:
# Run the full scrape-and-chain pipeline again for the URL selected above.
pipeline        = CombiningPipeline(URL=URL)
response        = pipeline.main()

[2024-10-04 13:35:19,657: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:35:19,661: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:35:19,661: INFO: 1393433061: web scraping initialized]
[2024-10-04 13:35:20,071: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-04 13:35:20,071: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-04 13:35:20,071: INFO: common: created directory at: artifacts]
[2024-10-04 13:35:20,076: INFO: configuration: Model config initialized]
[2024-10-04 13:35:20,076: INFO: Model: Model setup initialized]
[2024-10-04 13:35:20,431: INFO: Model: model----llama3-8b-8192----created]
[2024-10-04 13:35:20,432: INFO: prompt_and_chain: Prompting and chain has started]
[2024-10-04 13:35:20,973: INFO: _client: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"]


In [25]:
type(response)

dict

In [26]:
# The top-level keys are chosen by the model and differ per page and per run,
# so fall back to the whole response instead of raising KeyError when this
# particular key is absent.
response.get('Galaxy A16 Models', response)

{'Galaxy A16 Models': {'A16 5G': {'Chipset': 'Samsung Exynos 1330 (Europe) / MediaTek Dimensity 6300 (other regions)',
   'Display': '6.7-inch Super AMOLED (FHD+ 90Hz)',
   'Camera': '50MP main cam, 5MP ultrawide, 2MP macro',
   'Battery': '5,000 mAh with 25W charging',
   'RAM': '4GB',
   'Storage': '128GB (expandable via microSD)',
   'Android Updates': 'Up to 6 years',
   'Price': '€240 (Europe)'},
  'A16 4G': {'Chipset': 'MediaTek Helio G99',
   'Display': '6.7-inch Super AMOLED (FHD+ 90Hz)',
   'Camera': '50MP main cam, 5MP ultrawide, 2MP macro',
   'Battery': '5,000 mAh with 25W charging',
   'RAM': '4GB',
   'Storage': '128GB (expandable via microSD)',
   'Android Updates': 'Up to 6 years',
   'Price': '€210 (Europe)'}}}