*ETL ---> Extract Transform Load*
ETL pipleline is a systematic process used in data engineering to collect, transform and load the data from various sources on the internet into a structured and usable format for analysis and storage

In [1]:
pip install beautifulsoup4 nltk



In [22]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
class WebScraper:
    def __init__(self, url):
        self.url = url
    def extract_text(self):
        response = requests.get(self.url)
        html_text = response.text
        soup = BeautifulSoup(html_text, 'html.parser')
        text = soup.get_text()
        return text

class TestProcessor:
    def __init__(self, nltk_stopwords):
      self.nltk_stopwords = nltk_stopwords
    def tokenize_text(self, text):
      words = text.split()
      filtered_words = [word.lower() for word in words if word.isalpha() and word.lower() not in self.nltk_stopwords]
      return filtered_words

class ETLPipeline:
   def __init__(self, url):
    self.url = url
    self.nltk_stopwords = set(stopwords.words('english'))

   def run(self):
    scraper = WebScraper(self.url)
    article_text = scraper.extract_text()
    test_processor = TestProcessor(self.nltk_stopwords)
    filtered_words = test_processor.tokenize_text(article_text)
    word_count = Counter(filtered_words)
    df = pd.DataFrame(word_count.items(), columns=['word', 'count'])
    df = df.sort_values(by='count', ascending=False)
    return df

if __name__ == '__main__':
    url = 'https://timesofindia.indiatimes.com/india'
    etl_pipeline = ETLPipeline(url)
    df = etl_pipeline.run()
    print(df)

            word  count
86          modi     10
0          india      9
85            pm      9
195           rs      8
10         kumbh      8
..           ...    ...
278       remark      1
277  proceedings      1
276    privilege      1
274   shivrajbjp      1
645  information      1

[646 rows x 2 columns]
