### Building a Web Data ETL Pipeline to collect and transform data from an online source into a usable format for analysis and storage.

In [5]:
# Imports
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AcerUser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
# Extract text from a web article
class WebScraper:
    def __init__(self, url):
        self.url = url

    def extract_article_text(self):
        response = requests.get(self.url)
        html_content = response.content
        soup = BeautifulSoup(html_content, 'html.parser')
        article_text = soup.get_text()
        return article_text

In [9]:
# Clean and preprocess the data
class TextProcessor:
    def __init__(self, nltk_stopwords):
        self.nltk_stopwords = nltk_stopwords

    def tokenize_and_clean(self, text):
        words = text.split()
        filtered_words = [word.lower() for word in words if word.isalpha()
                            and word.lower() not in self.nltk_stopwords]
        return filtered_words

In [13]:
# Create an ETL pipeline
class ETLPipeline:
    def __init__(self, url):
        self.url = url
        self.nltk_stopwords = set(stopwords.words("english"))

    def run(self):
        scraper = WebScraper(self.url)
        article_text = scraper.extract_article_text()

        processor = TextProcessor(self.nltk_stopwords)
        filtered_words = processor.tokenize_and_clean(article_text)

        word_freq = Counter(filtered_words)
        df = pd.DataFrame(word_freq.items(), columns=['Words', 'Frequency'])
        df = df.sort_values(by='Frequency', ascending=False)
        return df

In [23]:
# Run the pipeline from a desired URL
if __name__ == "__main__":
    article_url = "https://en.wikipedia.org/wiki/Extract,_transform,_load"
    pipeline = ETLPipeline(article_url)
    result_df = pipeline.run()
    print(result_df.head())

      Words  Frequency
43     data        178
57      etl         61
102     may         40
0      load         24
166  source         23
