# Text Extraction and Analysis Pipeline

# Text Extraction and Analysis

This notebook implements the end-to-end pipeline  data extraction and data analysis:

1. **Data Extraction**: Read URLs from `Input.xlsx` and scrape article titles & texts.
2. **Text Analysis**: Compute 13 metrics (sentiment, readability, pronouns, etc.) per article.
3. **Output Generation**: Save results to `final_output.xlsx` following the specified structure.

---
```

In [23]:
# ## 1. Setup & Imports
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize

def safe_word_tokenize(text):
    try:
        return word_tokenize(text)
    except LookupError:
        return re.findall(r'\b\w+\b', text)

def safe_sent_tokenize(text):
    try:
        return sent_tokenize(text)
    except LookupError:
        return re.split(r'[.!?]+', text)


# Download NLTK resources
nltk.download('punkt')

# Create necessary folders
os.makedirs('articles', exist_ok=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pdang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
input_df = pd.read_excel('Input.xlsx')  # Expecting columns: URL_ID, URL
input_df.head()

Unnamed: 0,URL_ID,URL
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...
4,Netclan20241021,https://insights.blackcoffer.com/development-o...


In [25]:
# ## 3. Data Extraction Function

def extract_article(url):
    try:
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.content, 'html.parser')
        
        # Extracting title and article
        title_tag = soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else ''
        paragraphs = soup.find_all('p')
        body = ' '.join(p.get_text(strip=True) for p in paragraphs)
        
        full_text = f"{title}\n\n{body}"
        
        # Debug preview
        print(f"\n[URL] {url}\n[EXTRACT] {full_text[:300]}...\n")
        
        return full_text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ''


# Save each article as a text file
for _, row in input_df.iterrows():
    text = extract_article(row['URL'])
    with open(f"articles/{row['URL_ID']}.txt", 'w', encoding='utf-8') as f:
        f.write(text)


[URL] https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
[EXTRACT] AI and ML-Based YouTube Analytics and Content Creation Tool for Optimizing Subscriber Engagement and Content Strategy

Transforming Real Estate Investments with an Automated Stack shares Platform Empowering Careers: The Hirekingdom Integrating Machine Learning Code into Kubeflow Pipeline – Kuberflow...


[URL] https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
[EXTRACT] Enhancing Front-End Features and Functionality for Improved User Experience and Dashboard Accuracy in Partner Hospital Application

Transforming Real Estate Investments with an Automated Stack shares Platform Empowering Careers: The Hirekingdom Integrating Machine Learning Code into Kubeflow Pipelin...


[URL] https://insights.blackcoffer.com

In [26]:
def load_stopwords(folder='StopWords'):
    stopwords = set()
    for fname in os.listdir(folder):
        path = os.path.join(folder, fname)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                stopwords |= set(f.read().lower().split())
        except UnicodeDecodeError:
            with open(path, 'r', encoding='ISO-8859-1') as f:
                stopwords |= set(f.read().lower().split())
    return stopwords


def load_master_dict(folder='MasterDictionary'):
    with open(os.path.join(folder, 'positive-words.txt'), encoding='utf-8', errors='ignore') as f:
        positive = set(f.read().lower().split())
    with open(os.path.join(folder, 'negative-words.txt'), encoding='utf-8', errors='ignore') as f:
        negative = set(f.read().lower().split())
    return positive, negative

stop_words = load_stopwords()
positive_words, negative_words = load_master_dict()

In [27]:
def count_syllables(word):
    word = word.lower()
    syllables = len(re.findall(r'[aeiouy]+', word))
    if word.endswith(('es', 'ed')) and len(word) > 2:
        syllables -= 1
    return max(syllables, 1)


In [28]:
def analyze_text(text):
    try:
        tokens = safe_word_tokenize(text.lower())
        words = [re.sub(r"\W+", "", w) for w in tokens if w.isalpha()]
        words = [w for w in words if w not in stop_words]
        sentences = safe_sent_tokenize(text)
        sentences = [s for s in sentences if s.strip()]

        if not words or not sentences:
            raise ValueError(f"Empty words or sentences. Raw text: {text[:200]}")

        # (rest of the logic stays the same)


        # Step 3: Sentiment
        pos_score = sum(w in positive_words for w in words)
        neg_score = sum(w in negative_words for w in words)
        polarity = (pos_score - neg_score) / ((pos_score + neg_score) + 1e-6)
        subjectivity = (pos_score + neg_score) / (len(words) + 1e-6)

        # Step 4: Readability
        avg_sent_len = len(words) / len(sentences)
        complex_words = [w for w in words if count_syllables(w) > 2]
        percent_complex = len(complex_words) / len(words)
        fog_index = 0.4 * (avg_sent_len + percent_complex)
        avg_words_per_sentence = avg_sent_len
        complex_word_count = len(complex_words)

        # Step 5: Other Metrics
        word_count = len(words)
        syllables_per_word = sum(count_syllables(w) for w in words) / word_count
        pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.I))
        avg_word_length = sum(len(w) for w in words) / word_count

        return [
            pos_score, neg_score, polarity, subjectivity,
            avg_sent_len, percent_complex, fog_index, avg_words_per_sentence,
            complex_word_count, word_count, syllables_per_word, pronouns, avg_word_length
        ]
    
    except Exception as e:
        print(f"Error in analyze_text: {e}")
        return [None] * 13


In [29]:
# ## 6. Generate Output

import warnings
warnings.filterwarnings("ignore")

# Prepare output DataFrame
output_df = input_df.copy()
cols = [
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]
for c in cols:
    output_df[c] = None  # safer than 0.0 for missing/invalid entries

# Analyze each article
for idx, row in output_df.iterrows():
    file_path = f"articles/{row['URL_ID']}.txt"
    if os.path.isfile(file_path):
        try:
            with open(file_path, encoding='utf-8') as f:
                text = f.read()
            metrics = analyze_text(text)
            for i, c in enumerate(cols):
                output_df.at[idx, c] = metrics[i]
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    else:
        print(f"Missing: {file_path}")

# Save final results
output_df.to_excel('final_output.xlsx', index=False)
output_df.head()


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,Netclan20241017,https://insights.blackcoffer.com/ai-and-ml-bas...,8,0,1.0,0.032922,14.294118,0.514403,5.923408,14.294118,125,243,2.625514,3,7.781893
1,Netclan20241018,https://insights.blackcoffer.com/enhancing-fro...,15,7,0.363636,0.043393,10.787234,0.398422,4.474262,10.787234,202,507,2.416174,9,7.195266
2,Netclan20241019,https://insights.blackcoffer.com/roas-dashboar...,13,2,0.733333,0.053571,14.736842,0.414286,6.060451,14.736842,116,280,2.492857,3,7.4
3,Netclan20241020,https://insights.blackcoffer.com/efficient-pro...,26,11,0.405405,0.074899,13.351351,0.544534,5.558354,13.351351,269,494,2.718623,6,7.98583
4,Netclan20241021,https://insights.blackcoffer.com/development-o...,5,0,1.0,0.021097,13.166667,0.49789,5.465823,13.166667,118,237,2.662447,3,7.637131
