# Take news content

In [1]:
import pandas as pd

In [2]:
df = pd.read_json('../datasets/News_Category_Dataset_v3.json', lines=True)

FileNotFoundError: File ../datasets/News_Category_Dataset_v3.json does not exist

In [4]:
df.shape

(209527, 6)

In [None]:
import os
import time
import pandas as pd
import concurrent.futures
import csv
from newspaper import Article

def process_article(idx, url, language="en"):
    """
    Funcție care procesează un articol dat indexul și URL-ul.
    Returnează o tuplă (idx, text), unde text este conținutul extras
    sau un mesaj de eroare.
    """
    try:
        article = Article(url, language=language)
        article.download()
        article.parse()
        text = article.text
    except Exception as e:
        text = f"Eroare: {e}"
    return idx, text

def load_resume_index(last_index_file):
    try:
        with open(last_index_file, "r") as f:
            return int(f.read().strip())
    except Exception:
        return 0

def save_resume_index(last_index_file, index):
    with open(last_index_file, "w") as f:
        f.write(str(index))

def main():
    input_json = "../../datasets/News_Category_Dataset_v3.json"  
    output_csv = "news_articles_content_full.csv"             
    last_index_file = "last_index.txt"                         

    df = pd.read_json(input_json, lines=True)
    df = df.reset_index(drop=True)
    
    total_articles = len(df)
    resume_index = load_resume_index(last_index_file)
    print(f"Reluăm procesarea de la index: {resume_index} din {total_articles} articole.")

    if resume_index == 0:
        mode = "w"
    else:
        mode = "a"
    
    checkpoint = 1000         
    progress_interval = 100   
    max_workers = 30           
    
    with open(output_csv, mode, encoding="utf-8", newline="") as fout:
        writer = csv.writer(fout)
        if resume_index == 0:
            writer.writerow(["category", "content"])
        
        for batch_start in range(resume_index, total_articles, checkpoint):
            batch_end = min(batch_start + checkpoint, total_articles)
            batch_indices = list(range(batch_start, batch_end))
            
            tasks = [(idx, df.at[idx, "link"]) for idx in batch_indices]
            results = []
            
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                
                future_to_idx = {executor.submit(process_article, idx, url): idx for idx, url in tasks}
                for future in concurrent.futures.as_completed(future_to_idx):
                    idx_processed, text = future.result()
                    results.append((idx_processed, text))
                    
                    if (idx_processed + 1) % progress_interval == 0:
                        print(f"Processing article {idx_processed + 1}: {df.at[idx_processed, 'link'][:50]}...")
            
            results.sort(key=lambda x: x[0])
            
            for idx_processed, text in results:
                category = df.at[idx_processed, "category"]
                category_esc = category.replace('"', '""')
                text_esc = text.replace('"', '""')
                writer.writerow([category_esc, text_esc])
            
            fout.flush()
            save_resume_index(last_index_file, batch_end)
            print(f"Checkpoint: {batch_end} articole procesate. Salvare completă pentru acest batch.")
            time.sleep(0.3)  

    save_resume_index(last_index_file, total_articles)
    print(f"Procesare completă! Articole totale: {total_articles}")

if __name__ == "__main__":
    main()


Reluăm procesarea de la index: 1000 din 209527 articole.
Processing article 1100: https://www.huffpost.com/entry/capitol-insurrectio...
Processing article 1200: https://www.huffpost.com/entry/price-is-right-wron...
Processing article 1300: https://www.huffpost.com/entry/josh-hawley-democra...
Processing article 1400: https://www.huffpost.com/entry/betty-white-dead-di...
Processing article 1500: https://www.huffpost.com/entry/cris-collinsworth-a...
Processing article 1600: https://www.huffpost.com/entry/ap-us-hoffa-search-...
Processing article 1700: https://www.huffpost.com/entry/tension-rising-iraq...
Processing article 1800: https://www.huffpost.com/entry/best-halloween-cock...
Processing article 1900: https://www.huffpost.com/entry/bomb-kabul-mosque-k...
Processing article 2000: https://www.huffpost.com/entry/san-francisco-train...
Checkpoint: 2000 articole procesate. Salvare completă pentru acest batch.
Processing article 2100: https://www.huffpost.com/entry/prisons-and-jails-n...


In [None]:
import os
import time
import pandas as pd
import concurrent.futures
import csv
import requests
from bs4 import BeautifulSoup

def scrape_article_bs(url, timeout=10, max_retries=5):
   
    retries = 0
    delay = 1  
    while retries < max_retries:
        try:
            r = requests.get(url, timeout=timeout)
            if r.status_code == 429:
                raise Exception("HTTP 429")
            if r.status_code != 200:
                return f"Eroare: HTTP {r.status_code}"
            soup = BeautifulSoup(r.content, "html.parser")
            for tag in soup(["script", "style", "aside", "nav"]):
                tag.decompose()
            paragraphs = soup.find_all("p")
            text_parts = []
            for p in paragraphs:
                text = p.get_text(strip=True)
                if "Advertisement" in text or "LOADING ERROR LOADING" in text:
                    continue
                if text:
                    text_parts.append(text)
            article_text = " ".join(text_parts)
            return article_text
        except Exception as e:
            if "HTTP 429" in str(e):
                print(f"HTTP 429 encountered at {url}. Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
                retries += 1
                delay *= 2  
            else:
                return f"Eroare: {e}"
    return "Eroare: Max retries atins (HTTP 429)"

def process_article_bs(idx, url):
    
    text = scrape_article_bs(url)
    return idx, text

def load_resume_index(last_index_file):
    try:
        with open(last_index_file, "r") as f:
            return int(f.read().strip())
    except Exception:
        return 0

def save_resume_index(last_index_file, index):
    with open(last_index_file, "w") as f:
        f.write(str(index))

def main():
    input_json = "../../datasets/News_Category_Dataset_v3.json" 
    output_csv = "news_articles_content_full_bs.csv"          
    last_index_file = "last_index_bs.txt"                       
    
    df = pd.read_json(input_json, lines=True)
    df = df.reset_index(drop=True)
    total_articles = len(df)
    
    resume_index = load_resume_index(last_index_file)
    print(f"Reluăm procesarea de la index: {resume_index} din {total_articles} articole.")
    
    if resume_index == 0:
        mode = "w"
    else:
        mode = "a"
    
    checkpoint = 1000         
    progress_interval = 100   
    max_workers = 2          
    
    with open(output_csv, mode, encoding="utf-8", newline="") as fout:
        writer = csv.writer(fout)
        if resume_index == 0:
            writer.writerow(["category", "content"])
        
        for batch_start in range(resume_index, total_articles, checkpoint):
            batch_end = min(batch_start + checkpoint, total_articles)
            batch_indices = list(range(batch_start, batch_end))
            
            tasks = [(idx, df.at[idx, "link"]) for idx in batch_indices]
            results = []
            
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_idx = {executor.submit(process_article_bs, idx, url): idx for idx, url in tasks}
                for future in concurrent.futures.as_completed(future_to_idx):
                    idx_processed, text = future.result()
                    results.append((idx_processed, text))
                    if (idx_processed + 1) % progress_interval == 0:
                        print(f"Processing article {idx_processed+1}: {df.at[idx_processed, 'link'][:50]}...")
            
            results.sort(key=lambda x: x[0])
            for idx_processed, text in results:
                category = df.at[idx_processed, "category"]
                cat_esc = category.replace('"', '""')
                text_esc = text.replace('"', '""')
                writer.writerow([cat_esc, text_esc])
            
            fout.flush()  
            save_resume_index(last_index_file, batch_end)
            print(f"Checkpoint: {batch_end} articole procesate. Salvare completă pentru acest batch.")
            time.sleep(5)  
    
    save_resume_index(last_index_file, total_articles)
    print(f"Procesare completă! Articole totale: {total_articles}")

if __name__ == "__main__":
    main()


Reluăm procesarea de la index: 207901 din 209527 articole.
Processing article 208000: https://www.huffingtonpost.com/entry/clothes-washe...
Processing article 208100: https://www.huffingtonpost.com/entry/alexander-wan...
Processing article 208200: https://www.huffingtonpost.com/entry/sitting-healt...
Processing article 208300: https://www.huffingtonpost.com/entry/chloe-moretz-...
Processing article 208400: https://www.huffingtonpost.comhttp://online.wsj.co...
Processing article 208500: https://www.huffingtonpost.com/entry/molly-sims-ba...
Processing article 208600: https://www.huffingtonpost.com/entry/elizabeth-wei...
Processing article 208700: https://www.huffingtonpost.comhttp://www.realsimpl...
Processing article 208800: https://www.huffingtonpost.com/entry/know-thine-en...
Processing article 208900: https://www.huffingtonpost.com/entry/mens-accessor...
Checkpoint: 208901 articole procesate. Salvare completă pentru acest batch.
Processing article 209000: https://www.huffingtonpost.c