In [8]:
import pandas as pd
import requests
from readability import Document
import os

# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download *url* and extract its main article using readability.

    Args:
        url: Address of the web page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a single
            unresponsive host would block the caller indefinitely.

    Returns:
        A ``(title, content)`` tuple. On success ``title`` is
        ``Document.short_title()`` and ``content`` is ``Document.summary()``.
        On any failure ``title`` is ``None`` and ``content`` holds a
        human-readable error message instead; no exception is propagated.

    NOTE(review): readability's ``summary()`` is documented as returning
    cleaned HTML markup rather than plain text -- confirm that downstream
    consumers of the saved files expect tags.
    """
    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably because some sites reject the default
    # python-requests agent -- confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Fetch webpage content; a timeout surfaces as requests.Timeout,
        # which is a RequestException and is reported below.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses

        # Parse webpage content with readability
        doc = Document(response.text)
        article_title = doc.short_title()
        article_text = doc.summary()

        return article_title, article_text
    except requests.RequestException as e:
        return None, f"RequestException occurred while fetching {url}: {e}"
    except Exception as e:
        # Deliberate catch-all: batch callers rely on getting an error
        # message back instead of an exception for a single bad page.
        return None, f"Error occurred while processing {url}: {e}"

# Load the spreadsheet listing each article's URL_ID and URL
df = pd.read_excel('Input.xlsx')

# Make sure the output directory exists. exist_ok=True replaces the
# separate os.path.exists() check, which was open to a check-then-create race.
os.makedirs('article_texts', exist_ok=True)

# Fetch every article and write it to its own text file, named after URL_ID
for url_id, url in zip(df['URL_ID'], df['URL']):
    article_title, article_text = extract_article_text(url)

    # extract_article_text reports failure with a None title and puts the
    # error message in the second slot, so a falsy pair means "not saved".
    if not (article_title and article_text):
        print(f"Failed to extract article for {url_id}. Error: {article_text}")
        continue

    # Save extracted article into a text file
    with open(f'article_texts/{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(f'Title: {article_title}\n\n')
        f.write(article_text)

    print(f"Article extracted and saved for {url_id}")

print("Extraction completed.")


Article extracted and saved for blackassign0001
Article extracted and saved for blackassign0002
Article extracted and saved for blackassign0003
Article extracted and saved for blackassign0004
Article extracted and saved for blackassign0005
Article extracted and saved for blackassign0006
Article extracted and saved for blackassign0007
Article extracted and saved for blackassign0008
Article extracted and saved for blackassign0009
Article extracted and saved for blackassign0010
Article extracted and saved for blackassign0011
Article extracted and saved for blackassign0012
Article extracted and saved for blackassign0013
Article extracted and saved for blackassign0014
Article extracted and saved for blackassign0015
Article extracted and saved for blackassign0016
Article extracted and saved for blackassign0017
Article extracted and saved for blackassign0018
Article extracted and saved for blackassign0019
Article extracted and saved for blackassign0020
Article extracted and saved for blackass