In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import os
import numpy as np
import time
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random
from datetime import datetime
import shutil

In [5]:
class TextAnalyzer:
    """Download web articles and compute sentiment / readability metrics.

    On construction the analyzer sets up an HTTP session with retries, creates
    the output directories, and loads the stop-word and sentiment word lists
    from files in the current working directory.
    """

    def __init__(self):
        # Initialize session with retry mechanism
        self.session = requests.Session()
        retries = Retry(
            total=5,
            backoff_factor=1,
            # Retry only transient conditions. 403/404 are permanent for a
            # given URL, so retrying them just burns the full backoff budget
            # before failing with the same outcome.
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"]
        )
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

        # Create output directories
        self.output_dir = 'output_files'
        self.article_dir = os.path.join(self.output_dir, 'extracted_articles')
        self.create_directories()

        # Load stop words and sentiment words (sentiment lists are filtered
        # against the stop words, so stop words must be loaded first)
        self.stop_words = self.load_stop_words()
        self.positive_words = self.load_word_list('positive-words.txt')
        self.negative_words = self.load_word_list('negative-words.txt')

        # Cache for failed URLs: url -> the ("", "") result returned for it
        self.failed_urls = {}

    def create_directories(self):
        """Create necessary directories if they don't exist"""
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.article_dir, exist_ok=True)

    def read_file_with_encoding(self, file_path):
        """Read a text file, trying several encodings, and return it lower-cased.

        Encodings are tried from strictest to most permissive. 'latin-1' maps
        every byte to a character and therefore never fails, so it must come
        last; otherwise the encodings listed after it are never attempted.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist (callers handle it).
        """
        # 'utf-8-sig' decodes plain UTF-8 identically and also strips a BOM,
        # which would otherwise stick to the first word of the file.
        for encoding in ('utf-8-sig', 'cp1252'):
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    return f.read().lower()
            except (UnicodeDecodeError, UnicodeError):
                continue

        # Last resort: latin-1 can decode any byte sequence.
        with open(file_path, 'r', encoding='latin-1') as f:
            return f.read().lower()

    def load_stop_words(self):
        """Return the union of the custom stop-word files and NLTK's English list.

        Each file is split on whitespace and every token is added (lower-cased
        by ``read_file_with_encoding``). Missing files are reported and skipped.
        """
        stop_words = set()
        stop_word_files = [
            'StopWords_Auditor.txt',
            'StopWords_Currencies.txt',
            'StopWords_DatesandNumbers.txt',
            'StopWords_Generic.txt',
            'StopWords_GenericLong.txt',
            'StopWords_Geographic.txt',
            'StopWords_Names.txt'
        ]

        for file in stop_word_files:
            try:
                content = self.read_file_with_encoding(file)
                if content:
                    stop_words.update(content.split())
            except FileNotFoundError:
                print(f"Warning: {file} not found")

        return stop_words | set(stopwords.words('english'))

    def load_word_list(self, filename):
        """Load a whitespace-separated word list, excluding stop words.

        Always returns a set: empty when the file is missing or has no words,
        so membership tests against the result are always safe.
        """
        try:
            content = self.read_file_with_encoding(filename)
        except FileNotFoundError:
            print(f"Warning: {filename} not found")
            return set()

        return {word for word in content.split() if word not in self.stop_words}

    @staticmethod
    def _join_paragraphs(paragraphs):
        """Join the stripped text of paragraph elements with single spaces.

        Elements are de-duplicated by identity (the same element can be reached
        through several nested containers) and empty paragraphs are skipped.
        """
        seen = set()
        parts = []
        for paragraph in paragraphs:
            if id(paragraph) in seen:
                continue
            seen.add(id(paragraph))
            text = paragraph.text.strip()
            if text:
                parts.append(text)
        return ' '.join(parts)

    def extract_article(self, url):
        """Fetch ``url`` and return ``(title, article_text)``.

        Returns ``("", "")`` on any failure; the failure is cached in
        ``self.failed_urls`` so the same URL is not fetched again.
        """
        try:
            if url in self.failed_urls:
                print(f"Skipping previously failed URL: {url}")
                return self.failed_urls[url]

            # Random delay between requests
            time.sleep(random.uniform(1, 3))

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }

            response = self.session.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')

            # Drop non-content elements before extracting text
            for script in soup(['script', 'style', 'meta', 'link']):
                script.decompose()

            # Title: first non-empty <h1> or <title> in document order
            title = ""
            for tag in soup.find_all(['h1', 'title']):
                if tag.text.strip():
                    title = tag.text.strip()
                    break

            # Strategy 1: paragraphs inside containers with article-like classes.
            # Paragraphs from all containers are pooled so that text from
            # different containers is space-separated and a paragraph inside
            # nested matching containers is only included once.
            article_containers = soup.find_all(['article', 'main', 'div'],
                                               class_=['article', 'content', 'post', 'entry-content'])
            container_paragraphs = []
            for container in article_containers:
                container_paragraphs.extend(container.find_all('p'))
            article_text = self._join_paragraphs(container_paragraphs)

            # Strategy 2: a main/div element with a content-like id
            if not article_text:
                main_content = soup.find(['main', 'div'],
                                         {'id': ['main-content', 'content', 'article']})
                if main_content:
                    article_text = self._join_paragraphs(main_content.find_all('p'))

            # Strategy 3: every paragraph on the page
            if not article_text:
                article_text = self._join_paragraphs(soup.find_all('p'))

            if not article_text.strip():
                raise ValueError("No article content extracted")

            return title, article_text

        except Exception as e:
            error_msg = f"Error extracting article: {str(e)}"
            print(error_msg)
            self.failed_urls[url] = ("", "")
            return "", ""

    def clean_text(self, text):
        """Lower-case ``text``, replace punctuation with spaces, drop stop words."""
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        words = word_tokenize(text)
        cleaned_words = [word for word in words if word not in self.stop_words]
        return ' '.join(cleaned_words)

    def count_syllables(self, word):
        """Estimate syllables as the number of vowel groups (minimum 1).

        A trailing 'es' or 'ed' is removed before counting; 'y' counts as a vowel.
        """
        word = word.lower()
        count = 0
        vowels = 'aeiouy'

        if word.endswith('es') or word.endswith('ed'):
            word = word[:-2]

        prev_char_is_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_char_is_vowel:
                count += 1
            prev_char_is_vowel = is_vowel

        return max(1, count)

    def analyze_text(self, text):
        """Compute sentiment and readability metrics for ``text``.

        Returns a dict of metrics, or ``None`` when ``text`` is empty or
        contains no words after cleaning. Word-based metrics use the cleaned
        (stop-word-free) tokens; sentence splitting and pronoun counting use
        the original text.
        """
        if not text or not text.strip():
            return None

        cleaned_text = self.clean_text(text)
        words = word_tokenize(cleaned_text)

        if not words:
            return None

        sentences = sent_tokenize(text)
        if not sentences:
            sentences = [text]

        word_count = len(words)

        positive_score = sum(1 for word in words if word in self.positive_words)
        negative_score = sum(1 for word in words if word in self.negative_words)

        # The small epsilon avoids division by zero when no sentiment words match
        polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
        subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

        avg_sentence_length = word_count / len(sentences)
        complex_words = [word for word in words if self.count_syllables(word) > 2]
        percent_complex_words = len(complex_words) / word_count
        # NOTE(review): this uses the complex-word fraction (0-1), while the
        # reported PERCENTAGE_OF_COMPLEX_WORDS is scaled by 100 - confirm which
        # form the intended Fog definition expects.
        fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

        syllable_count = sum(self.count_syllables(word) for word in words)
        syllable_per_word = syllable_count / word_count

        # Pronouns must be counted on the ORIGINAL text: the cleaned text has
        # stop words removed, which strips these pronouns before they can be
        # counted. The exact upper-case token "US" is treated as the country
        # name and excluded; other casings count as the pronoun.
        pronoun_matches = re.findall(r'\b(?:i|we|my|ours|us)\b', text, flags=re.IGNORECASE)
        pronoun_count = sum(1 for match in pronoun_matches if match != 'US')

        avg_word_length = sum(len(word) for word in words) / word_count

        return {
            'POSITIVE_SCORE': positive_score,
            'NEGATIVE_SCORE': negative_score,
            'POLARITY_SCORE': polarity_score,
            'SUBJECTIVITY_SCORE': subjectivity_score,
            'AVG_SENTENCE_LENGTH': avg_sentence_length,
            'PERCENTAGE_OF_COMPLEX_WORDS': percent_complex_words * 100,
            'FOG_INDEX': fog_index,
            'AVG_NUMBER_OF_WORDS_PER_SENTENCE': avg_sentence_length,
            'COMPLEX_WORD_COUNT': len(complex_words),
            'WORD_COUNT': word_count,
            'SYLLABLE_PER_WORD': syllable_per_word,
            'PERSONAL_PRONOUNS': pronoun_count,
            'AVG_WORD_LENGTH': avg_word_length
        }

In [6]:
def safe_file_write(file_path, df, max_retries=5):
    """Write ``df`` to an Excel file, falling back to alternate names if locked.

    The first attempt writes to ``file_path``. If that raises
    ``PermissionError`` (for example because the file is open elsewhere), each
    further attempt writes to a new, distinct name built from the base name, a
    timestamp, and the attempt number.

    Args:
        file_path: Target path for the Excel file.
        df: Object exposing ``to_excel(path, index=False)``.
        max_retries: Maximum number of write attempts (must be at least 1).

    Returns:
        The path that was actually written.

    Raises:
        ValueError: if ``max_retries`` is less than 1.
        Exception: if every attempt fails with ``PermissionError``.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    base_path, ext = os.path.splitext(file_path)

    for attempt in range(max_retries):
        if attempt == 0:
            output_path = file_path
        else:
            # Fresh timestamp plus attempt number: every retry targets a
            # different file, so one locked fallback name cannot doom all
            # remaining attempts.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            output_path = f"{base_path}_{timestamp}_{attempt}{ext}"

        try:
            df.to_excel(output_path, index=False)
        except PermissionError as e:
            if attempt < max_retries - 1:
                print(f"Attempt {attempt + 1} failed: File is locked. Retrying with different filename...")
                time.sleep(1)
            else:
                raise Exception(f"Failed to write file after {max_retries} attempts: {str(e)}") from e
        else:
            print(f"Successfully wrote to {output_path}")
            return output_path

def main():
    """Run the full pipeline: read URLs, extract articles, analyze, save results.

    Reads ``Input.xlsx`` (columns ``URL_ID`` and ``URL``), saves each extracted
    article as a text file, writes the metrics to ``Output.xlsx`` and any
    failures to ``Failed_URLs.xlsx`` inside the analyzer's output directory.
    Unexpected errors are reported and re-raised.
    """
    try:
        analyzer = TextAnalyzer()

        try:
            df = pd.read_excel('Input.xlsx')
        except Exception as e:
            print(f"Error reading Input.xlsx: {str(e)}")
            return

        results = []
        failed_urls = []

        total_urls = len(df)
        print(f"Starting analysis of {total_urls} URLs...")

        # Count by position rather than index label so the progress display is
        # correct even if the frame's index is not 0..n-1.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            url_id = row['URL_ID']
            url = row['URL']

            print(f"\nProcessing URL {position}/{total_urls}: {url_id}")

            try:
                title, article_text = analyzer.extract_article(url)

                if title or article_text:
                    try:
                        output_path = os.path.join(analyzer.article_dir, f"{url_id}.txt")
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(f"Title: {title}\n\n{article_text}")
                    except Exception as e:
                        print(f"Error saving article {url_id}: {str(e)}")

                if article_text:
                    analysis = analyzer.analyze_text(article_text)
                    if analysis:
                        analysis['URL_ID'] = url_id
                        analysis['URL'] = url
                        results.append(analysis)
                    else:
                        # Text was extracted but produced no metrics; record it
                        # so the URL does not silently vanish from both outputs.
                        failed_urls.append({'URL_ID': url_id, 'URL': url,
                                            'Reason': 'No analyzable words after cleaning'})
                        print(f"No analyzable words for URL {url_id}")
                else:
                    failed_urls.append({'URL_ID': url_id, 'URL': url, 'Reason': 'No content extracted'})
                    print(f"No content extracted for URL {url_id}")

            except Exception as e:
                failed_urls.append({'URL_ID': url_id, 'URL': url, 'Reason': str(e)})
                print(f"Error processing URL {url_id}: {str(e)}")

            time.sleep(random.uniform(1, 3))

        if results:
            output_df = pd.DataFrame(results)

            columns_order = ['URL_ID', 'URL', 'POSITIVE_SCORE', 'NEGATIVE_SCORE', 'POLARITY_SCORE',
                            'SUBJECTIVITY_SCORE', 'AVG_SENTENCE_LENGTH', 'PERCENTAGE_OF_COMPLEX_WORDS',
                            'FOG_INDEX', 'AVG_NUMBER_OF_WORDS_PER_SENTENCE', 'COMPLEX_WORD_COUNT',
                            'WORD_COUNT', 'SYLLABLE_PER_WORD', 'PERSONAL_PRONOUNS', 'AVG_WORD_LENGTH']

            output_df = output_df[columns_order]

            output_path = os.path.join(analyzer.output_dir, 'Output.xlsx')
            safe_file_write(output_path, output_df)

            print(f"\nAnalysis complete. Successfully processed {len(results)} out of {total_urls} URLs")
        else:
            print("\nNo results to save. Check if articles were extracted correctly.")

        # Save the failure report even when nothing succeeded: a run in which
        # every URL fails is exactly when this report is needed most.
        if failed_urls:
            failed_df = pd.DataFrame(failed_urls)
            failed_path = os.path.join(analyzer.output_dir, 'Failed_URLs.xlsx')
            safe_file_write(failed_path, failed_df)
            print(f"Failed URLs saved to Failed_URLs.xlsx ({len(failed_urls)} failures)")

    except Exception as e:
        print(f"\nCritical error in main execution: {str(e)}")
        raise

In [7]:
# Entry point: run the pipeline only when this code is executed as the main
# program, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Starting analysis of 147 URLs...

Processing URL 1/147: Netclan20241017

Processing URL 2/147: Netclan20241018

Processing URL 3/147: Netclan20241019

Processing URL 4/147: Netclan20241020

Processing URL 5/147: Netclan20241021

Processing URL 6/147: Netclan20241022

Processing URL 7/147: Netclan20241023

Processing URL 8/147: Netclan20241024

Processing URL 9/147: Netclan20241025

Processing URL 10/147: Netclan20241026

Processing URL 11/147: Netclan20241027

Processing URL 12/147: Netclan20241028

Processing URL 13/147: Netclan20241029

Processing URL 14/147: Netclan20241030

Processing URL 15/147: Netclan20241031

Processing URL 16/147: Netclan20241032

Processing URL 17/147: Netclan20241033

Processing URL 18/147: Netclan20241034

Processing URL 19/147: Netclan20241035

Processing URL 20/147: Netclan20241036

Processing URL 21/147: Netclan20241037

Processing URL 22/147: Netclan20241038

Processing URL 23/147: Netclan20241039

Processing URL 24/147: Netclan20241040

Processing URL 