In [9]:
!pip install pandas requests beautifulsoup4 nltk chardet -q

In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import os
import chardet

In [None]:
# Tokenizer models and the stop-word list used by TextAnalyzer.
# Recent NLTK releases (3.9 and later) load the sentence/word tokenizer from
# 'punkt_tab', older ones from 'punkt'; fetch both so either version works.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Smdas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Smdas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
class TextAnalyzer:
    """Compute dictionary-based sentiment and simple readability metrics for text.

    The analyzer holds three word sets: positive words, negative words and the
    NLTK English stop words. ``analyze_text`` is the main entry point; the other
    methods are the building blocks it uses.
    """

    def __init__(self,
                 positive_path='MasterDictionary/positive-words.txt',
                 negative_path='MasterDictionary/negative-words.txt'):
        """Load the sentiment dictionaries and the NLTK English stop-word list.

        Args:
            positive_path: Path of the positive-word list (one word per line).
            negative_path: Path of the negative-word list (one word per line).
        """
        # Load positive and negative words with encoding detection
        self.positive_words = self.load_words(positive_path)
        self.negative_words = self.load_words(negative_path)
        self.stop_words = set(stopwords.words('english'))

    def detect_encoding(self, filepath):
        """Detect the encoding of a file using chardet.

        Returns whatever ``chardet.detect`` reports under ``'encoding'``
        (this can be ``None`` when chardet cannot decide).
        """
        with open(filepath, 'rb') as file:
            raw_data = file.read()
        return chardet.detect(raw_data)['encoding']

    def load_words(self, filepath):
        """Load a word list from ``filepath`` as a set of lower-cased words.

        The file is read as UTF-8 first; on a decode error the encoding is
        detected with ``detect_encoding`` and the read is retried. Surrounding
        whitespace is stripped, blank lines are dropped, and lines starting
        with ';' are skipped (assumed to be lexicon header comments). Words are
        lower-cased because they are matched against lower-cased tokens.

        Returns:
            set[str]: the words, or an empty set if the file cannot be read
            for any reason (missing file, permission error, undecodable data).
            The error is printed rather than raised.
        """
        try:
            try:
                # First try UTF-8
                with open(filepath, 'r', encoding='utf-8') as file:
                    lines = file.read().splitlines()
            except UnicodeDecodeError:
                # Detect encoding and try again; latin-1 can decode any byte
                # sequence, so it is a safe fallback when detection yields None.
                encoding = self.detect_encoding(filepath) or 'latin-1'
                with open(filepath, 'r', encoding=encoding) as file:
                    lines = file.read().splitlines()
        except Exception as e:
            print(f"Error loading {filepath}: {str(e)}")
            # Return empty set if file can't be read
            return set()

        stripped = (line.strip().lower() for line in lines)
        return {word for word in stripped if word and not word.startswith(';')}

    def clean_text(self, text):
        """Remove every character that is not an ASCII letter or whitespace, then lower-case."""
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text.lower()

    def get_word_count(self, text):
        """Return the number of tokens in ``text`` that are not stop words."""
        words = word_tokenize(text)
        return len([word for word in words if word.lower() not in self.stop_words])

    def count_syllables(self, word):
        """Estimate the syllable count of ``word`` by counting vowel groups.

        A trailing 'es' or 'ed' is removed first, each run of consecutive
        vowels (a, e, i, o, u, y) counts as one syllable, and one is subtracted
        when the remaining word ends in 'e'. The result is never less than 1.
        """
        word = word.lower()
        count = 0
        vowels = 'aeiouy'

        if word.endswith('es') or word.endswith('ed'):
            word = word[:-2]

        prev_char_is_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not prev_char_is_vowel:
                count += 1
            prev_char_is_vowel = is_vowel

        if word.endswith('e'):
            count -= 1
        if count == 0:
            count = 1
        return count

    def is_complex_word(self, word):
        """Return True when ``word`` has more than two estimated syllables."""
        return self.count_syllables(word) > 2

    def count_personal_pronouns(self, text):
        """Count occurrences of I, we, my, ours and us (case-insensitive) in ``text``.

        Pass the original, un-lowered text: the upper-case token 'US' is removed
        before matching so that it is not counted as the pronoun 'us'.
        """
        pronouns = r'\b(I|we|my|ours|us)\b'
        # Exclude 'US' when it refers to United States
        text = re.sub(r'\bUS\b', '', text)
        return len(re.findall(pronouns, text, re.IGNORECASE))

    def analyze_text(self, text):
        """Compute all metrics for ``text``.

        Args:
            text: Raw article text.

        Returns:
            dict: one entry per metric, with the same keys as
            ``get_default_metrics``. The defaults (all zeros) are returned when
            the text has no tokens, no sentences, or only stop words, and also
            when any exception occurs (the error is printed, not raised).
        """
        try:
            cleaned_text = self.clean_text(text)

            # Sentences come from the raw text (punctuation is needed to split
            # them); word tokens come from the cleaned, lower-cased text.
            sentences = sent_tokenize(text)
            words = word_tokenize(cleaned_text)

            if not words or not sentences:
                return self.get_default_metrics()

            total_words = len(words)

            # Calculate scores
            positive_score = sum(1 for word in words if word in self.positive_words)
            negative_score = sum(1 for word in words if word in self.negative_words)

            # The small constant avoids division by zero when no sentiment word matched.
            denominator = (positive_score + negative_score) + 0.000001
            polarity_score = (positive_score - negative_score) / denominator

            # Word count (excluding stop words); reuses the tokens computed above
            # instead of tokenizing the cleaned text a second time.
            word_count = sum(1 for word in words if word.lower() not in self.stop_words)
            if word_count == 0:
                return self.get_default_metrics()

            subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

            # NOTE(review): this is content words (stop words excluded) per
            # sentence, kept as-is; use total_words here if the conventional
            # "all words per sentence" definition is wanted.
            avg_sentence_length = word_count / len(sentences)

            complex_word_count = sum(1 for word in words if self.is_complex_word(word))

            # The per-word ratios below are summed over *all* tokens, so they
            # must be divided by the number of all tokens. Dividing by the
            # stop-word-filtered count would inflate every one of them.
            percent_complex_words = complex_word_count / total_words

            fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

            syllable_count = sum(self.count_syllables(word) for word in words)
            syllable_per_word = syllable_count / total_words

            personal_pronouns = self.count_personal_pronouns(text)

            avg_word_length = sum(len(word) for word in words) / total_words

            return {
                'POSITIVE SCORE': positive_score,
                'NEGATIVE SCORE': negative_score,
                'POLARITY SCORE': polarity_score,
                'SUBJECTIVITY SCORE': subjectivity_score,
                'AVG SENTENCE LENGTH': avg_sentence_length,
                'PERCENTAGE OF COMPLEX WORDS': percent_complex_words,
                'FOG INDEX': fog_index,
                'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
                'COMPLEX WORD COUNT': complex_word_count,
                'WORD COUNT': word_count,
                'SYLLABLE PER WORD': syllable_per_word,
                'PERSONAL PRONOUNS': personal_pronouns,
                'AVG WORD LENGTH': avg_word_length
            }
        except Exception as e:
            # Deliberate best-effort: one bad document must not stop a batch run.
            print(f"Error analyzing text: {str(e)}")
            return self.get_default_metrics()

    def get_default_metrics(self):
        """Return default metrics for error cases"""
        return {
            'POSITIVE SCORE': 0,
            'NEGATIVE SCORE': 0,
            'POLARITY SCORE': 0,
            'SUBJECTIVITY SCORE': 0,
            'AVG SENTENCE LENGTH': 0,
            'PERCENTAGE OF COMPLEX WORDS': 0,
            'FOG INDEX': 0,
            'AVG NUMBER OF WORDS PER SENTENCE': 0,
            'COMPLEX WORD COUNT': 0,
            'WORD COUNT': 0,
            'SYLLABLE PER WORD': 0,
            'PERSONAL PRONOUNS': 0,
            'AVG WORD LENGTH': 0
        }

In [None]:
def scrape_article(url):
    """Download ``url`` and return the article title and body as plain text.

    The title is the text of the first ``<h1>``. The body is the text of all
    ``<p>`` elements inside the first container found among ``<article>``,
    ``div.post-content``, ``div.article-content`` and ``div.entry-content``
    (in that order), after script/style/nav/header/footer elements inside it
    have been removed.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: ``"<title>\\n\\n<body>"``, or an empty string when neither a title
        nor any body text was found, or when the request or parsing fails
        (the error is printed, not raised).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Hand the raw bytes to BeautifulSoup so it can detect the charset from
        # the document itself; response.text falls back to ISO-8859-1 when the
        # Content-Type header carries no charset, which garbles UTF-8 pages.
        soup = BeautifulSoup(response.content, 'html.parser')

        heading = soup.find('h1')
        title = heading.text.strip() if heading else ''

        # Candidate containers in priority order; stop at the first match.
        candidates = [
            ('article', {}),
            ('div', {'class_': 'post-content'}),
            ('div', {'class_': 'article-content'}),
            ('div', {'class_': 'entry-content'}),
        ]
        article = None
        for tag_name, attrs in candidates:
            article = soup.find(tag_name, **attrs)
            if article is not None:
                break

        if article is not None:
            # Remove unwanted elements
            for element in article.find_all(['script', 'style', 'nav', 'header', 'footer']):
                element.decompose()

            paragraphs = article.find_all('p')
            text = ' '.join(p.text.strip() for p in paragraphs)
        else:
            text = ''

        # Without this guard the function would return "\n\n", which is truthy
        # and makes callers treat an empty scrape as real content.
        if not title and not text:
            return ""

        return f"{title}\n\n{text}"
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return ""


In [None]:
def main(input_path='Input.xlsx',
         output_path='Output Data Structure.xlsx',
         articles_dir='extracted_articles'):
    """Scrape every URL listed in the input workbook, analyze it and save the metrics.

    For each row the article is fetched with ``scrape_article``, the non-empty
    text is saved as ``<articles_dir>/<URL_ID>.txt``, and the metrics from
    ``TextAnalyzer.analyze_text`` are collected. A row that fails is recorded
    with the default metrics so the output keeps one line per input row.

    Args:
        input_path: Excel workbook with ``URL_ID`` and ``URL`` columns.
        output_path: Excel workbook the metrics table is written to.
        articles_dir: Directory for the extracted article text files; created
            if it does not exist.

    Errors are printed rather than raised, so the function always returns None.
    """
    columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
               'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
               'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
               'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
    try:
        input_df = pd.read_excel(input_path)

        # Fail once, with a clear message, instead of once per row.
        missing_columns = {'URL_ID', 'URL'} - set(input_df.columns)
        if missing_columns:
            raise KeyError(f"{input_path} is missing column(s): {sorted(missing_columns)}")

        analyzer = TextAnalyzer()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(articles_dir, exist_ok=True)

        results = []
        for _, row in input_df.iterrows():
            # Read the identifiers before the try block so the error handler
            # below never sees them unbound or left over from the previous row.
            url_id = row['URL_ID']
            url = row['URL']
            try:
                print(f"Processing {url_id}: {url}")

                article_text = scrape_article(url)

                if article_text:
                    article_path = os.path.join(articles_dir, f'{url_id}.txt')
                    with open(article_path, 'w', encoding='utf-8') as f:
                        f.write(article_text)

                # Analyze text
                analysis = analyzer.analyze_text(article_text)
                analysis['URL_ID'] = url_id
                analysis['URL'] = url
                results.append(analysis)

            except Exception as e:
                print(f"Error processing {url_id}: {str(e)}")
                # Add empty results for failed URLs
                default_metrics = analyzer.get_default_metrics()
                default_metrics['URL_ID'] = url_id
                default_metrics['URL'] = url
                results.append(default_metrics)

        # Passing columns= fixes the column order and still yields a valid
        # (empty) table when the input workbook has no rows.
        output_df = pd.DataFrame(results, columns=columns)

        output_df.to_excel(output_path, index=False)
        print("Analysis completed successfully!")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

# Run the whole pipeline when this cell (or the exported script) is executed
# directly; inside a notebook kernel __name__ is "__main__", so this runs.
if __name__ == "__main__":
    main()

Processing Netclan20241017: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing Netclan20241018: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Processing Netclan20241019: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Processing Netclan20241020: https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Processing Netclan20241021: https://insights.blackcoffer.com/development-of-ea-robot-for-automated-trading/
Processing Netclan20241022: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscr