In [12]:
import nltk
# Fetch the 'punkt_tab' tokenizer data package through the NLTK downloader.
# NOTE(review): presumably needed by the tokenizer functions imported in the
# following cell — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Saran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import os

# Fetch (or confirm the presence of) each NLTK data package listed here.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

def load_lexicon(file_path):
    """Load a one-entry-per-line word list as a set of lowercase strings.

    The file is decoded with the first encoding that succeeds, trying
    UTF-8 (with or without a byte-order mark), then Windows-1252, then
    ISO-8859-1.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A set containing every non-blank line, stripped of surrounding
        whitespace and lowercased.

    Raises:
        ValueError: If none of the attempted encodings can decode the file.
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # Order matters: ISO-8859-1 maps every possible byte to a character and
    # therefore never fails, so it has to be the last resort; anything listed
    # after it would be unreachable. 'utf-8-sig' behaves like UTF-8 but also
    # drops a leading BOM, which would otherwise stick to the first entry.
    encodings = ['utf-8-sig', 'windows-1252', 'iso-8859-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                entries = (line.strip().lower() for line in file)
                # Skip blank lines so the empty string never becomes a "word".
                return {entry for entry in entries if entry}
        except UnicodeDecodeError:
            # Decoding is lazy, so a failure can surface mid-file; start over
            # with the next candidate encoding.
            continue
    raise ValueError(f"Unable to decode the file {file_path} with the attempted encodings")

def clean_text(text):
    """Reduce *text* to ASCII letters and whitespace, then lowercase it.

    Any character that is neither ``a-z``/``A-Z`` nor whitespace is deleted
    outright (it is not replaced by a space), so neighbouring fragments are
    joined together. The surviving text is returned in lowercase.
    """
    letters_and_whitespace = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_whitespace.lower()

def count_syllables(word):
    """Estimate the number of syllables in *word* with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one syllable.
    A trailing ``e`` removes one syllable, a trailing ``le`` adds it back,
    and any non-empty word is reported as having at least one syllable.

    Args:
        word: The word to examine; case is ignored.

    Returns:
        The estimated syllable count: ``0`` for an empty string, otherwise
        an integer of at least ``1``.
    """
    # An empty string has no syllables; without this guard the scan below
    # would have nothing to look at and the floor of 1 would be misleading.
    if not word:
        return 0

    word = word.lower()
    vowels = 'aeiouy'

    # Count the starts of vowel runs: a vowel whose predecessor is not one.
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # A final 'e' is treated as silent ...
    if word.endswith('e'):
        count -= 1
    # ... unless the word ends in 'le', which restores the syllable.
    if word.endswith('le'):
        count += 1

    # Every non-empty word has at least one syllable.
    if count == 0:
        count += 1
    return count

def analyze_text(text, positive_words, negative_words, stop_words):
    """Compute sentiment and readability metrics for *text*.

    The text is split into word tokens and sentences; tokens whose lowercase
    form is in *stop_words* are dropped before any word-based metric is
    computed. Ratios whose denominator would be zero (no sentences, or no
    words left after stop-word removal) are reported as ``0.0`` instead of
    raising ``ZeroDivisionError``.

    Args:
        text: The text to analyse.
        positive_words: Collection of lowercase words counted as positive.
        negative_words: Collection of lowercase words counted as negative.
        stop_words: Collection of lowercase words to exclude from the counts.

    Returns:
        A dict with these keys: ``POSITIVE SCORE``, ``NEGATIVE SCORE``,
        ``POLARITY SCORE``, ``SUBJECTIVITY SCORE``, ``AVG SENTENCE LENGTH``,
        ``PERCENTAGE OF COMPLEX WORDS``, ``FOG INDEX``,
        ``AVG NUMBER OF WORDS PER SENTENCE``, ``COMPLEX WORD COUNT``,
        ``WORD COUNT``, ``SYLLABLE PER WORD``, ``PERSONAL PRONOUNS`` and
        ``AVG WORD LENGTH``.
    """
    def ratio(numerator, denominator):
        # Division that yields 0.0 rather than raising when there is no data.
        return numerator / denominator if denominator else 0.0

    sentences = sent_tokenize(text)

    # Remove stop words
    words = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    # Lowercase once; the lexicon lookups below all need this form.
    lowered_words = [word.lower() for word in words]

    word_count = len(words)

    positive_score = sum(1 for word in lowered_words if word in positive_words)
    negative_score = sum(1 for word in lowered_words if word in negative_words)

    # The small constant keeps these two divisions defined when the scores
    # or the word count are zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    avg_sentence_length = ratio(word_count, len(sentences))

    # Syllables are counted once per word and reused for both the
    # complex-word count and the syllables-per-word average.
    syllables = [count_syllables(word) for word in words]

    # A word is "complex" when it has more than two syllables.
    complex_word_count = sum(1 for syllable_total in syllables if syllable_total > 2)
    # Despite the name this is a fraction (0..1), not a value out of 100.
    percentage_complex_words = ratio(complex_word_count, word_count)

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    syllable_per_word = ratio(sum(syllables), word_count)

    # Counted on the full text (stop words included). The match is
    # case-insensitive, so an uppercase "US" is counted as well.
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE))

    avg_word_length = ratio(sum(len(word) for word in words), word_count)

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

def extract_article_text(url, timeout=30):
    """Download *url* and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A string made of the stripped text of the first ``<h1>`` element
        (empty if the page has none), a blank line, and then the text of
        every ``<p>`` element, each followed by a newline.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is not analysed as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract title
    heading = soup.find('h1')
    title = heading.text.strip() if heading is not None else ""

    # Extract article text (this may need to be adjusted based on the specific website structure)
    article_text = "".join(paragraph.text + "\n" for paragraph in soup.find_all('p'))

    return title + "\n\n" + article_text

def main():
    """Run the pipeline: fetch each article, analyse it, write the results.

    Reads ``Input.xlsx`` (columns ``URL_ID`` and ``URL`` are used), loads the
    positive/negative lexicons and every ``.txt`` file in ``StopWords``, then
    for each row downloads the article, saves its text to ``<URL_ID>.txt``,
    computes the text metrics and finally writes all rows to
    ``New_Output_Data_Structure.xlsx``. A row that fails is reported on
    standard output and left out of the results.
    """
    # Load input data
    input_data = pd.read_excel('Input.xlsx')

    # Load lexicons and stop words
    try:
        positive_words = load_lexicon('D:/project/dataextraction/MasterDictionary/positive-words.txt')
        negative_words = load_lexicon('D:/project/dataextraction/MasterDictionary/negative-words.txt')
    except (OSError, ValueError) as e:
        # OSError covers a missing or unreadable file, which would otherwise
        # escape this handler and abort with a traceback.
        print(f"Error loading lexicon: {e}")
        return

    stop_words = set()
    for file in os.listdir('StopWords'):
        if file.endswith('.txt'):
            try:
                stop_words.update(load_lexicon(os.path.join('StopWords', file)))
            except (OSError, ValueError) as e:
                # One bad stop-word file should not stop the whole run.
                print(f"Error loading stop words file {file}: {e}")

    results = []

    for _, row in input_data.iterrows():
        url_id = row['URL_ID']
        url = row['URL']

        # Broad catch on purpose: this is the per-row boundary, and any
        # failure (network, parsing, analysis) should only skip this row.
        try:
            # Extract article text
            article_text = extract_article_text(url)

            # Save article text to file
            with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
                file.write(article_text)

            # Clean and analyze text
            # NOTE(review): clean_text deletes all punctuation, so the
            # sentence splitting inside analyze_text sees text without
            # sentence terminators — confirm the sentence-based metrics
            # are computed as intended.
            cleaned_text = clean_text(article_text)
            analysis_results = analyze_text(cleaned_text, positive_words, negative_words, stop_words)

            # Combine input data with analysis results
            result = row.to_dict()
            result.update(analysis_results)
            results.append(result)
        except Exception as e:
            print(f"Error processing URL {url}: {e}")

    # Create output DataFrame
    output_df = pd.DataFrame(results)

    # Save output to Excel
    output_df.to_excel('New_Output_Data_Structure.xlsx', index=False)

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
