In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, opinion_lexicon
import re

# Fetch the NLTK data packages used below: tokenizer data ('punkt'),
# the stop-word lists and the opinion lexicon.
for nltk_package in ('punkt', 'stopwords', 'opinion_lexicon'):
    nltk.download(nltk_package)

# Function to extract article text from URL
def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a server that never answers would block the caller forever.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the
        page's ``<title>`` element, or an empty string when the page has
        none. ``article_text`` is the text of every ``<p>`` element joined
        with single spaces, with leading/trailing whitespace stripped.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` when the
            request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title is None for documents without a <title> tag; fall back to
    # an empty title instead of failing with AttributeError.
    title_tag = soup.title
    title = title_tag.text if title_tag is not None else ''

    # Join once instead of growing a string with repeated concatenation.
    article_text = ' '.join(paragraph.text for paragraph in soup.find_all('p'))

    return title, article_text.strip()

# Function to perform text analysis
def perform_text_analysis(article_text):
    """Compute sentiment and readability metrics for a piece of text.

    The text is split into sentences and word tokens with NLTK. "Filtered
    words" are the lower-cased alphanumeric tokens that are not English
    stop words; most metrics are computed over them.

    Args:
        article_text: The text to analyse.

    Returns:
        A 13-tuple, in this order:
        ``(positive_score, negative_score, polarity_score,
        subjectivity_score, avg_sentence_length, percentage_complex_words,
        fog_index, avg_words_per_sentence, complex_word_count, word_count,
        syllables_per_word, personal_pronouns_count, avg_word_length)``.
        Ratios whose denominator would be zero are reported as 0.
    """
    def count_syllables(word):
        # Approximate syllables as runs of consecutive vowels (y included),
        # with a floor of one syllable per word.
        return max(1, len(re.findall(r'[aeiouy]+', word, re.IGNORECASE)))

    # Tokenize the text into sentences and into words
    sentences = sent_tokenize(article_text)
    words = word_tokenize(article_text)

    # Remove punctuation-only tokens and NLTK English stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [
        word.lower()
        for word in words
        if word.isalnum() and word.lower() not in stop_words
    ]
    word_count = len(filtered_words)
    sentence_count = len(sentences)

    # Positive / negative scores from NLTK's opinion lexicon
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())
    positive_score = sum(1 for word in filtered_words if word in positive_words)
    negative_score = sum(1 for word in filtered_words if word in negative_words)

    # The small epsilon keeps both ratios defined when a denominator is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    # Average sentence length is based on every token, punctuation included;
    # average words per sentence is based on the filtered words only.
    avg_sentence_length = len(words) / sentence_count if sentence_count else 0
    avg_words_per_sentence = word_count / sentence_count if sentence_count else 0

    # Syllable counts are needed for both the complex-word metrics and the
    # syllables-per-word average, so compute them once.
    syllable_counts = [count_syllables(word) for word in filtered_words]

    # A complex word has more than two syllables. (Testing the character
    # length instead would classify almost every word as complex.)
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = complex_word_count / word_count if word_count else 0

    # Fog index from the two readability components above
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    syllables_per_word = sum(syllable_counts) / word_count if word_count else 0

    # Personal pronouns are matched case-insensitively, but the all-caps
    # token "US" is skipped so the country abbreviation is not counted as
    # the pronoun "us".
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', article_text, re.IGNORECASE)
    personal_pronouns_count = sum(1 for match in pronoun_matches if match != 'US')

    avg_word_length = sum(len(word) for word in filtered_words) / word_count if word_count else 0

    return (positive_score, negative_score, polarity_score, subjectivity_score,
            avg_sentence_length, percentage_complex_words, fog_index,
            avg_words_per_sentence, complex_word_count, word_count,
            syllables_per_word, personal_pronouns_count, avg_word_length)

# Read URLs from input Excel file (expects 'URL_ID' and 'URL' columns)
input_data = pd.read_excel('Input.xlsx')

# Define column names for output data
output_columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
                  'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# Number of metric columns that follow URL_ID and URL; used to pad the row
# of a URL whose page could not be downloaded.
metric_count = len(output_columns) - 2

# Process each URL
output_data = []
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Extract article title and text. A network failure on one URL must not
    # abort the whole batch (and lose every result gathered so far), so the
    # failure is reported and the row is written with empty metrics.
    try:
        title, article_text = extract_article_text(url)
    except requests.RequestException as error:
        print(f'Could not fetch {url_id} ({url}): {error}')
        output_row = [url_id, url, *([None] * metric_count)]
        output_data.append(output_row)
        continue

    # Perform text analysis
    text_analysis_result = perform_text_analysis(article_text)

    # Prepare output data
    output_row = [url_id, url, *text_analysis_result]
    output_data.append(output_row)

# Create DataFrame for output data
output_df = pd.DataFrame(output_data, columns=output_columns)

# Save output to Excel file
output_df.to_excel('Output_Data.xlsx', index=False)


[nltk_data] Downloading package punkt to C:\Users\AKASH
[nltk_data]     VERMA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\AKASH
[nltk_data]     VERMA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package opinion_lexicon to C:\Users\AKASH
[nltk_data]     VERMA\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
