In [2]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat import textstat
import re

# Download required nltk resources.
# 'punkt_tab' is the tokenizer data loaded by sent_tokenize/word_tokenize on
# recent NLTK releases; 'punkt' is kept for older installations.
for resource in ('vader_lexicon', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Function to calculate average sentence length and complex words
def text_statistics(text):
    """Compute sentence-length and complex-word statistics for ``text``.

    Sentences come from ``nltk.sent_tokenize`` and words from
    ``nltk.word_tokenize``. Tokens that contain no letter or digit
    (standalone punctuation such as "," or ".") are not counted as words.

    Args:
        text: The text to analyse.

    Returns:
        A 4-tuple ``(avg_sentence_length, percentage_of_complex_words,
        complex_word_count, word_count)``. A word is complex when
        ``textstat.syllable_count`` reports 3 or more syllables. The average
        is 0 when there are no sentences and the percentage is 0 when there
        are no words.
    """
    sentences = nltk.sent_tokenize(text)

    # word_tokenize returns punctuation marks as separate tokens; keeping them
    # would inflate the word count and dilute the complex-word percentage.
    words = [
        token for token in nltk.word_tokenize(text)
        if any(ch.isalnum() for ch in token)
    ]
    word_count = len(words)

    # Average sentence length in words
    avg_sentence_length = word_count / len(sentences) if sentences else 0

    # Complex words are words with 3 or more syllables
    complex_word_count = sum(
        1 for word in words if textstat.syllable_count(word) >= 3
    )
    percentage_of_complex_words = (
        complex_word_count / word_count * 100 if word_count else 0
    )

    return avg_sentence_length, percentage_of_complex_words, complex_word_count, word_count

# Function to extract article text
def extract_article_text(url, timeout=30):
    """Download ``url`` and return the page title and paragraph text.

    The title is the text of the first ``<h1>`` element; the article text is
    the text of every ``<p>`` element on the page joined with newlines.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the whole run.

    Returns:
        ``(title, article_text)`` on success, or ``(None, None)`` when the
        request fails, the server answers with an HTTP error status, or the
        page has no ``<h1>``. The failure reason is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Do not analyse error pages (404, 500, ...) as if they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        heading = soup.find('h1')
        if heading is None:
            raise ValueError("no <h1> title found on page")
        title = heading.get_text(strip=True)

        article_text = "\n".join(
            p.get_text(strip=True) for p in soup.find_all('p')
        )

        return title, article_text
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole batch.
        print(f"Error extracting {url}: {e}")
        return None, None

# Function to calculate sentiment scores using VADER
_vader_analyzer = None


def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


def sentiment_analysis(text):
    """Score ``text`` with VADER.

    The analyzer is built once and reused across calls instead of being
    re-created for every article.

    Args:
        text: The text to score.

    Returns:
        ``(positive_score, negative_score, polarity_score)`` taken from the
        ``'pos'``, ``'neg'`` and ``'compound'`` entries of
        ``polarity_scores(text)``.
    """
    sentiment_scores = _get_vader_analyzer().polarity_scores(text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    polarity_score = sentiment_scores['compound']
    return positive_score, negative_score, polarity_score

# Function to calculate subjectivity score using TextBlob (an alternative to manual method)
from textblob import TextBlob

def subjectivity_score(text):
    """Return the subjectivity value of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.subjectivity

# Gunning Fog readability index
def fog_index(text):
    """Return the Gunning Fog index of ``text`` as computed by textstat."""
    fog_score = textstat.gunning_fog(text)
    return fog_score

# Function to count personal pronouns
_PERSONAL_PRONOUN_RE = re.compile(r'\b(I|we|my|ours|us)\b', re.I)


def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is case-insensitive and restricted to whole words. The exact
    upper-case token "US" is skipped because it is far more likely to be the
    country abbreviation than the pronoun.

    Args:
        text: The text to scan.

    Returns:
        The number of pronoun occurrences as an int.
    """
    return sum(
        1 for match in _PERSONAL_PRONOUN_RE.finditer(text)
        if match.group() != 'US'
    )

# Load the Excel file containing URLs
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

# Column order of the output sheet. Passing it to the DataFrame constructor
# below fixes the order and keeps the header row even when no article could
# be processed.
output_columns = ['URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                  'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                  'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
                  'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

# One dict per successfully processed article; the DataFrame is built once at the end
rows = []
# URL_IDs whose page could not be fetched or parsed
skipped_url_ids = []

# Loop through the URLs and extract data
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    title, article_text = extract_article_text(url)

    if not (title and article_text):
        skipped_url_ids.append(url_id)
        continue

    # Sentiment and subjectivity
    positive_score, negative_score, polarity_score = sentiment_analysis(article_text)
    subj_score = subjectivity_score(article_text)

    # Readability metrics
    avg_sentence_length, percent_complex_words, complex_word_count, word_count = text_statistics(article_text)
    fog_idx = fog_index(article_text)
    syllable_per_word = textstat.syllable_count(article_text) / word_count if word_count else 0
    personal_pronouns = count_personal_pronouns(article_text)

    # Average word length over real words only: word_tokenize also returns
    # punctuation marks, which would otherwise drag the average down.
    word_tokens = [
        token for token in nltk.word_tokenize(article_text)
        if any(ch.isalnum() for ch in token)
    ]
    avg_word_length = (
        sum(len(token) for token in word_tokens) / len(word_tokens)
        if word_tokens else 0
    )

    # Collect the results as a dictionary and add it to the list
    rows.append({
        'URL_ID': url_id, 'URL': url, 'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score, 'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subj_score, 'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percent_complex_words, 'FOG INDEX': fog_idx,
        # Same value as AVG SENTENCE LENGTH: both are words per sentence
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': complex_word_count, 'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word, 'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    })

# Once all rows are collected, create a DataFrame with the declared column order
output_df = pd.DataFrame(rows, columns=output_columns)

# Save the output to an Excel file
output_file = 'output.xlsx'
output_df.to_excel(output_file, index=False)
if skipped_url_ids:
    print(f"Skipped {len(skipped_url_ids)} URL(s) that could not be extracted: {skipped_url_ids}")
print(f"Analysis complete. Results saved to {output_file}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Analysis complete. Results saved to output.xlsx
