In [10]:
# All the necessary dependencies for this program
!pip install newspaper3k lxml_html_clean nltk textstat




In [11]:
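# Imports: article download/parsing (newspaper), tokenisation and stop words (nltk),
# syllable counting (textstat), regular expressions (re) and spreadsheet I/O (pandas)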
from newspaper import Article
import nltk
import re
import textstat
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter

import pandas as pd

# Fetch the NLTK resources used further down: the tokenizer data needed by
# word_tokenize / sent_tokenize and the corpus read by stopwords.words("english").
# The return value of the final call is what the cell displays as its output.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:

# Helper functions for the per-article text analysis (sentiment, readability, word statistics)

def load_words(file_path):
    """Read a word-list file and return its distinct whitespace-separated tokens.

    Args:
        file_path: Path of the text file to read; it is decoded as latin-1.

    Returns:
        A set containing every whitespace-delimited token found in the file.
    """
    with open(file_path, "r", encoding="latin-1") as handle:
        contents = handle.read()
    return set(contents.split())
def sentiment_analysis(text):
    """Score a text against the positive and negative word lists.

    The text is lower-cased and tokenised; only alphanumeric tokens that are
    not in the module-level ``stop_words`` set are kept. Each kept token found
    in ``positive_words`` / ``negative_words`` adds one to the matching score.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys "POSITIVE SCORE", "NEGATIVE SCORE",
        "POLARITY Score" and "SUBJECTIVITY Score"; the last two are rounded
        to four decimal places.
    """
    # Small constant that keeps both ratios defined when a denominator is zero.
    epsilon = 0.000001

    tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in stop_words:
            tokens.append(token)

    positive_score = len([token for token in tokens if token in positive_words])
    negative_score = len([token for token in tokens if token in negative_words])

    matched = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched + epsilon)
    subjectivity_score = matched / (len(tokens) + epsilon)

    scores = {}
    scores["POSITIVE SCORE"] = positive_score
    scores["NEGATIVE SCORE"] = negative_score
    scores["POLARITY Score"] = round(polarity_score, 4)
    scores["SUBJECTIVITY Score"] = round(subjectivity_score, 4)
    return scores

def readability_analysis(text):
    """Compute sentence-length and complex-word readability metrics for a text.

    Two word populations are used:

    * all alphanumeric tokens - the basis for "AVG SENTENCE LENGTH";
    * alphanumeric tokens that are not in the module-level ``stop_words``
      set ("cleaned" words) - the basis for the complex-word figures and
      for "AVG NUMBER OF WORDS PER SENTENCE".

    A word is treated as complex when ``textstat.syllable_count`` reports
    more than two syllables.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys "AVG SENTENCE LENGTH", "PERCENTAGE OF COMPLEX
        WORDS", "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE" and
        "COMPLEX WORD COUNT". All ratios are 0 when the text has no
        sentences / no cleaned words.
    """
    sentences = sent_tokenize(text)
    sentence_count = len(sentences)

    tokens = [word for word in word_tokenize(text) if word.isalnum()]
    words_cleaned = [word for word in tokens if word.lower() not in stop_words]
    complex_words = [word for word in words_cleaned if textstat.syllable_count(word) > 2]

    # Sentence length is measured in words. Taking len() of each sentence
    # string would measure characters and inflate this value and the Fog index.
    avg_sentence_length = len(tokens) / sentence_count if sentence_count else 0

    complex_word_fraction = len(complex_words) / len(words_cleaned) if words_cleaned else 0
    complex_word_percentage = complex_word_fraction * 100

    # Gunning fog: 0.4 * (words per sentence + percentage of complex words).
    # The percentage (0-100), not the 0-1 fraction, goes into the formula so
    # that it matches the "PERCENTAGE OF COMPLEX WORDS" value reported below.
    fog_index = 0.4 * (avg_sentence_length + complex_word_percentage)

    cleaned_words_per_sentence = len(words_cleaned) / sentence_count if sentence_count else 0

    return {
        "AVG SENTENCE LENGTH": round(avg_sentence_length, 2),
        "PERCENTAGE OF COMPLEX WORDS": round(complex_word_percentage, 2),
        "FOG INDEX": round(fog_index, 2),
        "AVG NUMBER OF WORDS PER SENTENCE": round(cleaned_words_per_sentence, 2),
        "COMPLEX WORD COUNT": len(complex_words),
    }


def word_analysis(text):
    """Count the cleaned words of a text and compute their average length.

    A cleaned word is an alphanumeric token whose lower-cased form is not in
    the module-level ``stop_words`` set.

    Args:
        text: The text to analyse.

    Returns:
        A dict with "WORD COUNT" (number of cleaned words) and
        "AVG WORD LENGTH" (mean characters per cleaned word, rounded to two
        decimals; 0 when there are no cleaned words).
    """
    # Only word-level figures are returned, so no sentence tokenisation is done here.
    words_cleaned = [
        word for word in word_tokenize(text)
        if word.isalnum() and word.lower() not in stop_words
    ]

    total_words = len(words_cleaned)
    total_chars = sum(len(word) for word in words_cleaned)
    avg_word_length = total_chars / total_words if total_words else 0

    return {
        "WORD COUNT": total_words,
        "AVG WORD LENGTH": round(avg_word_length, 2),
    }
def personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The all-caps
    token "US" is skipped, because in running text it is the country
    abbreviation rather than the pronoun "us".

    Args:
        text: The text to scan.

    Returns:
        A dict with the single key "PERSONAL PRONOUNS" holding the count.
    """
    # Non-capturing group: findall then yields the matched text itself.
    matches = re.findall(r"\b(?:I|we|my|ours|us)\b", text, re.I)
    pronoun_count = sum(1 for match in matches if match != "US")
    return {"PERSONAL PRONOUNS": pronoun_count}

def cleanText(text):
    """Drop everything from the first line that begins with "summarize".

    Each line is checked after stripping surrounding whitespace and
    lower-casing it. When such a line exists, only the lines before it are
    returned (joined with newlines); otherwise the text is returned unchanged.

    Args:
        text: The raw article text.

    Returns:
        The text truncated before the first "summarize" line, or the
        original text when no line matches.
    """
    rows = text.split("\n")
    cut = next(
        (pos for pos, row in enumerate(rows)
         if row.strip().lower().startswith("summarize")),
        None,
    )
    if cut is None:
        return text
    return "\n".join(rows[:cut])

def count_syllables(word):
    """Estimate the number of syllables in a word.

    The estimate is the number of runs of consecutive vowels (a, e, i, o, u,
    y) in the lower-cased word. Words longer than two characters that end in
    "es" or "ed" have one subtracted. The result is never less than 1.

    Args:
        word: The word to measure.

    Returns:
        The estimated syllable count, at least 1.
    """
    lowered = word.lower()
    vowel_runs = re.findall(r'[aeiouy]+', lowered)
    estimate = len(vowel_runs)

    if len(lowered) > 2 and lowered.endswith(("es", "ed")):
        estimate = estimate - 1

    if estimate < 1:
        return 1
    return estimate


def calculate_syllable_per_word(text):
    """Compute the mean syllable estimate per word of a text.

    Words are the ``\\w+`` runs found by a regular expression; each one is
    measured with ``count_syllables``.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the single key "SYLLABLE PER WORD"; the value is 0 when
        the text contains no words.
    """
    tokens = re.findall(r'\b\w+\b', text)
    token_count = len(tokens)

    if token_count > 0:
        ratio = sum(map(count_syllables, tokens)) / token_count
    else:
        ratio = 0

    return {"SYLLABLE PER WORD": ratio}



In [13]:
# Load the sentiment word lists, the English stop words and the URL sheet
# that the analysis functions and the processing loop read.

POSITIVE_WORDS_FILE = "/content/positive-words.txt"      # positive word list
NEGATIVE_WORDS_FILE = "/content/negative-words.txt"      # negative word list
INPUT_WORKBOOK = "/content/Output Data Structure.xlsx"   # sheet with URL_ID / URL columns

positive_words = load_words(POSITIVE_WORDS_FILE)
negative_words = load_words(NEGATIVE_WORDS_FILE)
stop_words = set(stopwords.words("english"))


df = pd.read_excel(INPUT_WORKBOOK)

In [14]:
# Download every article listed in df, run all analyses on it and write the
# resulting metrics back into the matching row of df.

for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        article = Article(url)
        article.download()
        article.parse()

        # Keep the headline and the body on separate lines. Concatenating them
        # directly would fuse the last title word with the first body word and
        # distort the token-based counts.
        title = article.title or ""
        clean_text = "\n".join([title, cleanText(article.text)])

        sentiment_results = sentiment_analysis(clean_text)
        readability_results = readability_analysis(clean_text)
        word_results = word_analysis(clean_text)
        pronoun_results = personal_pronouns(clean_text)
        syllable_results = calculate_syllable_per_word(clean_text)

        final_results = {**sentiment_results, **readability_results, **word_results, **pronoun_results, **syllable_results}

        # Column names are the upper-cased metric keys.
        for key, value in final_results.items():
            df.at[index, key.upper()] = value

    except Exception as e:
        # Best effort: report the failing URL and continue with the next row.
        print(f"Failed to process {url_id}: {e}")

In [16]:
# Write the enriched DataFrame to an Excel workbook, leaving out the index column.

OUTPUT_WORKBOOK = "output.xlsx"
df.to_excel(OUTPUT_WORKBOOK, index=False)