In [None]:
import os
import re
import nltk
import requests
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from google.colab import files
import zipfile
import shutil



os: Used for interacting with the operating system, such as reading files and directories.

re: Provides regular expression support for pattern matching in text (e.g., finding personal pronouns).

nltk: Natural Language Toolkit for text processing tasks like tokenization and stopword removal.

requests: Used to send HTTP requests to fetch webpage content.

pandas: For data manipulation and handling Excel files.

BeautifulSoup: A library for parsing HTML and extracting specific content from web pages.

word_tokenize, sent_tokenize: Functions from NLTK to split text into words and sentences.

google.colab: Used for file uploads in Google Colab (optional if running locally).

zipfile: For extracting ZIP files.

shutil: For file operations like moving or copying files.

In [None]:
# Download required NLTK data.
# 'punkt' and 'stopwords' are fetched so they are present; 'punkt_tab' fixes the LookupError.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Add one more directory to the list NLTK searches for its data files.
nltk.data.path.append('/usr/local/nltk_data')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


nltk.download('punkt'): Downloads the punkt tokenizer models for word and sentence tokenization.

nltk.download('punkt_tab'): Downloads the punkt_tab tokenizer tables; without them, word_tokenize and sent_tokenize can fail with a LookupError.

nltk.data.path.append(...): Adds a custom path for NLTK data files.

In [None]:
# Upload StopWords and MasterDictionary folders manually in Colab
# (`files` is google.colab.files, so this cell only works inside Colab).
# The value returned by files.upload() is kept in `uploaded`; the extraction
# cell iterates over its keys to find the archives to unpack.
print("Please upload the StopWords and MasterDictionary folders as a ZIP file")
uploaded = files.upload()


Please upload the StopWords and MasterDictionary folders as a ZIP file


Saving MasterDictionary-20250305T025258Z-001.zip to MasterDictionary-20250305T025258Z-001 (1).zip
Saving StopWords-20250305T024631Z-001.zip to StopWords-20250305T024631Z-001 (1).zip


files.upload(): Prompts the user to upload ZIP files in Google Colab.

shutil.unpack_archive(): Extracts the uploaded ZIP files into the current directory.

In [None]:
# Unpack every uploaded archive into the current working directory
for archive_name in uploaded:
    shutil.unpack_archive(archive_name, './')

stopwords = set(): Initializes an empty set to store stopwords.

os.listdir(stopwords_path): Lists all files in the StopWords folder.

with open(...): Reads each file and adds its words to the stopwords set.

stopwords.update(...): Updates the set with words from the file.

In [None]:

# Load Stopwords
# Every entry in the StopWords folder is read as whitespace-separated words.
# Words are lower-cased on load because clean_text() lower-cases the text
# before testing membership; entries stored in another case would never match.
stopwords = set()
stopwords_path = "StopWords"
for filename in os.listdir(stopwords_path):
    try:
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(os.path.join(stopwords_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
            stopwords.update(word.lower() for word in f.read().split())
    except OSError as e:
        # Best effort: report an unreadable entry (e.g. a sub-directory) and keep going.
        print(f"Error reading {filename}: {e}")


positive_words and negative_words: Sets to store positive and negative words.

with open(...): Reads the positive and negative word files and updates the respective sets.

In [None]:
# Load Master Dictionary
# Both word lists are lower-cased on load so they compare correctly against the
# lower-cased tokens produced by clean_text(). Each file is read inside its own
# try block so that a failure on one list does not prevent loading the other.
positive_words = set()
negative_words = set()
for dictionary_path, target_words in (
    ("MasterDictionary/positive-words.txt", positive_words),
    ("MasterDictionary/negative-words.txt", negative_words),
):
    try:
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(dictionary_path, 'r', encoding='utf-8', errors='ignore') as f:
            target_words.update(word.lower() for word in f.read().split())
    except OSError as e:
        print(f"Error reading Master Dictionary: {e}")

files.upload(): Prompts the user to upload Input.xlsx.

pd.read_excel(...): Reads the Excel file into a Pandas DataFrame.

In [None]:
# Upload Input.xlsx manually in Colab
# NOTE(review): this rebinds `uploaded`, replacing the value stored by the ZIP
# upload cell. The new value is not read by the visible cells, which open
# "Input.xlsx" by name instead.
print("Please upload Input.xlsx")
uploaded = files.upload()

Please upload Input.xlsx


Saving Input.xlsx to Input.xlsx


In [None]:
# Load input URLs
input_df = pd.read_excel("Input.xlsx")

# Fail fast with a clear message if the sheet lacks the columns that
# analyze_articles() reads from every row (row['URL_ID'] and row['URL']).
missing_columns = {"URL_ID", "URL"} - set(input_df.columns)
if missing_columns:
    raise ValueError(f"Input.xlsx is missing required column(s): {sorted(missing_columns)}")

In [None]:
def extract_text(url, timeout=30):
    """Extract article title and text from URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first <h1> (or "" when the page has none), a newline,
        and the space-joined text of every <p> element. Returns "" when the
        request or the parsing fails, including HTTP error responses.
    """
    try:
        # Without a timeout, requests can wait indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures instead of analysing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1')
        title = heading.get_text(separator=' ', strip=True) if heading else ""

        # separator=' ' keeps words apart when a paragraph contains inline tags
        # (<a>, <b>, ...); strip=True on its own joins the stripped fragments
        # with an empty string and glues neighbouring words together.
        article_text = ' '.join(
            p.get_text(separator=' ', strip=True) for p in soup.find_all('p')
        )
        return title + "\n" + article_text
    except Exception as e:
        # Best effort: report the failure and let the caller skip this URL.
        print(f"Error fetching {url}: {e}")
        return ""

requests.get(url): Fetches the webpage content.

BeautifulSoup(...): Parses the HTML content.

soup.find('h1'): Extracts the article title.

soup.find_all('p'): Extracts all paragraphs.

article_text: Joins the text of all paragraphs with spaces; the function returns the title, a newline, and this text as a single string.

In [None]:
def clean_text(text):
    """Tokenize the lower-cased text, keeping only alphanumeric tokens that are not stopwords."""
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            # Punctuation and other non-alphanumeric tokens are discarded.
            continue
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def analyze_sentiment(words):
    """Score a list of cleaned words against the positive and negative word sets.

    Returns:
        Tuple of (positive count, negative count, polarity, subjectivity).
    """
    pos_score = 0
    neg_score = 0
    for token in words:
        if token in positive_words:
            pos_score += 1
        if token in negative_words:
            neg_score += 1

    matched_total = pos_score + neg_score
    # The small constant keeps both divisions defined when a denominator is zero.
    polarity = (pos_score - neg_score) / (matched_total + 0.000001)
    subjectivity = matched_total / (len(words) + 0.000001)
    return pos_score, neg_score, polarity, subjectivity

def compute_readability(text):
    """Calculate readability metrics.

    Args:
        text: Raw article text.

    Returns:
        Tuple of (average sentence length, proportion of complex words,
        fog index, complex word count, word count).
    """
    sentences = sent_tokenize(text)
    # word_tokenize also emits punctuation as separate tokens; keep only tokens
    # that contain a letter or digit so punctuation does not inflate the word
    # count or the average sentence length.
    words = [token for token in word_tokenize(text) if any(ch.isalnum() for ch in token)]
    word_count = len(words)

    # The small constant keeps the divisions defined for empty input.
    avg_sentence_length = word_count / (len(sentences) + 0.000001)

    # A word is treated as complex when it has more than two vowel letters.
    # Lower-casing first makes capitalised vowels ("Apple", "IMPORTANT") count.
    complex_count = sum(
        1 for word in words
        if sum(1 for c in word.lower() if c in 'aeiou') > 2
    )
    percentage_complex = complex_count / (word_count + 0.000001)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex)
    return avg_sentence_length, percentage_complex, fog_index, complex_count, word_count

def extract_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive on whole words, with one exception: the
    all-caps token "US" is skipped so the country abbreviation is not counted
    as the pronoun "us". (An all-caps sentence containing the pronoun "US"
    is therefore under-counted; that trade-off is deliberate.)

    Args:
        text: Text to scan; None or "" yields 0.

    Returns:
        Number of pronoun occurrences found.
    """
    if not text:
        return 0
    matches = re.finditer(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    return sum(1 for match in matches if match.group(0) != 'US')

word_tokenize(...): Splits the text into individual words.

text.lower(): Converts text to lowercase for case-insensitive comparison.

word.isalnum(): Filters out non-alphanumeric words (e.g., punctuation).

word not in stopwords: Removes stopwords.

pos_score: Counts positive words.

neg_score: Counts negative words.

polarity: Calculates the polarity score (ranges from -1 to +1).

subjectivity: Calculates the subjectivity score (ranges from 0 to +1).

sent_tokenize(...): Splits text into sentences.

word_tokenize(...): Splits text into words.

avg_sentence_length: Calculates the average number of words per sentence.

complex_words: Identifies complex words, approximated here as words containing more than two vowel letters (a rough stand-in for "more than two syllables").

fog_index: Computes the Gunning Fog Index as 0.4 × (average sentence length + proportion of complex words).

re.findall(...): Uses regex to find personal pronouns (I, we, my, ours, us).

re.I: Makes the search case-insensitive.

In [None]:
def analyze_articles():
    """Fetch every URL listed in input_df and compute its text metrics.

    Returns:
        A list with one row (list of 13 values) per URL for which text could
        be extracted; URLs that yield no text are left out.
    """
    results = []
    for _, record in input_df.iterrows():
        url_id = record['URL_ID']
        url = record['URL']

        text = extract_text(url)
        if not text:
            # Nothing was extracted for this URL, so no row is produced for it.
            continue

        words = clean_text(text)
        pos_score, neg_score, polarity, subjectivity = analyze_sentiment(words)
        avg_sent_length, perc_complex, fog, complex_count, word_count = compute_readability(text)
        pronoun_count = extract_personal_pronouns(text)

        # Average length is taken over the cleaned words; the small constant
        # keeps the division defined when no words remain.
        total_characters = sum(len(word) for word in words)
        avg_word_length = total_characters / (len(words) + 0.000001)

        metrics_row = [url_id, url, pos_score, neg_score, polarity, subjectivity]
        metrics_row += [avg_sent_length, perc_complex, fog, complex_count, word_count]
        metrics_row += [pronoun_count, avg_word_length]
        results.append(metrics_row)
    return results

results: Stores the computed metrics for each URL.

input_df.iterrows(): Iterates over each row in the input DataFrame.

extract_text(url): Extracts article text.

clean_text(text): Cleans the text.

analyze_sentiment(words): Computes sentiment scores.

compute_readability(text): Computes readability metrics.

extract_personal_pronouns(text): Counts personal pronouns.

avg_word_length: Calculates the average word length.

In [None]:
# Run analysis and save to Excel
# Column order matches the order of values in each row built by analyze_articles().
output_columns = [
    'URL_ID',
    'URL',
    'POSITIVE SCORE',
    'NEGATIVE SCORE',
    'POLARITY SCORE',
    'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX',
    'COMPLEX WORD COUNT',
    'WORD COUNT',
    'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
output_df = pd.DataFrame(analyze_articles(), columns=output_columns)
output_df.to_excel("Output.xlsx", index=False)

pd.DataFrame(...): Creates a DataFrame from the results.

output_df.to_excel(...): Saves the DataFrame to an Excel file.