In [7]:
import warnings
# Suppress every Python warning raised after this point for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported below are hidden too — confirm that is intended.
warnings.filterwarnings('ignore')

In [9]:
# Importing necessary libraries
from bs4 import BeautifulSoup as bs
import requests
import csv
import re
from docx import Document
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


In [12]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Function to get the title and article content from a given link
def fetch_article(link, timeout=30):
    """Download a web page and pull out its headline and body text.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on an unresponsive
            host, which would stall the whole scraping loop.

    Returns:
        A ``(title_text, article_content)`` tuple of strings. If no matching
        ``<h1>`` is found the title is ``"No Title Found"``; if no
        ``div.content-wrapper`` is found the content is ``"No Content Found"``.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out. HTTP error statuses are not raised; the returned page is
            parsed like any other.
    """
    page = requests.get(link, timeout=timeout)
    soup = bs(page.text, 'html.parser')

    # The headline is looked up by its full class string; the second entry is
    # the fallback tried when the first layout is not present on the page.
    title_classes = (
        'article-title font-primary text-secondary-900 text-4xl lg:text-15xl font-bold leading-2xl lg:leading-15xl mb-3',
        'featured-template-title entry-title text-center text-2xl max-w-300 md:max-w-720 lg:text-15xl font-bold font-primary leading-2xl lg:leading-15xl mb-4 text-secondary-900 md:mx-4 lg:mx-0',
    )
    title = None
    for css_class in title_classes:
        title = soup.find('h1', class_=css_class)
        if title:
            break
    title_text = title.get_text() if title else "No Title Found"

    # Extract the article content
    article = soup.find('div', class_="content-wrapper")
    article_content = article.get_text() if article else "No Content Found"

    return title_text, article_content

# Function to find hyperlinks in a cell
def find_hyperlinks(cell_value):
    """Return every http/https URL in ``cell_value``, in order of appearance.

    A URL is taken to run from ``http://`` or ``https://`` up to the next
    whitespace character (or the end of the string).
    """
    url_pattern = re.compile(r'(https?://[^\s]+)')
    return [match.group(1) for match in url_pattern.finditer(cell_value)]

# Function to find URL IDs in a cell
def find_url_ids(cell_value):
    """Return every ``Article<digits>`` identifier in ``cell_value``, in order.

    Matching is case-sensitive and requires at least one digit after
    ``Article``.
    """
    id_pattern = re.compile(r'Article[0-9]+')
    return [match.group(0) for match in id_pattern.finditer(cell_value)]

# Path to the Input CSV file
csv_file_path = "C:\\Users\\Rakshitha\\Automated Web Article Analysis\\Input.csv"

# Read the CSV file into a list of rows (each row is a list of cell strings).
# newline='' hands line-ending handling to the csv module, which is what its
# documentation requires so that quoted fields containing newlines are parsed
# correctly.
with open(csv_file_path, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    data = list(reader)

# Function to find hyperlinks and URL IDs in a row
def find_links_and_ids(row):
    """Collect all hyperlinks and all URL IDs found across the cells of ``row``.

    Returns:
        A ``(links, ids)`` tuple of lists, each in cell order and then in
        order of appearance within a cell.
    """
    found_links = []
    found_ids = []
    for cell in row:
        # Extending with an empty result is a no-op, so no emptiness check is needed.
        found_links += find_hyperlinks(cell)
        found_ids += find_url_ids(cell)
    return found_links, found_ids

# Walk every CSV row and accumulate the hyperlinks and URL IDs it contains.
all_hyperlinks = []
all_url_ids = []
for csv_row in data:
    row_links, row_ids = find_links_and_ids(csv_row)
    all_hyperlinks += row_links
    all_url_ids += row_ids

# Download each article and save it as "<URL ID>.docx" in the working directory.
# docx_paths is created before the length check so the loops further down can
# always iterate over it, even when nothing was saved.
docx_paths = []   # paths of the .docx files that were written successfully
failed_ids = []   # URL IDs whose article could not be fetched or saved

# Links and IDs are paired positionally, so their counts must agree.
if len(all_hyperlinks) != len(all_url_ids):
    print("Mismatch between the number of hyperlinks and URL IDs")
else:
    for link, url_id in zip(all_hyperlinks, all_url_ids):
        try:
            # Fetch the article text
            title, article_content = fetch_article(link)

            # Create a new Word document
            doc = Document()
            doc.add_heading(title, 0)
            doc.add_paragraph(article_content)

            # Save the document with the URL ID as the filename
            docx_path = f"{url_id}.docx"
            doc.save(docx_path)
            docx_paths.append(docx_path)

            print(f"Saved document for URL ID: {url_id}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next article.
            failed_ids.append(url_id)
            print(f"Error processing {link} with URL ID {url_id}: {e}")

    # Only claim success when every article was actually written.
    if failed_ids:
        print(f"{len(failed_ids)} document(s) could not be saved: {failed_ids}")
    else:
        print("All documents saved successfully.")

# Function to extract text from a docx file
def extract_text_from_docx(docx_path):
    """Open the Word file at ``docx_path`` and return its paragraph texts joined by single spaces."""
    document = Document(docx_path)
    paragraph_texts = [para.text for para in document.paragraphs]
    return ' '.join(paragraph_texts)

# Function to remove stopwords from text
def remove_stopwords(text):
    """Drop NLTK English stop words from ``text``.

    The text is split on whitespace, each piece is compared case-insensitively
    against the stop-word list, and the surviving pieces are re-joined with
    single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Process each document
# NOTE(review): extracted_text and cleaned_text are overwritten on every
# iteration and nothing inside this loop consumes them; the scoring loop
# further down recomputes both values for each document. This pass looks
# redundant — confirm before removing.
for docx_path in docx_paths:
    extracted_text = extract_text_from_docx(docx_path)
    cleaned_text = remove_stopwords(extracted_text)

# Function to load positive and negative sentiment dictionaries
def _read_word_list(path):
    """Read ``path`` (latin-1, one entry per line) and return the non-empty, lower-cased entries as a set."""
    with open(path, 'r', encoding='latin-1') as handle:
        stripped_entries = (line.strip().lower() for line in handle)
        # Blank lines would otherwise put an empty string into the lexicon.
        return {entry for entry in stripped_entries if entry}


def load_sentiment_dictionaries(dictionary_dir="C:\\Users\\Rakshitha\\Automated Web Article Analysis\\MasterDictionary"):
    """Load the positive and negative word lists, excluding English stop words.

    Args:
        dictionary_dir: Folder containing ``positive-words.txt`` and
            ``negative-words.txt``. Defaults to the location this notebook
            was originally written against.

    Returns:
        A ``(positive_words, negative_words)`` tuple of sets of lower-cased
        words, each with the NLTK English stop words removed.

    Raises:
        OSError: if either word-list file cannot be opened.
    """
    import os  # local import: the notebook's import cell does not bring in os

    positive_words = _read_word_list(os.path.join(dictionary_dir, "positive-words.txt"))
    negative_words = _read_word_list(os.path.join(dictionary_dir, "negative-words.txt"))

    stop_words = set(stopwords.words('english'))
    return positive_words - stop_words, negative_words - stop_words

# Function to calculate sentiment scores
def calculate_scores(cleaned_text):
    """Score ``cleaned_text`` against the positive and negative word lists.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``.

    Returns:
        ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where the first two are token counts, polarity is
        ``(pos - neg) / (pos + neg + 0.000001)`` and subjectivity is
        ``(pos + neg) / (token_count + 0.000001)``.
    """
    positive_lexicon, negative_lexicon = load_sentiment_dictionaries()
    tokens = nltk.word_tokenize(cleaned_text.lower())

    positive_score = 0
    negative_score = 0
    for token in tokens:
        if token in positive_lexicon:
            positive_score += 1
        if token in negative_lexicon:
            negative_score += 1

    # The small constant keeps both ratios defined when a denominator would be zero.
    matched_total = positive_score + negative_score
    polarity_score = (positive_score - negative_score) / (matched_total + 0.000001)
    subjectivity_score = matched_total / (len(tokens) + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word``.

    Each run of consecutive vowels (a, e, i, o, u, y) counts as one syllable.
    One is subtracted for a trailing "e", and one more for words longer than
    two characters that end in "es" or "ed". The result is never below 1.
    """
    lowered = word.lower()
    vowel_runs = re.findall(r'[aeiouy]+', lowered)
    total = len(vowel_runs)

    if lowered.endswith('e'):
        total -= 1
    if len(lowered) > 2 and lowered.endswith(('es', 'ed')):
        total -= 1

    return total if total > 1 else 1

# Function to count complex words
def count_complex_words(word_list):
    """Return how many words in ``word_list`` have more than two syllables according to ``count_syllables``."""
    complex_total = 0
    for candidate in word_list:
        if count_syllables(candidate) > 2:
            complex_total += 1
    return complex_total

# Function to calculate readability metrics
def calculate_readability_metrics(text):
    """Compute sentence, word and syllable statistics for ``text``.

    Sentences come from ``sent_tokenize``. Words are the ``word_tokenize``
    tokens that are purely alphabetic and are not NLTK English stop words.

    Args:
        text: The text to analyse.

    Returns:
        A dict with these keys, in this order:
        ``avg_sentence_length`` and ``avg_words_per_sentence`` (both
        words / sentences), ``percentage_complex_words`` (0-100),
        ``fog_index`` (0.4 * (avg_sentence_length + percentage_complex_words)),
        ``avg_word_length`` (characters per word), ``word_count``,
        ``num_complex_words``, ``syllable_counts`` (list with one entry per
        word) and ``personal_pronoun_count``.

        When the text has no sentences or no countable words, the ratios that
        would divide by zero are reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    stop_words = set(stopwords.words('english'))
    # NOTE(review): with IGNORECASE this also matches upper-case "US" —
    # confirm whether that should be counted as a pronoun.
    personal_pronouns = re.compile(r'\b(I|we|my|ours|us)\b', re.IGNORECASE)

    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.isalpha() and word.lower() not in stop_words]

    num_sentences = len(sentences)
    num_words = len(cleaned_words)
    num_complex_words = count_complex_words(cleaned_words)
    syllable_counts = [count_syllables(word) for word in cleaned_words]
    personal_pronoun_count = len(personal_pronouns.findall(text))

    # Guard every ratio: an empty or punctuation-only text has no sentences
    # and/or no countable words.
    avg_sentence_length = num_words / num_sentences if num_sentences else 0.0
    avg_words_per_sentence = avg_sentence_length
    percentage_complex_words = (num_complex_words / num_words) * 100 if num_words else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_word_length = sum(len(word) for word in cleaned_words) / num_words if num_words else 0.0
    word_count = num_words

    return {
        "avg_sentence_length": avg_sentence_length,
        "avg_words_per_sentence": avg_words_per_sentence,
        "percentage_complex_words": percentage_complex_words,
        "fog_index": fog_index,
        "avg_word_length": avg_word_length,
        "word_count": word_count,
        "num_complex_words": num_complex_words,
        "syllable_counts": syllable_counts,
        "personal_pronoun_count": personal_pronoun_count
    }

# Function to save scores to a CSV file
def save_scores_to_csv(file_path, scores):
    """Append one row of scores to the CSV at ``file_path`` and rewrite the file.

    Args:
        file_path: Destination CSV. A missing or empty file is treated as
            having no rows yet.
        scores: Sequence of 15 values in the column order listed below.
    """
    header = [
        'URL ID', 'URL',
        'Positive Score', 'Negative Score', 'Polarity Score', 'Subjectivity Score',
        'Average Sentence Length', 'Average Words Per Sentence',
        'Percentage of Complex Words', 'Fog Index', 'Average Word Length',
        'Word Count', 'Complex Word Count', 'Total Syllables',
        'Personal Pronoun Count',
    ]

    incoming_row = pd.DataFrame([scores], columns=header)

    # Start from whatever is already on disk, or from an empty table.
    try:
        existing_rows = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        existing_rows = pd.DataFrame(columns=header)

    combined = pd.concat([existing_rows, incoming_row], ignore_index=True)
    combined.to_csv(file_path, index=False)
    print(f"Scores saved to {file_path}")

# Process each document and save scores
# NOTE: save_scores_to_csv appends to the existing file, so re-running this
# cell adds the same rows to Output.csv again.
csv_output_path = "C:\\Users\\Rakshitha\\Automated Web Article Analysis\\Output.csv"

for docx_path in docx_paths:
    extracted_text = extract_text_from_docx(docx_path)
    cleaned_text = remove_stopwords(extracted_text)

    pos_score, neg_score, polarity_score, subjectivity_score = calculate_scores(cleaned_text)

    # Readability is computed on the original text: calculate_readability_metrics
    # filters stop words from its word list itself, and its personal-pronoun
    # count needs the pronouns that remove_stopwords would already have removed.
    metrics = calculate_readability_metrics(extracted_text)

    # Recover the URL ID from the file name and look up the link it was paired with.
    url_id = re.findall(r'Article[0-9]+', docx_path)[0]
    link = all_hyperlinks[all_url_ids.index(url_id)]

    # Build the row explicitly in the column order used by save_scores_to_csv.
    # 'Total Syllables' is the sum of the per-word counts, not the list itself.
    scores = [
        url_id, link,
        pos_score, neg_score, polarity_score, subjectivity_score,
        metrics["avg_sentence_length"],
        metrics["avg_words_per_sentence"],
        metrics["percentage_complex_words"],
        metrics["fog_index"],
        metrics["avg_word_length"],
        metrics["word_count"],
        metrics["num_complex_words"],
        sum(metrics["syllable_counts"]),
        metrics["personal_pronoun_count"],
    ]

    save_scores_to_csv(csv_output_path, scores)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rakshitha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rakshitha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Saved document for URL ID: Article1
Saved document for URL ID: Article2
Saved document for URL ID: Article3
Saved document for URL ID: Article4
Saved document for URL ID: Article5
Saved document for URL ID: Article6
Saved document for URL ID: Article7
Saved document for URL ID: Article8
Saved document for URL ID: Article9
Saved document for URL ID: Article10
Saved document for URL ID: Article11
Saved document for URL ID: Article12
Saved document for URL ID: Article13
Saved document for URL ID: Article14
Saved document for URL ID: Article15
All documents saved successfully.
Scores saved to C:\Users\Rakshitha\Automated Web Article Analysis\Output.csv
Scores saved to C:\Users\Rakshitha\Automated Web Article Analysis\Output.csv
Scores saved to C:\Users\Rakshitha\Automated Web Article Analysis\Output.csv
Scores saved to C:\Users\Rakshitha\Automated Web Article Analysis\Output.csv
Scores saved to C:\Users\Rakshitha\Automated Web Article Analysis\Output.csv
Scores saved to C:\Users\Rakshitha\