In [6]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict, stopwords
import re

# Function to count syllables in a word using the CMU Pronouncing Dictionary
def syllable_count(word):
    """Return an estimated number of syllables in *word*.

    The lower-cased word is looked up in the module-level ``cmudictionary``.
    When it is present, each listed pronunciation is scored by counting the
    phonemes whose last character is a digit, and the largest score is
    returned. When it is absent, every vowel letter (a, e, i, o, u in either
    case) in the word is counted as one syllable.
    """
    key = word.lower()

    if key not in cmudictionary:
        # Fallback heuristic: one syllable per vowel letter.
        return len([letter for letter in word if letter in "aeiouAEIOU"])

    per_pronunciation = []
    for pronunciation in cmudictionary[key]:
        digit_terminated = [phoneme for phoneme in pronunciation if phoneme[-1].isdigit()]
        per_pronunciation.append(len(digit_terminated))
    return max(per_pronunciation)

# Load CMU Pronouncing Dictionary for syllable count
cmudictionary = cmudict.dict()

# Define the output file path
output_file_path = "Output_Data_Structure.xlsx"

# Directories
text_dir = "ArticleTexts"
stopwords_dir = "StopWords"
sentiment_dir = "MasterDictionary"

# Create the directories first: the word lists below are read with os.listdir,
# which raises FileNotFoundError when the folder does not exist yet.
for directory in [text_dir, stopwords_dir, sentiment_dir]:
    os.makedirs(directory, exist_ok=True)

# Load stop words from the provided folder.
# Entries are stripped and lower-cased because the analysis only ever compares
# them against lower-cased tokens; an upper-case entry could never match.
# NOTE(review): only whole lines are treated as stop words. If the files carry
# annotations on the same line as the word, they need extra parsing - confirm
# against the actual files.
stop_words_folder = stopwords_dir
stop_words = set()

for filename in os.listdir(stop_words_folder):
    stop_words_path = os.path.join(stop_words_folder, filename)
    # Skip sub-directories (for example notebook checkpoint folders).
    if not os.path.isfile(stop_words_path):
        continue
    with open(stop_words_path, 'r', encoding='latin-1') as file:
        for line in file.read().splitlines():
            entry = line.strip().lower()
            if entry:
                stop_words.add(entry)

# Load positive and negative words from the sentiment directory.
# 'positive-words.txt' feeds the positive set; every other regular file in the
# folder is treated as a list of negative words.
# NOTE(review): any stray file in this folder therefore ends up in `neg` -
# confirm the folder only holds the two dictionaries.
pos = set()
neg = set()

for files in os.listdir(sentiment_dir):
    sentiment_path = os.path.join(sentiment_dir, files)
    if not os.path.isfile(sentiment_path):
        continue
    target = pos if files == 'positive-words.txt' else neg
    with open(sentiment_path, 'r', encoding='ISO-8859-1') as f:
        for line in f.read().splitlines():
            entry = line.strip().lower()
            if entry:
                target.add(entry)

# Define personal pronouns
personal_pronouns = ["i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"]

# Create an empty DataFrame to store the analysis results
output_data = pd.DataFrame(columns=['URL_ID', 'URL', 'Positive Score', 'Negative Score', 'Polarity Score',
                                     'Subjectivity Score', 'Average Sentence Length', 'Percentage of Complex Words',
                                     'Fog Index', 'Average Number of Words Per Sentence', 'Complex Word Count',
                                     'Word Count', 'Syllable Per Word', 'Personal Pronouns', 'Average Word Length'])

# Define the path to your input data file
input_file_path = "Input.xlsx"

# Load input data for analysis (the loop below reads its 'URL_ID' and 'URL' columns)
input_data_analysis = pd.read_excel(input_file_path)

# Loop through each row in the input data: download the page, save its text,
# compute the sentiment/readability metrics and finally write them to Excel.

# Rows are collected in a plain list and converted to a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
result_rows = []

# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host would block the whole run indefinitely.
request_timeout = 30

# Compiled once: strips every character that is neither a word character nor whitespace.
punctuation_pattern = re.compile(r'[^\w\s]')

for _, row in input_data_analysis.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        # Send a GET request to the URL
        response = requests.get(url, headers=headers, allow_redirects=True, timeout=request_timeout)
        response.raise_for_status()  # Raise an exception for bad responses

        # Use BeautifulSoup to parse HTML and extract only article title and text
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract article title (first <h1>, empty string when there is none)
        heading = soup.find('h1')
        title = heading.get_text() if heading else ""

        # Extract article text from all <p> elements. Extraction is best-effort:
        # if it fails part-way, whatever was collected so far is still analysed.
        paragraphs = []
        try:
            for p in soup.find_all('p'):
                paragraphs.append(p.get_text())
        except Exception as text_err:
            print(f"Error extracting text from URL {url_id}: {text_err}")

        # Join with a space so the last word of one paragraph does not fuse with
        # the first word of the next, which would corrupt sentence and word splitting.
        article_text = " ".join(paragraphs).strip()

        if not article_text:
            print(f"No article content found for URL {url}")
            continue

        # Save the extracted article text to a text file with URL_ID as its file name
        text_file_path = os.path.join(text_dir, f"{url_id}.txt")
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(title + '\n' + article_text)

        # Tokenize the lower-cased text with TextBlob
        blob = TextBlob(article_text.lower())

        # Filter out stop words and strip punctuation; tokens that become empty
        # after stripping are dropped so they do not inflate the word count.
        words = []
        for token in blob.words:
            if token.lower() in stop_words:
                continue
            cleaned = punctuation_pattern.sub('', token)
            if cleaned:
                words.append(cleaned)

        sentences = sent_tokenize(article_text)

        # Every metric below divides by one of these counts.
        if not words or not sentences:
            print(f"No analysable words found for URL {url}")
            continue

        word_count = len(words)
        sentence_count = len(sentences)

        # Positive and Negative Score Calculation
        positive_score = sum(1 for word in words if word in pos)
        negative_score = sum(1 for word in words if word in neg)

        # Polarity Score Calculation, clamped to [-1, 1]
        rounded_polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
        rounded_polarity_score = max(-1, min(1, rounded_polarity_score))

        # Subjectivity Score Calculation
        subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

        # Average sentence length, measured in NLTK tokens per sentence
        avg_sentence_length = sum(len(word_tokenize(sentence)) for sentence in sentences) / sentence_count

        # Count syllables once per word and reuse the result below
        syllable_counts = [syllable_count(word) for word in words]

        # Complex words are those with more than two syllables
        complex_word_count = sum(1 for count in syllable_counts if count > 2)
        percentage_of_complex_words = complex_word_count / word_count

        # Fog Index Calculation
        fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

        # Average Number of Words Per Sentence Calculation (cleaned words only)
        avg_words_per_sentence = word_count / sentence_count

        # Syllable Count Per Word Calculation
        syllables_per_word = sum(syllable_counts) / word_count

        # Personal pronouns are counted on the unfiltered NLTK tokens
        tokenized_words = word_tokenize(article_text)
        personal_pronouns_count = sum(1 for word in tokenized_words if word.lower() in personal_pronouns)

        # Average Word Length Calculation
        avg_word_length = sum(len(word) for word in words) / word_count

        # Record the analysis results for this URL
        result_rows.append({
            'URL_ID': url_id,
            'URL': url,
            'Positive Score': positive_score,
            'Negative Score': negative_score,
            'Polarity Score': rounded_polarity_score,
            'Subjectivity Score': subjectivity_score,
            'Average Sentence Length': avg_sentence_length,
            'Percentage of Complex Words': percentage_of_complex_words,
            'Fog Index': fog_index,
            'Average Number of Words Per Sentence': avg_words_per_sentence,
            'Complex Word Count': complex_word_count,
            'Word Count': word_count,
            'Syllable Per Word': syllables_per_word,
            'Personal Pronouns': personal_pronouns_count,
            'Average Word Length': avg_word_length,
        })

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred for {url}: {http_err}")
    except requests.exceptions.RequestException as req_err:
        # Connection failures, timeouts, invalid URLs, ...
        print(f"Request failed for {url}: {req_err}")
    except Exception as e:
        print(f"Error extracting text from URL {url}: {e}")

# Build the result frame in one step, keeping the column order of output_data.
new_rows = pd.DataFrame(result_rows, columns=output_data.columns)
if output_data.empty:
    output_data = new_rows
else:
    output_data = pd.concat([output_data, new_rows], ignore_index=True)

# Save the analysis results to a new Excel file
output_data.to_excel(output_file_path, index=False)

# Print a message indicating the completion of text analysis
print("Text analysis completed successfully.")


HTTP error occurred for https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
HTTP error occurred for https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
HTTP error occurred for https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/ensuring-growth-through-insurance-technology/
Text analysis completed successfully.
