In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
from nltk.sentiment.util import mark_negation
from nltk import tokenize
import re

In [2]:
# Download the NLTK 'punkt' resource; the log below shows NLTK reporting it as
# already up to date when it is present locally.
# Presumably this is the tokenizer data behind sent_tokenize / word_tokenize used
# later in the notebook — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download the NLTK 'stopwords' corpus.
# NOTE(review): nltk.corpus.stopwords is imported at the top, but no visible cell
# reads it (stopwords come from a CSV instead) — confirm this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Download the NLTK 'vader_lexicon' package.
# Presumably this is the lexicon required by the SentimentIntensityAnalyzer
# instances created further down — confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of the vowels ``a, e, i, o, u`` (either case) counts as one
    syllable. A trailing ``e``, ``es`` or ``ed`` that directly follows a
    non-vowel is treated as silent and removes one syllable again
    (``make``, ``makes``, ``jumped``). ``y`` is never treated as a vowel.

    This is an approximation, not a dictionary lookup: words such as
    ``wanted`` or ``table`` are under-counted.

    Parameters
    ----------
    word : str
        The token to analyse.

    Returns
    -------
    int
        ``0`` for an empty string, otherwise an estimate of at least ``1``.
    """
    vowels = 'aeiouAEIOU'

    if not word:
        return 0

    # One syllable per run of consecutive vowels.
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    # Discount a silent ending. The character before the ending must be a
    # non-vowel; otherwise the final "e" is part of a vowel run that was
    # counted once already ("agree", "goes") and must not be subtracted.
    lowered = word.lower()
    if lowered.endswith(('es', 'ed')):
        silent_ending = len(lowered) > 2 and lowered[-3] not in vowels
    elif lowered.endswith('e'):
        silent_ending = len(lowered) > 1 and lowered[-2] not in vowels
    else:
        silent_ending = False

    if silent_ending:
        count -= 1

    # Any non-empty token is assumed to carry at least one syllable.
    return max(count, 1)

In [6]:
# Read the Excel file containing the URLs
file_path = 'C:/Users/shiva/Documents/Assignment/Input.xlsx'
df = pd.read_excel(file_path)

# Extract the URLs from the Excel file.
# Blank cells are read as NaN and padded cells keep their whitespace; both would
# be passed straight to requests.get, so drop the former and strip the latter.
urls = [
    str(raw_url).strip()
    for raw_url in df['URL'].dropna()
    if str(raw_url).strip()
]

In [7]:
# Read the CSV file containing stopwords
stopwords_file_path = 'C:/Users/shiva/Documents/Assignment/Stopwords.csv'
stopwords_df = pd.read_csv(stopwords_file_path, encoding='latin-1')

# Extract the stopwords from the CSV file.
# The analysis loop compares these entries with `token.lower()`, so entries that
# are upper-case, padded with whitespace, or NaN could never match; normalise
# them once here.
stopwords_list = (
    stopwords_df['Stopwords']
    .dropna()
    .astype(str)
    .str.strip()
    .str.lower()
    .tolist()
)

In [8]:
# Read the text file containing negative words
negative_words_file_path = 'C:/Users/shiva/Documents/Assignment/MasterDictionary/negative-words.txt'
# Decode explicitly instead of relying on the platform default encoding; latin-1
# matches how the stopwords CSV in this notebook is read and never fails to decode.
with open(negative_words_file_path, 'r', encoding='latin-1') as file:
    # One word per line. Strip and lower-case each entry because lookups are done
    # with `token.lower()`, and skip blank lines.
    negative_words = [line.strip().lower() for line in file if line.strip()]

In [9]:
# Read the text file containing positive words
positive_words_file_path = 'C:/Users/shiva/Documents/Assignment/MasterDictionary/positive-words.txt'
# Decode explicitly instead of relying on the platform default encoding; latin-1
# matches how the stopwords CSV in this notebook is read and never fails to decode.
with open(positive_words_file_path, 'r', encoding='latin-1') as file:
    # One word per line. Strip and lower-case each entry because lookups are done
    # with `token.lower()`, and skip blank lines.
    positive_words = [line.strip().lower() for line in file if line.strip()]

In [10]:
# Initialize sentiment analyzers
# `sia` is built from the class imported from nltk.sentiment; `vader_sia` from the
# class imported (under an alias) from nltk.sentiment.vader.
# NOTE(review): these look like the same VADER analyzer reached through two import
# paths — confirm; if so, a single instance would be enough.
sia = SentimentIntensityAnalyzer()
vader_sia = VaderSentimentIntensityAnalyzer()

In [None]:
# Initialize results list
# Collects one result dictionary per URL; the analysis loop appends to it and a
# DataFrame is built from it afterwards.
results: list = []

In [14]:
# Define the personal pronouns regex pattern
# Matches the whole words (\b anchors) I, we, my, ours and us. The pattern itself
# is case-sensitive; the analysis loop applies it with re.IGNORECASE.
pronouns_pattern = r"\b(I|we|my|ours|us)\b"

In [15]:
# Seconds to wait for a server before giving up on a URL; without an explicit
# timeout a stalled connection would block the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# Lower-cased set of every word that is removed from the "cleaned" tokens
# (stopwords plus the negative and positive word lists). A set gives
# case-insensitive, constant-time lookups.
# NOTE(review): the previous filter referenced an undefined name `stop_words`;
# `stopwords_list` is the only stopword collection this notebook builds — confirm
# it is the intended one.
excluded_words = {
    entry.strip().lower()
    for word_list in (stopwords_list, negative_words, positive_words)
    for entry in word_list
    if isinstance(entry, str)
}

# Start from an empty list so re-running this cell does not append duplicate rows.
results = []

for url in urls:
    # Fetch the page. On any request failure (timeout, connection error, HTTP
    # error status) keep a row holding only the URL so the output still has one
    # row per input URL; the metric columns are left empty for it.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Could not fetch {url}: {error}")
        results.append({'URL': url})
        continue

    # Extract the visible text and split it into sentences
    text = BeautifulSoup(response.text, 'html.parser').get_text()
    sentences = sent_tokenize(text)

    # Per-URL accumulators
    cleaned_words = []
    personal_pronoun_count = 0
    positive_score = 0
    negative_score = 0
    polarity_score = 0
    subjectivity_score = 0

    for sentence in sentences:
        # Keep only tokens containing a letter or digit; bare punctuation is not a word.
        words = [
            token
            for token in word_tokenize(sentence)
            if any(char.isalnum() for char in token)
        ]
        if not words:
            continue

        # Cleaned words: drop stopwords and the negative/positive dictionary words
        cleaned_words.extend(
            token for token in words if token.lower() not in excluded_words
        )

        # Count pronouns on the raw sentence: counting after stopword removal would
        # miss any pronoun that is also a stopword. The all-caps "US" is skipped
        # because re.IGNORECASE would otherwise count the country abbreviation as "us".
        personal_pronoun_count += sum(
            1
            for match in re.finditer(pronouns_pattern, sentence, re.IGNORECASE)
            if match.group(0) != 'US'
        )

        # Score the untouched sentence. Scoring the filtered tokens meant every
        # positive/negative word had already been removed, and mark_negation adds
        # "_NEG" suffixes that the VADER lexicon cannot match.
        sentiment = sia.polarity_scores(sentence)
        positive_score += sentiment['pos']
        negative_score += sentiment['neg']
        polarity_score += sentiment['compound']
        # 'neu' + 'pos' + 'neg' is ~1 for every sentence, so summing all three only
        # counted sentences. Use the non-neutral share instead.
        # NOTE(review): confirm this matches the intended subjectivity definition.
        subjectivity_score += sentiment['pos'] + sentiment['neg']

    # Totals over the cleaned words
    sentence_count = len(sentences)
    total_cleaned_words = len(cleaned_words)
    syllables_per_word = [count_syllables(word) for word in cleaned_words]
    syllable_count = sum(syllables_per_word)
    # A complex word has more than two syllables
    complex_word_count = sum(1 for syllables in syllables_per_word if syllables > 2)
    total_characters = sum(len(word) for word in cleaned_words)

    # Readability metrics; pages without sentences or words yield 0.0 rather than
    # raising ZeroDivisionError.
    average_sentence_length = (
        total_cleaned_words / sentence_count if sentence_count else 0.0
    )
    percentage_complex_words = (
        complex_word_count / total_cleaned_words * 100 if total_cleaned_words else 0.0
    )
    Fog_Index = 0.4 * (average_sentence_length + percentage_complex_words)

    # Average word count per sentence (same ratio as the average sentence length)
    average_words_per_sentence = average_sentence_length

    # Average word length in characters
    average_word_length = (
        total_characters / total_cleaned_words if total_cleaned_words else 0.0
    )

    # Create a dictionary of results for the current URL.
    # The trailing colon in 'Average Word Length:' is kept as-is so the output
    # column name does not change for existing consumers.
    result = {
        'URL': url,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': average_sentence_length,
        'Percentage Complex Words': percentage_complex_words,
        'Fog Index': Fog_Index,
        'Average Words Per Sentence': average_words_per_sentence,
        'Total Cleaned Words': total_cleaned_words,
        'Complex Word Count': complex_word_count,
        'Syllable Count': syllable_count,
        'Personal Pronoun Count': personal_pronoun_count,
        'Average Word Length:': average_word_length
    }

    # Append the result to the results list
    results.append(result)

# Create a DataFrame from the results list
df_results = pd.DataFrame(results)

# Save the DataFrame to an XLSX file
output_file_path = 'C:/Users/shiva/Documents/Assignment/Output Data Structure.xlsx'
df_results.to_excel(output_file_path)

print("Output saved to:", output_file_path)


Output saved to: C:/Users/shiva/Documents/Assignment/Output Data Structure.xlsx
