In [25]:
import pandas as pd
import requests
import string
import textstat
import nltk
from io import BytesIO
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize

In [26]:
# Fetch a web page and extract the article text from it.
def extract_data_from_url(url, timeout=30):
    """Download ``url`` and return the text of its article container(s).

    The page is parsed with BeautifulSoup and the text of every element
    matching either the ``td-post-content tagdiv-type`` class or the
    ``data-td-block-uid="tdi_130"`` attribute is joined with newlines.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        The extracted text (an empty string when no element matches), or
        ``None`` when the request fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

    if response.status_code != 200:
        # Report the failure instead of silently dropping the URL.
        print(f"Error fetching data from {url}: HTTP {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # content selection by class_name & attribute
    content_elements = (
        soup.find_all(class_='td-post-content tagdiv-type')
        + soup.find_all(attrs={"data-td-block-uid": "tdi_130"})
    )
    return '\n'.join(element.get_text() for element in content_elements)

In [27]:
def read_words(file_path):
    """Read a word list from ``file_path``, one entry per line.

    Each line is stripped of surrounding whitespace and lowercased. Blank
    lines are kept as empty strings, and the original line order is preserved.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one normalized string per line of the file.
    """
    normalized = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            normalized.append(raw_line.strip().lower())
    return normalized

In [28]:
def calculate_positive_score(text, positive_words_file):
    """Return the percentage of tokens in ``text`` found in the positive word list.

    Args:
        text: Text to score; it is lowercased before tokenization.
        positive_words_file: Path of the word-list file, loaded with ``read_words``.

    Returns:
        ``matching_tokens / total_tokens * 100`` as a float; ``0.0`` when the
        text yields no tokens (the denominator is clamped to 1).
    """
    # A set gives constant-time membership tests; the list returned by
    # read_words would be scanned in full for every token.
    positive_words = set(read_words(positive_words_file))
    words = word_tokenize(text.lower())
    positive_word_count = sum(1 for word in words if word in positive_words)
    total_words = len(words)
    return (positive_word_count / max(total_words, 1)) * 100

In [29]:
def calculate_negative_score(text,negative_words_file):
    """Return the percentage of tokens in ``text`` found in the negative word list.

    Args:
        text: Text to score; it is lowercased before tokenization.
        negative_words_file: Path of the word-list file, loaded with ``read_words``.

    Returns:
        ``matching_tokens / total_tokens * 100`` as a float; ``0.0`` when the
        text yields no tokens (the denominator is clamped to 1).
    """
    # A set gives constant-time membership tests; the list returned by
    # read_words would be scanned in full for every token.
    negative_words = set(read_words(negative_words_file))
    words = word_tokenize(text.lower())
    negative_word_count = sum(1 for word in words if word in negative_words)
    total_words = len(words)
    return (negative_word_count / max(total_words, 1)) * 100

In [30]:
def calculate_sentiment_scores(pos_score,neg_score,total_words):
    """Derive polarity and subjectivity from the positive/negative scores.

    polarity     = (pos_score - neg_score) / (pos_score + neg_score + 0.000001)
    subjectivity = (pos_score + neg_score) / (total_words + 0.000001)

    The small constant keeps both divisions defined when a denominator
    would otherwise be zero.

    Returns:
        A ``(polarity_score, subjectivity_score)`` tuple.
    """
    epsilon = 0.000001
    combined = pos_score + neg_score
    difference = pos_score - neg_score
    return difference / (combined + epsilon), combined / (total_words + epsilon)

In [31]:
def calculate_avg_word_length(text):
    """Return the mean number of characters per token in ``text``.

    The text is lowercased and split with ``word_tokenize``; every token the
    tokenizer returns contributes to the average.

    Returns:
        Total characters divided by the number of tokens, or ``0`` when the
        text yields no tokens.
    """
    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

In [32]:
import re
def calculate_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words. The all-caps
    token ``US`` is treated as the country abbreviation and is not counted
    as the pronoun "us".

    Args:
        text: Text to scan.

    Returns:
        A dict mapping each lowercased pronoun (``'i'``, ``'we'``, ``'my'``,
        ``'ours'``, ``'us'``) to its number of occurrences; pronouns that do
        not occur map to 0.
    """
    personal_pronouns = ['I', 'we', 'my', 'ours', 'us']
    pronoun_counts = {pronoun.lower(): 0 for pronoun in personal_pronouns}
    pattern = r'\b(?:{})\b'.format('|'.join(personal_pronouns))
    for match in re.findall(pattern, text, flags=re.IGNORECASE):
        # A lookahead placed after \b can never see a letter, so the exclusion
        # of "US" has to be done on the matched text itself.
        if match == 'US':
            continue
        pronoun_counts[match.lower()] += 1
    return pronoun_counts

In [33]:
# Function to calculate text statistics
def calculate_text_statistics(text,positive_words,negative_words):
    """Compute sentiment and readability metrics for ``text``.

    Args:
        text: Text to analyse. A falsy value yields an empty result.
        positive_words: Path of the positive word-list file (forwarded to
            ``calculate_positive_score``).
        negative_words: Path of the negative word-list file (forwarded to
            ``calculate_negative_score``).

    Returns:
        A dict keyed by the output column names ('POSITIVE SCORE',
        'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
        'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
        'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS',
        'AVG WORD LENGTH'). Returns ``{}`` when ``text`` is empty or when any
        step raises; in the latter case the error is printed.
    """
    if text:
        try:
            sentences = sent_tokenize(text)
            total_sentences = len(sentences)
            words = word_tokenize(text)
            total_words = len(words)
            pos_score = calculate_positive_score(text,positive_words)
            neg_score = calculate_negative_score(text,negative_words)
            polarity_score, subjectivity_score = calculate_sentiment_scores(pos_score,neg_score,total_words)
            words_per_sentence = total_words / max(total_sentences, 1)
            # Complex words are words with three or more syllables.
            # lexicon_count returns the total word count, which made the
            # "complex" percentage come out at roughly 100 for every text.
            complex_word_count = textstat.polysyllabcount(text)
            return {
                'POSITIVE SCORE': pos_score,
                'NEGATIVE SCORE':neg_score,
                'POLARITY SCORE': polarity_score,
                'SUBJECTIVITY SCORE': subjectivity_score,
                'AVG SENTENCE LENGTH': words_per_sentence,
                'PERCENTAGE OF COMPLEX WORDS': (complex_word_count / max(total_words, 1)) * 100,
                'FOG INDEX': textstat.gunning_fog(text),
                'AVG NUMBER OF WORDS PER SENTENCE': words_per_sentence,
                'COMPLEX WORD COUNT': complex_word_count,
                'WORD COUNT': total_words,
                'SYLLABLE PER WORD': textstat.syllable_count(text) / max(total_words, 1),
                'PERSONAL PRONOUNS': calculate_personal_pronouns(text),
                'AVG WORD LENGTH': calculate_avg_word_length(text)
            }
        except Exception as e:
            # Best effort: report the problem and fall through to the empty
            # result so one bad document does not abort the whole run.
            print(f"Error calculating text statistics: {e}")
    return {}

In [34]:
# Load the stop words used to clean the extracted text.
def load_stop_words(file_paths):
    """Build a set of stop words from one or more text files.

    Every line of every file becomes one entry, stripped of surrounding
    whitespace and lowercased. Lowercasing matters because ``clean_text``
    lowercases each word before looking it up, so an entry stored in any
    other case could never match.

    Args:
        file_paths: Iterable of paths to stop-word files, one entry per line.

    Returns:
        A set containing the normalized entries of all files.
    """
    stop_words = set()
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            stop_words.update(
                line.strip().lower() for line in file.read().splitlines()
            )
    return stop_words
def clean_text(text, stop_words):
    """Strip punctuation from each word of ``text`` and drop stop words.

    The text is split on whitespace; leading and trailing punctuation is
    removed from every piece, and a piece is discarded when its lowercased
    form is in ``stop_words``. Surviving pieces keep their original case.

    Args:
        text: Text to clean.
        stop_words: Collection of words to remove (compared in lowercase).

    Returns:
        The remaining pieces joined by single spaces.
    """
    kept = []
    for token in text.split():
        bare = token.strip(string.punctuation)
        if bare.lower() in stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)
# Stop-word lists passed to load_stop_words, relative to the working directory.
stop_words_files = [
    'StopWords/StopWords_Auditor.txt',
    'StopWords/StopWords_Currencies.txt',
    'StopWords/StopWords_DatesandNumbers.txt',
    'StopWords/StopWords_Generic.txt',
    'StopWords/StopWords_GenericLong.txt',
    'StopWords/StopWords_Geographic.txt',
    'StopWords/StopWords_Names.txt',
]

In [35]:
# Main code block:
# read the input workbook, analyse every URL, and export the results
# to Output_Data_Structure.xlsx.

excel_url = 'Input.xlsx'
positive_words='positive-words.txt'
negative_words='negative-words.txt'
excel_data = pd.read_excel(excel_url)
output_columns = [
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
    'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]

# The stop-word files do not change between rows, so read them once.
stop_words = load_stop_words(stop_words_files)

# Collect one record per successfully fetched URL and build the frame once;
# concatenating inside the loop copies the whole frame on every iteration.
records = []
for index, row in excel_data.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    data = extract_data_from_url(url)
    if data:
        # NOTE(review): clean_text strips punctuation from every word, so the
        # sentence tokenizer in calculate_text_statistics sees text without
        # sentence-ending periods — confirm the per-sentence metrics are
        # meant to be computed on the cleaned text.
        cleaned_text = clean_text(data, stop_words)
        statistics = calculate_text_statistics(cleaned_text,positive_words,negative_words)
        extracted_data = {
            'URL_ID': url_id,
            'URL': url,
            **statistics,
        }
        records.append(extracted_data)

new_data = pd.DataFrame(records, columns=output_columns)
new_excel_file = 'Output_Data_Structure.xlsx'
new_data.to_excel(new_excel_file, index=False)
print(f"Data saved to {new_excel_file}")

Data saved to Output_Data_Structure.xlsx
