In [11]:
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re

In [12]:
# Download NLTK's 'punkt' package (tokenizer data) so the nltk.tokenize
# functions imported above can run; a no-op when it is already installed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tanib\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# Merge every stop-word list (*.txt) found in the stop-words folder into one set.
stopwords_folder_path = "stopwords"
stop_words_combined = set()

for list_name in os.listdir(stopwords_folder_path):
    if not list_name.endswith(".txt"):
        continue  # only plain-text lists are read
    list_path = os.path.join(stopwords_folder_path, list_name)
    with open(list_path, "r") as handle:
        # One entry per line; duplicates across lists collapse in the set.
        stop_words_combined.update(handle.read().splitlines())

In [18]:
# Sentiment dictionaries: each file is read whole and its lines become set members.
with open("MasterDictionary/positive-words.txt", "r") as positive_handle:
    positive_words = {entry for entry in positive_handle.read().splitlines()}

with open("MasterDictionary/negative-words.txt", "r") as negative_handle:
    negative_words = {entry for entry in negative_handle.read().splitlines()}

In [19]:
# Load the input workbook into a DataFrame. perform_textual_analysis matches
# rows on its 'URL_ID' column and writes the computed metric columns into it.
input_data = pd.read_excel("Input - Copy.xlsx")

In [20]:
def compute_complex_word_count(text, stop_words, min_syllables=3):
    """Count the complex words in ``text``.

    A token is counted as complex when it is not a stop word and has at least
    ``min_syllables`` syllables, as estimated by ``compute_syllable_per_word``.
    The default of three syllables is the definition the Gunning Fog index
    uses for "complex words", which is what this count feeds into.

    Parameters
    ----------
    text : str
        Raw document text; it is tokenized with ``word_tokenize``.
    stop_words : collection of str
        Tokens to ignore (membership is tested with ``in``).
    min_syllables : int, optional
        Minimum syllable count for a token to be considered complex.

    Returns
    -------
    int
        Number of complex tokens found in ``text``.
    """
    # Complexity is judged by syllables, not by character length: a length
    # test would flag almost every non-stop word and inflate the Fog index.
    return sum(
        1
        for word in word_tokenize(text)
        if word not in stop_words
        and compute_syllable_per_word(word) >= min_syllables
    )

In [21]:
def compute_syllable_per_word(word):
    """Estimate the number of syllables in ``word``.

    The estimate is the number of consecutive-vowel groups (a, e, i, o, u),
    minus one for each word in the token that ends in "es" or "ed", since
    that ending is treated as silent. Both checks ignore letter case.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    int
        0 when the token contains no vowels (e.g. punctuation); otherwise the
        estimate, never less than 1.
    """
    vowel_groups = re.findall(r'[aeiou]+', word, flags=re.IGNORECASE)
    if not vowel_groups:
        return 0
    silent_endings = re.findall(r'\b\w*es\b|\b\w*ed\b', word, flags=re.IGNORECASE)
    # A token with a vowel has at least one syllable; without the floor,
    # short words such as "need" or "red" would be discounted down to zero.
    return max(1, len(vowel_groups) - len(silent_endings))

In [25]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _compute_text_metrics(text, stop_words_combined, positive_words, negative_words):
    """Compute the readability and sentiment metrics for one document.

    Returns a dict mapping output column name to value, in the order the
    columns are written to the results table. Ratios whose denominator is
    zero (empty document, or nothing left after stop-word removal) are 0.0.
    """
    sentence_count = len(sent_tokenize(text))
    # Tokens that survive stop-word removal; every per-word metric uses these.
    words = [word for word in word_tokenize(text) if word not in stop_words_combined]
    word_count = len(words)

    avg_sentence_length = round(_safe_ratio(word_count, sentence_count), 2)
    complex_word_count = compute_complex_word_count(text, stop_words_combined)
    percentage_complex_words = round(_safe_ratio(complex_word_count, word_count) * 100, 2)
    fog_index = round(0.4 * (avg_sentence_length + percentage_complex_words), 2)
    syllable_total = sum(compute_syllable_per_word(word) for word in words)
    syllable_count_per_word = round(_safe_ratio(syllable_total, word_count), 2)

    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)
    # The small constant keeps the denominators non-zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    return {
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Avg Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'Fog Index': fog_index,
        # Same quantity as 'Avg Sentence Length'; kept as its own column.
        'Avg Number of Words Per Sentence': avg_sentence_length,
        'Syllable Per Word': syllable_count_per_word,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
    }


def perform_textual_analysis(folder_path, stop_words_combined, positive_words, negative_words):
    """Analyse every ``.txt`` file in ``folder_path`` and record the results.

    Each file is read as UTF-8. Its name without the extension is taken as
    the URL id, and the computed metrics are written into the rows of the
    module-level ``input_data`` DataFrame whose ``URL_ID`` equals that id.
    Files with no matching row leave existing rows unchanged.

    Parameters
    ----------
    folder_path : str
        Directory containing the extracted article text files.
    stop_words_combined : collection of str
        Tokens removed before counting words.
    positive_words, negative_words : collection of str
        Sentiment dictionaries used for the positive/negative scores.

    Returns
    -------
    None
        ``input_data`` is modified in place.
    """
    # File stems are always text, while URL_ID's dtype depends on what the
    # spreadsheet holds; comparing as text lets numeric ids match as well.
    url_ids = input_data['URL_ID'].astype(str)

    for filename in os.listdir(folder_path):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
            text = file.read()

        metrics = _compute_text_metrics(text, stop_words_combined, positive_words, negative_words)

        # splitext drops only the final extension, so ids containing dots survive.
        url_id = os.path.splitext(filename)[0]
        row_mask = url_ids == url_id
        for column, value in metrics.items():
            input_data.loc[row_mask, column] = value

In [26]:
# Analyse every .txt file in "extracted_data"; the metrics are written into
# input_data in place (the function returns nothing).
perform_textual_analysis("extracted_data", stop_words_combined, positive_words, negative_words)

In [28]:
# Save the table, now including the metric columns, without the index column.
# NOTE(review): this overwrites the same workbook that was read as input, so a
# later run loads a file that already holds the computed columns — consider
# writing to a separate output path.
input_data.to_excel("Input - Copy.xlsx", index=False)