In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import os
import nltk
import nltk
# Download every NLTK resource the code below relies on:
#   vader_lexicon      -> SentimentIntensityAnalyzer
#   cmudict            -> CMU Pronouncing Dictionary used for syllable counts
#   punkt / punkt_tab  -> models behind word_tokenize and sent_tokenize
#                         (newer NLTK releases look for punkt_tab)
#   stopwords          -> stop-word list used when preprocessing text
# nltk.download skips packages that are already up to date.
for nltk_resource in ('vader_lexicon', 'cmudict', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Read the output structure Excel file; its column headers are reused as the
# headers of the generated output sheet.
output_structure_df = pd.read_excel('Output Data Structure.xlsx')

# Function to turn raw text into a list of cleaned tokens
def preprocess_text(text):
    """Return the lowercase alphanumeric tokens of *text* that are not stop words.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``; any
    token that is not purely alphanumeric (punctuation, contractions pieces
    such as "'s", hyphenated fragments) or that appears in NLTK's English
    stop-word list is discarded. Token order is preserved.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for token in tokens:
        if not token.isalnum():
            continue
        if token in english_stop_words:
            continue
        kept_tokens.append(token)

    return kept_tokens

# Lazily-built cache of the CMU Pronouncing Dictionary. cmudict.dict()
# re-parses the whole corpus on every call, so it is loaded only once.
_CMU_PRONUNCIATIONS = None


# Function to count syllables in a word
def count_syllables(word):
    """Return the number of syllables in *word* according to the CMU dictionary.

    Each pronunciation is a list of phonemes; vowel phonemes end in a stress
    digit (0, 1 or 2), so counting those gives the syllable count. Only the
    first listed pronunciation is used: summing over every variant would
    multiply the count for words that have several pronunciations.

    Words that are not in the dictionary yield 0.
    """
    global _CMU_PRONUNCIATIONS
    if _CMU_PRONUNCIATIONS is None:
        _CMU_PRONUNCIATIONS = nltk.corpus.cmudict.dict()

    pronunciations = _CMU_PRONUNCIATIONS.get(word.lower())
    if not pronunciations:
        return 0

    return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())

# Function to compute variables
def compute_variables(article_text):
    """Compute sentiment and readability metrics for a single article.

    Args:
        article_text: The raw article text.

    Returns:
        A 13-tuple, in this order:
        (positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count,
        avg_syllables_per_word, personal_pronouns, avg_word_length).

        Ratios whose denominator would be zero (no words / no sentences)
        are reported as 0.
    """
    personal_pronoun_set = frozenset([
        'i', 'me', 'my', 'mine', 'myself',
        'we', 'us', 'our', 'ours', 'ourselves',
        'you', 'your', 'yours', 'yourself', 'yourselves',
    ])

    # Cleaned tokens (lowercase, no stop words / punctuation) and raw sentences
    words = preprocess_text(article_text)
    sentences = sent_tokenize(article_text)

    word_count = len(words)
    sentence_count = len(sentences)

    # Sentence length is measured on the raw, whitespace-split sentence text
    avg_sentence_length = (
        sum(len(sent.split()) for sent in sentences) / sentence_count
        if sentence_count > 0 else 0
    )

    # A word is treated as "complex" when it has more than 6 characters;
    # count once and derive the percentage from that count.
    complex_word_count = sum(1 for word in words if len(word) > 6)
    percentage_complex_words = (
        complex_word_count / word_count * 100 if word_count > 0 else 0
    )
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_words_per_sentence = word_count / sentence_count if sentence_count > 0 else 0

    syllable_count = sum(count_syllables(word) for word in words)
    avg_syllables_per_word = syllable_count / word_count if word_count > 0 else 0

    # Personal pronouns must be counted on the raw tokens: nearly all of them
    # are English stop words, so they are already stripped from `words` and
    # counting there would report (close to) zero.
    # NOTE(review): matching is case-insensitive, so the acronym "US" is
    # counted as the pronoun "us" - confirm whether that is acceptable.
    personal_pronouns = sum(
        1 for token in word_tokenize(article_text.lower())
        if token in personal_pronoun_set
    )

    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

    # Sentiment analysis (VADER): proportions of positive/negative text and
    # the normalized compound score
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(article_text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    polarity_score = sentiment_scores['compound']

    # TextBlob's sentiment analysis supplies the subjectivity score
    subjectivity_score = TextBlob(article_text).sentiment.subjectivity

    return (
        positive_score,
        negative_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        complex_word_count,
        word_count,
        avg_syllables_per_word,
        personal_pronouns,
        avg_word_length,
    )

# Create a list to store computed variables for each article
output_data = []

# Iterate through the extracted text files. os.listdir returns entries in
# arbitrary order and the output rows carry no file identifier, so iterate in
# sorted filename order to keep the row order reproducible between runs.
extracted_texts_dir = 'article_texts'
for filename in sorted(os.listdir(extracted_texts_dir)):
    if not filename.endswith('.txt'):
        continue

    # Read the extracted text from the file
    with open(os.path.join(extracted_texts_dir, filename), 'r', encoding='utf-8') as file:
        article_text = file.read()

    # Compute the variables for the article text and store them as one row
    output_data.append(compute_variables(article_text))

# Convert the output data to a DataFrame
# NOTE(review): each row holds the 13 values returned by compute_variables;
# the template's column count and order must match them - verify against
# 'Output Data Structure.xlsx'.
output_df = pd.DataFrame(output_data, columns=output_structure_df.columns)

# Save the output DataFrame to an Excel file
output_df.to_excel('output.xlsx', index=False)

print("Data analysis completed. Output saved to 'output.xlsx'.")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rohansridhar/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/rohansridhar/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
