In [6]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK data packages used below: the Punkt tokenizer models,
# the stop-word lists and the VADER sentiment lexicon.
for _package in ('punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(_package)

# Single VADER analyzer instance shared by every processed document
sid = SentimentIntensityAnalyzer()

# Function to clean the text
def clean_text(text):
    """Return the lowercased tokens of ``text`` without punctuation or stop words.

    ASCII punctuation (``string.punctuation``) is deleted before
    tokenizing; tokens whose lowercase form is an English NLTK stop word
    are dropped.

    Args:
        text: The raw text to clean.

    Returns:
        A list of lowercase token strings, in their original order.
    """
    excluded = set(stopwords.words('english'))
    # One translation table deletes every ASCII punctuation character.
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    kept = []
    for token in word_tokenize(text.translate(punctuation_stripper)):
        lowered = token.lower()
        if lowered not in excluded:
            kept.append(lowered)
    return kept

# Function to calculate average sentence length
def avg_sentence_length(text):
    """Return the average number of tokens per sentence in ``text``.

    Sentences are found with ``sent_tokenize`` and tokens with
    ``word_tokenize``.

    Args:
        text: The text to measure.

    Returns:
        Token count divided by sentence count as a float, or ``0.0`` when
        the tokenizer finds no sentences (e.g. an empty string).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # No sentences means there is nothing to average; avoid dividing by zero.
        return 0.0
    return len(word_tokenize(text)) / len(sentences)

# Function to calculate percentage of complex words
def percentage_complex_words(text):
    """Return the percentage of tokens in ``text`` that are complex words.

    A word counts as complex when ``count_syllables`` estimates more than
    two syllables for it, which is the notion of complexity the Gunning
    fog index is defined on.

    Args:
        text: The text to measure.

    Returns:
        A float between 0 and 100, or ``0.0`` when ``text`` has no tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Nothing to classify; avoid dividing by zero.
        return 0.0
    complex_total = sum(1 for word in words if count_syllables(word) > 2)
    return (complex_total / len(words)) * 100

# Function to calculate fog index
def fog_index(text):
    """Return the fog index of ``text``.

    The value is 0.4 times the sum of the average sentence length and the
    percentage of complex words, both computed on ``text``.

    Args:
        text: The text to score.

    Returns:
        The fog index as a float.
    """
    sentence_term = avg_sentence_length(text)
    complexity_term = percentage_complex_words(text)
    readability = sentence_term + complexity_term
    return 0.4 * readability

# Function to calculate average number of words per sentence
def avg_words_per_sentence(text):
    """Return the average number of tokens per sentence in ``text``.

    Args:
        text: The text to measure.

    Returns:
        Token count divided by sentence count as a float, or ``0.0`` when
        the tokenizer finds no sentences (e.g. an empty string).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # No sentences means there is nothing to average; avoid dividing by zero.
        return 0.0
    words = word_tokenize(text)
    return len(words) / len(sentences)

# Function to count complex words
def complex_word_count(text):
    """Return how many tokens in ``text`` are complex words.

    A word counts as complex when ``count_syllables`` estimates more than
    two syllables for it, which is the notion of complexity the Gunning
    fog index is defined on.

    Args:
        text: The text to measure.

    Returns:
        The number of complex tokens as an int.
    """
    return sum(1 for word in word_tokenize(text) if count_syllables(word) > 2)

# Function to count total words
def word_count(text):
    """Return the number of tokens ``word_tokenize`` finds in ``text``."""
    return len(word_tokenize(text))

# Function to count syllables in a word
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Each run of consecutive vowels (``a e i o u y``) counts as one
    syllable, a trailing ``e`` removes one, and any non-empty word is
    reported as having at least one syllable.

    Args:
        word: The word to analyse; case is ignored.

    Returns:
        The estimated syllable count, or ``0`` for an empty string.
    """
    word = word.lower()
    if not word:
        # Guard: indexing word[0] below would fail on an empty string.
        return 0
    vowels = 'aeiouy'
    count = 1 if word[0] in vowels else 0
    # A vowel that follows a non-vowel starts a new vowel group.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    return max(count, 1)

# Function to calculate syllables per word
def syllables_per_word(text):
    """Return the average estimated syllable count per token in ``text``.

    Args:
        text: The text to measure.

    Returns:
        Total syllables (via ``count_syllables``) divided by the number of
        tokens, or ``0.0`` when ``text`` has no tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    syllable_count = sum(count_syllables(word) for word in words)
    return syllable_count / len(words)

# Function to count personal pronouns
def personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and restricted to whole words.  The
    all-caps token ``US`` is skipped so the country abbreviation is not
    counted as the pronoun "us".

    Args:
        text: The text to search.

    Returns:
        The number of pronoun occurrences as an int.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    # "US" in capitals is treated as the country name, not a pronoun.
    return sum(1 for match in matches if match != 'US')

# Function to calculate average word length
def avg_word_length(text):
    """Return the average number of characters per token in ``text``.

    Args:
        text: The text to measure.

    Returns:
        Total characters across all tokens divided by the token count, or
        ``0.0`` when ``text`` has no tokens.
    """
    words = word_tokenize(text)
    if not words:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

# Process each text file and generate output
def process_text_file(file_path):
    """Read one UTF-8 text file and compute its sentiment and readability metrics.

    Word-level metrics are computed on the cleaned text (the output of
    ``clean_text``: lowercased, punctuation and English stop words
    removed).  Sentences and personal pronouns are counted on the raw
    text instead: the cleaned text has no sentence-ending punctuation
    left for the sentence tokenizer to split on, and cleaning lowercases
    everything and removes stop words, which discards pronouns such as
    "I", "we" and "my".

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        A dict mapping each metric name (e.g. ``'Positive Score'``,
        ``'Fog Index'``, ``'Word Count'``) to its value for this file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Clean the text
    cleaned_text = ' '.join(clean_text(text))

    # Perform sentiment analysis using VADER on the raw text
    sentiment_scores = sid.polarity_scores(text)

    # Word-level metrics use the cleaned tokens
    pct_complex_words = percentage_complex_words(cleaned_text)
    complex_count = complex_word_count(cleaned_text)
    total_words = word_count(cleaned_text)
    syllables_per_word_count = syllables_per_word(cleaned_text)
    avg_word_length_val = avg_word_length(cleaned_text)

    # Sentence boundaries only exist in the raw text, so count sentences
    # there and relate them to the cleaned word count.
    sentence_count = len(sent_tokenize(text))
    avg_sent_length = total_words / sentence_count if sentence_count else 0.0
    # Both columns report the same words-per-sentence ratio.
    avg_words_per_sent = avg_sent_length
    fog_idx = 0.4 * (avg_sent_length + pct_complex_words)

    # Pronouns must be counted before stop-word removal and lowercasing
    personal_pronouns_count = personal_pronouns(text)

    # Calculate Polarity Score (the small constant avoids division by zero)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)

    # Calculate Subjectivity Score
    # NOTE(review): VADER's 'pos'/'neg' look like proportions rather than
    # word counts -- confirm dividing by the word count is intended.
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    return {
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': avg_sent_length,
        'Percentage of Complex Words': pct_complex_words,
        'Fog Index': fog_idx,
        'Average Number of Words Per Sentence': avg_words_per_sent,
        'Complex Word Count': complex_count,
        'Word Count': total_words,
        'Syllables Per Word': syllables_per_word_count,
        'Personal Pronouns Count': personal_pronouns_count,
        'Average Word Length': avg_word_length_val
    }

# Process all text files in a directory
def process_text_files_in_directory(directory):
    """Run ``process_text_file`` on every ``.txt`` file in ``directory``.

    Files are processed in natural filename order (digit runs compare
    numerically, so ``2.txt`` precedes ``10.txt``).  ``os.listdir``
    itself returns names in arbitrary order, and the caller joins these
    results to spreadsheet rows purely by position, so the order has to
    be deterministic.

    Args:
        directory: Path of the directory holding the text files.

    Returns:
        A list with one metrics dict per ``.txt`` file, in natural
        filename order.
    """
    def natural_key(name):
        # Splitting on a captured digit run alternates text and digits,
        # so odd positions are always digit runs.
        parts = re.split(r'(\d+)', name)
        return [int(part) if index % 2 else part.lower()
                for index, part in enumerate(parts)]

    text_files = sorted(
        (name for name in os.listdir(directory) if name.endswith('.txt')),
        # The raw name breaks ties between names differing only in case.
        key=lambda name: (natural_key(name), name),
    )
    return [process_text_file(os.path.join(directory, name)) for name in text_files]

# Directory containing text files
input_directory = '../../DSML28/Blackcofferr/Extracted_Text_Files/'

# Process all text files in the directory
output_data = process_text_files_in_directory(input_directory)

# Create DataFrame from output data
output_df = pd.DataFrame(output_data)

# Read original Excel file containing 'URL_ID' and 'URL' columns
original_excel_file = '../../DSML28/Blackcofferr/Output Data Structure - Blackcoffer.xlsx'
original_df = pd.read_excel(original_excel_file)

# Merge original DataFrame with output DataFrame
merged_df = pd.concat([original_df[['URL_ID', 'URL']], output_df], axis=1)

# Output Excel file
output_excel_file = 'output.xlsx'

# Write merged DataFrame to Excel
merged_df.to_excel(output_excel_file, index=False)

print("Output written to", output_excel_file)


[nltk_data] Downloading package punkt to C:\Users\Vivek
[nltk_data]     Ghodmare\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Vivek
[nltk_data]     Ghodmare\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Vivek
[nltk_data]     Ghodmare\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Output written to output.xlsx
