### Project - Text Analysis, Sentiment Analysis

In [1]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

In [2]:
# Fetch the NLTK data packages this notebook asks for. 'punkt' is the tokenizer
# data behind word_tokenize / sent_tokenize; 'stopwords' is the corpus behind the
# nltk.corpus.stopwords import (the visible cells use custom stop-word files instead).
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' — confirm
# against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Stop Words

In [3]:
# Load a single stop-word list: one entry per line, lower-cased for matching.
# NOTE(review): absolute, machine-specific path.
stop_words_file = "C:\\Users\\Dell\\Desktop\\Sentiment Analysis\\StopWords\\StopWords_Auditor.txt"
stop_words = set()

# Pin the encoding so the result does not depend on the platform's default codec.
with open(stop_words_file, 'r', encoding='utf-8') as f:
    stop_words.update(f.read().lower().splitlines())

In [4]:
import os

def load_stopwords_from_files(stop_words_dir):
    """Collect lower-cased stop words from every ``.txt`` file in a directory.

    Each file is read as one entry per line. Files are decoded as UTF-8 first;
    a file that is not valid UTF-8 is re-read as ISO-8859-1 and a notice is
    printed.

    Args:
        stop_words_dir: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        set[str]: Union of all entries, lower-cased, with surrounding
        whitespace removed and blank lines skipped.
    """
    def read_entries(path, encoding):
        # Read the whole file before parsing so a decode error adds nothing.
        with open(path, 'r', encoding=encoding) as file:
            lines = file.read().lower().splitlines()
        # Padding or blank lines would otherwise become entries that no
        # whitespace-free token can ever equal.
        return {line.strip() for line in lines if line.strip()}

    stop_words = set()

    # Sorted so that fallback notices are printed in a stable order.
    for file_name in sorted(os.listdir(stop_words_dir)):
        file_path = os.path.join(stop_words_dir, file_name)

        # Only regular files with a .txt suffix are treated as word lists.
        if not (os.path.isfile(file_path) and file_name.endswith('.txt')):
            continue

        try:
            stop_words |= read_entries(file_path, 'utf-8')
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError for file: {file_name}, retrying with ISO-8859-1.")
            stop_words |= read_entries(file_path, 'ISO-8859-1')

    return stop_words

# Specify the directory containing stopword files
# NOTE(review): absolute, machine-specific path.
stop_words_dir = "C:\\Users\\Dell\\Desktop\\Sentiment Analysis\\StopWords"

# Load stopwords (rebinds stop_words, replacing the single-file set built earlier)
stop_words = load_stopwords_from_files(stop_words_dir)

# Report how many distinct entries were loaded (sanity check)
print(f"Loaded {len(stop_words)} stopwords.")

UnicodeDecodeError for file: StopWords_Currencies.txt, retrying with ISO-8859-1.
Loaded 12768 stopwords.


In [5]:
import os

# Relative dictionary folder; later cells join file names onto it.
master_dict_dir = "MasterDictionary"

# These are absolute paths, so they are used directly: os.path.join() drops every
# earlier component once it meets an absolute one, so prefixing master_dict_dir
# here would have no effect on which file is opened.
# NOTE(review): absolute, machine-specific paths.
positive_words_file = "C:\\Users\\Dell\\Desktop\\Sentiment Analysis\\MasterDictionary\\positive-words.txt"
negative_words_file = "C:\\Users\\Dell\\Desktop\\Sentiment Analysis\\MasterDictionary\\negative-words.txt"

positive_words = set()
negative_words = set()

# NOTE(review): no encoding is passed to open(), so the platform default codec
# is used — confirm the encoding of the dictionary files before pinning one.

# Load Positive Words (one entry per line, lower-cased)
with open(positive_words_file, 'r') as f:
    positive_words.update(f.read().lower().splitlines())

# Load Negative Words (one entry per line, lower-cased)
with open(negative_words_file, 'r') as f:
    negative_words.update(f.read().lower().splitlines())

# Print Loaded Words (Optional for Debugging)
print(f"Loaded {len(positive_words)} positive words.")
print(f"Loaded {len(negative_words)} negative words.")


Loaded 2006 positive words.
Loaded 4783 negative words.


In [6]:
# Merge the positive word list found under the relative dictionary folder into
# the existing set (entries are lower-cased, one per line).
with open(os.path.join(master_dict_dir, "positive-words.txt"), 'r') as f:
    positive_words |= set(f.read().lower().splitlines())

In [7]:
# Merge the negative word list found under the relative dictionary folder into
# the existing set (entries are lower-cased, one per line).
with open(os.path.join(master_dict_dir, "negative-words.txt"), 'r') as f:
    negative_words |= set(f.read().lower().splitlines())


In [8]:
# Text Cleaning and Tokenization
def clean_and_tokenize(text):
    """Lower-case ``text``, drop punctuation, tokenize, and remove stop words.

    Every run of non-word characters or underscores is replaced by a single
    space before the text is passed to ``word_tokenize``. Tokens present in the
    module-level ``stop_words`` set are discarded.

    Args:
        text: Raw input string.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    normalized = re.sub(r'[\W_]+', ' ', text).lower()

    kept = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [9]:
# Derived Variable Calculation
def calculate_scores(text):
    """Score ``text`` against the positive and negative word sets.

    The text is tokenized with ``clean_and_tokenize``; each token found in
    ``positive_words`` / ``negative_words`` adds one to the matching score.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where
        polarity = (pos - neg) / (pos + neg + 1e-6) and
        subjectivity = (pos + neg) / (token_count + 1e-6).
    """
    epsilon = 1e-6  # keeps both ratios defined when a denominator would be zero
    tokens = clean_and_tokenize(text)

    positive_score = len([token for token in tokens if token in positive_words])
    negative_score = len([token for token in tokens if token in negative_words])
    matched = positive_score + negative_score

    polarity_score = (positive_score - negative_score) / (matched + epsilon)
    subjectivity_score = matched / (len(tokens) + epsilon)
    return positive_score, negative_score, polarity_score, subjectivity_score

In [10]:
# Readability Analysis
def readability_analysis(text):
    """Compute sentence-length, complex-word and fog-index figures for ``text``.

    Sentences come from ``sent_tokenize``; words come from
    ``clean_and_tokenize``, so the word count is taken after stop-word removal.
    A word is "complex" when its approximate syllable count exceeds two.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(avg_sentence_length, percentage_complex_words, fog_index)``.
        ``percentage_complex_words`` is a fraction of the word count (not
        multiplied by 100) and
        ``fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)``.
        Both ratios are 0 when their denominator is zero.
    """
    def count_syllables(word):
        """Approximate syllables as the number of vowel letters (a, e, i, o, u)."""
        lowered = word.lower()
        total = len([char for char in lowered if char in "aeiou"])
        # A trailing "es"/"ed" removes one from the count, but never below 1.
        if lowered.endswith(('es', 'ed')):
            total = max(1, total - 1)
        return total

    sentence_count = len(sent_tokenize(text))
    tokens = clean_and_tokenize(text)
    word_count = len(tokens)

    if sentence_count:
        avg_sentence_length = word_count / sentence_count
    else:
        avg_sentence_length = 0

    complex_total = sum(1 for token in tokens if count_syllables(token) > 2)
    if word_count:
        percentage_complex_words = complex_total / word_count
    else:
        percentage_complex_words = 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index

In [11]:
# Other Metrics
def additional_metrics(text):
    """Compute word-level metrics for ``text``.

    Words come from ``clean_and_tokenize``; personal pronouns are counted on
    the raw ``text`` so that stop-word removal cannot hide them.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(word_count, avg_word_length, personal_pronouns,
        syllable_count_per_word)`` where ``avg_word_length`` is 0 for an empty
        word list and ``syllable_count_per_word`` holds one approximate
        syllable count per word.
    """
    def count_syllables(word):
        """Approximate syllables as the number of vowel letters (a, e, i, o, u)."""
        # Defined locally: no module-level count_syllables exists, so calling
        # one from here would raise NameError.
        word = word.lower()
        syllables = sum(1 for char in word if char in "aeiou")
        # A trailing "es"/"ed" removes one from the count, but never below 1.
        if word.endswith(('es', 'ed')):
            syllables = max(1, syllables - 1)
        return syllables

    words = clean_and_tokenize(text)
    total_chars = sum(len(word) for word in words)
    avg_word_length = total_chars / len(words) if words else 0

    # \b anchors on word boundaries, so "us" inside "business" is not counted.
    # NOTE(review): with IGNORECASE an upper-case "US" is also counted as the
    # pronoun "us" — confirm whether that is intended.
    personal_pronouns = len(re.findall(r"\b(i|we|my|ours|us)\b", text, re.IGNORECASE))

    syllable_count_per_word = [count_syllables(word) for word in words]

    return len(words), avg_word_length, personal_pronouns, syllable_count_per_word


In [12]:
# Main Execution
if __name__ == "__main__":
    # Placeholder input; replace with the document to be scored.
    sample_text = "Your sample financial text goes here."

    # Derived Variables: counts of positive/negative tokens plus the two ratios
    # returned by calculate_scores (polarity and subjectivity).
    pos_score, neg_score, polarity, subjectivity = calculate_scores(sample_text)
    print(f"Positive Score: {pos_score}")
    print(f"Negative Score: {neg_score}")
    print(f"Polarity Score: {polarity}")
    print(f"Subjectivity Score: {subjectivity}")

Positive Score: 0
Negative Score: 0
Polarity Score: 0.0
Subjectivity Score: 0.0


### Analysis of Readability

In [13]:
def readability_analysis(text):
    """Compute sentence-length, complex-word and fog-index figures for ``text``.

    Sentences come from ``sent_tokenize``; words come from
    ``clean_and_tokenize``, so the word count is taken after stop-word removal.
    A word is "complex" when its approximate syllable count exceeds two.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(avg_sentence_length, percentage_complex_words, fog_index)``.
        ``percentage_complex_words`` is a fraction of the word count (not
        multiplied by 100) and
        ``fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)``.
        Both ratios are 0 when their denominator is zero.
    """
    def count_syllables(word):
        """Approximate syllables as the number of vowel letters (a, e, i, o, u)."""
        lowered = word.lower()
        total = len([char for char in lowered if char in "aeiou"])
        # A trailing "es"/"ed" removes one from the count, but never below 1.
        if lowered.endswith(('es', 'ed')):
            total = max(1, total - 1)
        return total

    sentence_count = len(sent_tokenize(text))
    tokens = clean_and_tokenize(text)
    word_count = len(tokens)

    if sentence_count:
        avg_sentence_length = word_count / sentence_count
    else:
        avg_sentence_length = 0

    complex_total = sum(1 for token in tokens if count_syllables(token) > 2)
    if word_count:
        percentage_complex_words = complex_total / word_count
    else:
        percentage_complex_words = 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index


In [14]:
# Demo: run the readability metrics on a placeholder sentence.
sample_text = "Your sample text goes here."
avg_sentence_length, perc_complex_words, fog_index = readability_analysis(sample_text)

# Word counts are taken after stop-word removal (readability_analysis tokenizes
# with clean_and_tokenize), so these figures reflect the filtered tokens only.
print(f"Average Sentence Length: {avg_sentence_length}")
print(f"Percentage of Complex Words: {perc_complex_words}")
print(f"Fog Index: {fog_index}")

Average Sentence Length: 1.0
Percentage of Complex Words: 0.0
Fog Index: 0.4


In [15]:
# Additional Metrics
def additional_metrics(text):
    """Compute word-level metrics for ``text``.

    Words come from ``clean_and_tokenize``; personal pronouns are counted on
    the raw ``text`` so that stop-word removal cannot hide them.

    Args:
        text: Raw input string.

    Returns:
        tuple: ``(word_count, avg_word_length, personal_pronouns,
        syllable_count_per_word)`` where ``avg_word_length`` is 0 for an empty
        word list and ``syllable_count_per_word`` holds one approximate
        syllable count per word.
    """
    def count_syllables(word):
        """Approximate syllables as the number of vowel letters (a, e, i, o, u)."""
        word = word.lower()
        syllables = sum(1 for char in word if char in "aeiou")
        # A trailing "es"/"ed" removes one from the count, but never below 1.
        if word.endswith(('es', 'ed')):
            syllables = max(1, syllables - 1)
        return syllables

    words = clean_and_tokenize(text)
    total_chars = sum(len(word) for word in words)
    avg_word_length = total_chars / len(words) if words else 0

    # In a raw string a single backslash is needed: r"\b" is the regex word
    # boundary, whereas r"\\b" matches a literal backslash followed by "b" and
    # therefore never finds a pronoun in ordinary text.
    # NOTE(review): with IGNORECASE an upper-case "US" is also counted as the
    # pronoun "us" — confirm whether that is intended.
    personal_pronouns = len(re.findall(r"\b(i|we|my|ours|us)\b", text, re.IGNORECASE))

    syllable_count_per_word = [count_syllables(word) for word in words]

    return len(words), avg_word_length, personal_pronouns, syllable_count_per_word


In [16]:
# Demo: run the word-level metrics on a placeholder sentence.
sample_text = "Your sample text goes here."
word_count, avg_word_length, personal_pronouns, syllable_counts = additional_metrics(sample_text)

# word_count and syllable_counts describe the tokens left after stop-word
# removal; personal_pronouns is counted on the raw sample_text.
print(f"Word Count: {word_count}")
print(f"Average Word Length: {avg_word_length}")
print(f"Personal Pronouns: {personal_pronouns}")
print(f"Syllable Counts: {syllable_counts}")

Word Count: 1
Average Word Length: 4.0
Personal Pronouns: 0
Syllable Counts: [1]
