In [70]:
!pip install syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [71]:
import re
import nltk
import syllapy
from textblob import TextBlob
from collections import Counter
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize

In [27]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [21]:
nltk_stop_words = set(stopwords.words('english'))

In [19]:
def load_word_list(filename):
    with open(filename, 'r', encoding = 'latin -1') as file:
        return set(word.strip().lower() for word in file.readlines())

# Load positive and negative words
positive_words = load_word_list("positive-words.txt")
negative_words = load_word_list("negative-words.txt")

# Creating the Stop words File

In [22]:
custom_stop_words = nltk_stop_words.union(positive_words).union(negative_words)

print(f"Total custom stop-words: {len(custom_stop_words)}")

Total custom stop-words: 6964


In [24]:
def clean_text(text, stop_words):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    words = word_tokenize(text)  # Tokenize text into words
    filtered_words = [word for word in words if word not in stop_words]  # Remove stop-words
    return filtered_words

In [81]:
def calculate_metrics(text):
    sentences = nltk.sent_tokenize(text)
    words = clean_text(text, custom_stop_words)

    # Positive and negative scores
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    # Word and sentence metrics
    total_word_count = len(words)
    avg_sentence_length = total_word_count / len(sentences)

    # Complex words: Words with more than 2 syllables
    def count_syllables(word):
        vowels = "aeiouy"
        word = word.lower()
        count = sum(1 for i in range(len(word)) if word[i] in vowels and (i == 0 or word[i-1] not in vowels))
        return count

    complex_words = [word for word in words if count_syllables(word) > 2]
    complex_word_count = len(complex_words)
    percent_complex_words = (complex_word_count / total_word_count) * 100 if total_word_count > 0 else 0

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percent_complex_words)

    # Personal pronouns
    personal_pronouns = ['i', 'we', 'my', 'ours', 'us', 'you', 'he', 'she', 'him', 'her', 'they', 'them']
    personal_pronoun_count = sum(1 for word in words if word in personal_pronouns)

    return {
        "positive_score": positive_score,
        "negative_score": negative_score,
        "avg_sentence_length": avg_sentence_length,
        "percent_complex_words": percent_complex_words,
        "fog_index": fog_index,
        "avg_words_per_sentence": avg_sentence_length,
        "complex_word_count": complex_word_count,
        "total_word_count": total_word_count,
        "personal_pronoun_count": len(personal_pronouns)
    }
def syllable_count(word):
    word = word.lower()
    vowels = "aeiou"
    count = 0
    prev_char_is_vowel = False
    for char in word:
        if char in vowels:
            if not prev_char_is_vowel:
                count += 1
            prev_char_is_vowel = True
        else:
            prev_char_is_vowel = False
    if word.endswith("e"):
        count = max(1, count - 1)
    return count

# File 1

In [82]:
file_path = '/content/10744.4.txt'
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()
  # Calculate metrics
  metric = calculate_metrics(text)
  print(metric)

{'positive_score': 0, 'negative_score': 0, 'avg_sentence_length': 9.646153846153846, 'percent_complex_words': 38.43700159489633, 'fog_index': 19.23326217642007, 'avg_words_per_sentence': 9.646153846153846, 'complex_word_count': 241, 'total_word_count': 627, 'personal_pronoun_count': 12}


# File 2

In [83]:
file_path = '/content/11206.2.txt'
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()
  # Calculate metrics
  metric = calculate_metrics(text)
  print(metric)

{'positive_score': 0, 'negative_score': 0, 'avg_sentence_length': 8.195652173913043, 'percent_complex_words': 44.03183023872679, 'fog_index': 20.890992965055933, 'avg_words_per_sentence': 8.195652173913043, 'complex_word_count': 166, 'total_word_count': 377, 'personal_pronoun_count': 12}


# File 3

In [84]:
file_path = '/content/12129.8.txt'
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()
  # Calculate metrics
  metric = calculate_metrics(text)
  print(metric)

{'positive_score': 0, 'negative_score': 0, 'avg_sentence_length': 9.72972972972973, 'percent_complex_words': 33.05555555555556, 'fog_index': 17.114114114114113, 'avg_words_per_sentence': 9.72972972972973, 'complex_word_count': 119, 'total_word_count': 360, 'personal_pronoun_count': 12}


# File 4

In [85]:
file_path = '/content/123.0.txt'
with open(file_path, 'r', encoding='utf-8') as file:
  text = file.read()
  # Calculate metrics
  metric = calculate_metrics(text)
  print(metric)

{'positive_score': 0, 'negative_score': 0, 'avg_sentence_length': 11.15, 'percent_complex_words': 49.66367713004484, 'fog_index': 24.325470852017936, 'avg_words_per_sentence': 11.15, 'complex_word_count': 443, 'total_word_count': 892, 'personal_pronoun_count': 12}
