In [1]:
import nltk
import re
import pandas as pd

import os
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Fetch the 'punkt' tokenizer data through NLTK's downloader. The imports in
# this notebook pull in word_tokenize / sent_tokenize, which are used later on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arnav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Folder that holds the stop-word lists; only its ".txt" entries are kept.
folder_path = 'StopWords'
text_files = []
for entry in os.listdir(folder_path):
    if entry.endswith(".txt"):
        text_files.append(entry)

In [4]:
# Build ONE combined stop-word set from every file listed in text_files.
# Entries are lower-cased because clean_text() tests the lower-cased form of
# each token against this set, so an upper-case entry could never match.
stopword_dic = set()
for file_name in text_files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        # update() accumulates across files; a plain assignment here would
        # keep only the words of whichever file was read last.
        stopword_dic.update(word.lower() for word in file.read().split())

In [5]:
# Paths of the positive and negative word lists inside 'MasterDictionary'.
Pos_dict_path, Neg_dict_path = (
    os.path.join('MasterDictionary', list_name)
    for list_name in ('positive-words.txt', 'negative-words.txt')
)

In [6]:
# Read the negative word list and keep its whitespace-separated tokens as a set.
negative_word_dic = []  # placeholder until the file has been read
with open(Neg_dict_path, 'r') as neg_file:
    neg_tokens = neg_file.read().split()
    negative_word_dic = set(neg_tokens)

In [7]:
# Display the loaded negative-word set to spot-check its contents.
negative_word_dic

{'extremists',
 'dangerous',
 'freak',
 'maniac',
 'malicious',
 'unwell',
 'spitefully',
 'anger',
 'fooled',
 'ineloquent',
 'dissenter',
 'freezing',
 'insurrection',
 'leery',
 'interrupt',
 'noxious',
 'crooked',
 'declining',
 'remorselessness',
 'bid-rigging',
 'bitchy',
 'obstruct',
 'disconcerted',
 'deadweight',
 'disquieting',
 'grimace',
 'reprovingly',
 'grievously',
 'erode',
 'superfluous',
 'demon',
 'fails',
 'misguide',
 'uncompromising',
 'worsen',
 'undecided',
 'pessimistically',
 'obstruction',
 'bitch',
 'crummy',
 'predicament',
 'intimidating',
 'set-up',
 'tragedy',
 'boycott',
 'murderous',
 'intefere',
 'bulkyness',
 'fallout',
 'renunciation',
 'avalanche',
 'dust',
 'mistakenly',
 'redundancy',
 'overzelous',
 'outmoded',
 'capriciousness',
 'intimidate',
 'pains',
 'incoherently',
 'sneering',
 'aggressive',
 'mispronounce',
 'maliciously',
 'besiege',
 'quandary',
 'sty',
 'unsure',
 'brutally',
 'denigrate',
 'dishonorable',
 'sanctimonious',
 'ugly',
 

In [8]:
# Read the positive word list and keep its whitespace-separated tokens as a set.
positive_word_dic = []  # placeholder until the file has been read
with open(Pos_dict_path, 'r') as pos_file:
    pos_tokens = pos_file.read().split()
    positive_word_dic = set(pos_tokens)

In [9]:
# Display the loaded positive-word set to spot-check its contents.
positive_word_dic

{'fearless',
 'peaceable',
 'favour',
 'wonderful',
 'golden',
 'detachable',
 'merriment',
 'personages',
 'upscale',
 'adroitly',
 'comfortable',
 'inspiration',
 'luxurious',
 'modesty',
 'celebratory',
 'rightful',
 'dexterous',
 'energy-efficient',
 'sustainable',
 'bless',
 'smartly',
 'lively',
 'heroically',
 'innovative',
 'leading',
 'worthwhile',
 'healthful',
 'courageously',
 'reconciliation',
 'stellar',
 'admire',
 'stimulates',
 'frugal',
 'orderly',
 'compact',
 'remarkable',
 'delight',
 'exaltingly',
 'respect',
 'maneuverable',
 'imaculate',
 'stirringly',
 'finest',
 'magnanimous',
 'adore',
 'proficient',
 'roomy',
 'self-sufficiency',
 'subsidized',
 'vigilance',
 'sweeping',
 'amenity',
 'revives',
 'blissful',
 'trusted',
 'adoring',
 'civilize',
 'upliftment',
 'marvelously',
 'illuminate',
 'togetherness',
 'infallibility',
 'smartest',
 'wowed',
 'cheaper',
 'congenial',
 'unforgettable',
 'peace',
 'succeeding',
 'charitable',
 'enticing',
 'superiority',
 

In [10]:
def clean_text(text):
    """Tokenise ``text`` and return its lower-cased, purely alphabetic words.

    Tokens containing any non-alphabetic character are dropped, as are tokens
    whose lower-cased form is found in the module-level ``stopword_dic``.

    Returns:
        list[str]: the surviving tokens, lower-cased, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopword_dic:
            continue
        kept.append(lowered)
    return kept

In [11]:
def calculate_positive_score(text, positive_dict):
    """Count how many items of ``text`` are members of ``positive_dict``.

    ``text`` is iterated item by item, so it should be a sequence of words;
    every occurrence is counted, including repeats.
    """
    hits = 0
    for token in text:
        if token in positive_dict:
            hits += 1
    return hits

In [12]:
def calculate_negative_score(text, negative_dict):
    """Return the negated count of items of ``text`` found in ``negative_dict``.

    ``text`` is iterated item by item and repeats are counted. The result is
    zero or a negative integer (the number of matches multiplied by -1).
    """
    matches = sum(1 for token in text if token in negative_dict)
    return -matches

In [13]:
def calculate_polarity_score(positive_score, negative_score):
    """Return the polarity of a text from its positive and negative scores.

    Computes ``(P - N) / (P + N + 0.000001)``, where ``N`` is the magnitude of
    ``negative_score``. The magnitude is used because
    ``calculate_negative_score`` returns a value <= 0; feeding that signed
    value into the formula would swap the numerator and denominator roles and
    let the denominator collapse to ~0 (e.g. 3 positive and -3 negative gave
    6 / 0.000001). With the magnitude, the result stays between -1 and +1 for
    non-negative ``positive_score``.

    Args:
        positive_score: count of positive words (expected >= 0).
        negative_score: count of negative words; either sign is accepted.

    Returns:
        float: the polarity score; 0.0 when both scores are zero.
    """
    negative_magnitude = abs(negative_score)
    # The small constant keeps the division defined when both scores are zero.
    return (positive_score - negative_magnitude) / (positive_score + negative_magnitude + 0.000001)

In [14]:
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    """Return the share of words that carry positive or negative sentiment.

    Computes ``(P + N) / (total_words + 0.000001)``, where ``N`` is the
    magnitude of ``negative_score``. The magnitude is used because
    ``calculate_negative_score`` returns a value <= 0; adding that signed value
    would make positive and negative hits cancel out instead of accumulating.

    Args:
        positive_score: count of positive words (expected >= 0).
        negative_score: count of negative words; either sign is accepted.
        total_words: number of words the scores were computed over.

    Returns:
        float: the subjectivity score; 0.0 when there are no sentiment words.
    """
    sentiment_words = positive_score + abs(negative_score)
    # The small constant keeps the division defined when total_words is zero.
    return sentiment_words / (total_words + 0.000001)

In [15]:
def analyze_readability(text):
    """Compute sentence-length and complex-word readability metrics.

    Words are the tokens returned by ``clean_text`` and sentences come from
    ``sent_tokenize``. A word is treated as complex when ``syllable_count``
    reports more than two syllables for it.

    Args:
        text: the raw document text.

    Returns:
        tuple: ``(average_sentence_length, percentage_complex_words,
        fog_index, average_words_per_sentence, complex_word_count,
        total_words)``. ``percentage_complex_words`` is a fraction
        (complex words / total words), not a value scaled by 100.
        ``average_words_per_sentence`` is the same quantity as
        ``average_sentence_length``. All values are zero when the text yields
        no sentences or no words, instead of raising ZeroDivisionError.
    """
    sentences = sent_tokenize(text)
    words = clean_text(text)  # tokenise once and reuse for every metric
    total_words = len(words)

    # Empty or all-stop-word input: nothing to divide by, report zeros.
    if not sentences or total_words == 0:
        return 0.0, 0.0, 0.0, 0.0, 0, 0

    average_sentence_length = total_words / len(sentences)

    complex_words = [word for word in words if syllable_count(word) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = complex_word_count / total_words
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    average_words_per_sentence = average_sentence_length

    return average_sentence_length, percentage_complex_words, fog_index, average_words_per_sentence, complex_word_count, total_words

In [16]:
def syllable_count(word):
    """Estimate the syllables in ``word`` by counting its vowel letters.

    Every occurrence of a, e, i, o, u or y counts as one syllable
    (case-insensitive). For words ending in "es" or "ed" the vowel of that
    ending is not counted, as long as at least one syllable remains.

    Previously such words skipped the counting loop entirely and always
    returned 0, so they could never be classified as complex words and they
    dragged the syllables-per-word average down.

    Args:
        word: a single word.

    Returns:
        int: the estimated syllable count (0 for a word without vowels).
    """
    vowels = "aeiouy"
    lowered = word.lower()
    count = sum(1 for char in lowered if char in vowels)

    # Handle words ending with "es" and "ed": drop the ending's "e", but never
    # reduce a word to zero syllables (e.g. "red" stays at 1).
    if lowered.endswith(("es", "ed")) and count > 1:
        count -= 1

    return count

In [17]:
def calculate_syllables_per_word(text):
    """Return the average ``syllable_count`` over the cleaned words of ``text``.

    Words are the tokens returned by ``clean_text``.

    Returns:
        float: total syllables divided by the number of words, or 0.0 when the
        text yields no words (previously this raised ZeroDivisionError).
    """
    words = clean_text(text)
    if not words:
        return 0.0
    syllables = sum(syllable_count(word) for word in words)
    return syllables / len(words)

In [18]:
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is whole-word and case-insensitive, with one exception: the fully
    upper-case token "US" is skipped, because the case-insensitive match would
    otherwise count the country abbreviation as the pronoun "us".

    Args:
        text: the raw document text.

    Returns:
        int: the number of pronoun occurrences.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match != 'US')

In [19]:
def calculate_average_word_length(text):
    """Return the mean number of characters per cleaned word of ``text``.

    Words are the tokens returned by ``clean_text``.

    Returns:
        float: total characters divided by the number of words, or 0.0 when
        the text yields no words (previously this raised ZeroDivisionError).
    """
    words = clean_text(text)
    if not words:
        return 0.0
    total_characters = sum(len(word) for word in words)
    return total_characters / len(words)

In [20]:
# Accumulator for result rows (one list of metrics per analysed document).
output_data = list()

In [21]:
# Read the whole extracted article into one string and display it.
input_data = []  # placeholder until the file has been read
input_path = os.path.join('Extracted Data', 'blackassign0001')
with open(input_path, 'r') as article_file:
    input_data = article_file.read()
input_data

'ML and AI-based insurance premium model to predict premium to be charged by the insurance company\n\nWe have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040.\nRising IT cities\n\nNoida:- Noida in Uttar Pradesh near New Delhi is an emerging IT sector now. Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here. Noida has a market base of billions of dollars and is doing a great job of boosting the national economy. The establishment of so many software companies has made Noida an information technology hub

In [22]:
# Tokenise once. The scoring helpers iterate over their input item by item, so
# they must receive the LIST of cleaned words; handing them the joined string
# would make them test single characters against the dictionaries.
cleaned_words = clean_text(input_data)
cleaned_text = " ".join(cleaned_words)  # joined form kept for inspection

positive_score = calculate_positive_score(cleaned_words, positive_word_dic)
negative_score = calculate_negative_score(cleaned_words, negative_word_dic)
polarity_score = calculate_polarity_score(positive_score, negative_score)
# Subjectivity is normalised by the number of cleaned words, not by the
# character length of the joined string.
subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, len(cleaned_words))

# Readability metrics work from the raw text (they tokenise it themselves).
avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, word_count = analyze_readability(input_data)

syllables_per_word = calculate_syllables_per_word(input_data)
personal_pronouns = count_personal_pronouns(input_data)
avg_word_length = calculate_average_word_length(input_data)

In [23]:
# Gather this document's metrics into one row and add it to the results.
metrics_row = [
    positive_score,
    negative_score,
    polarity_score,
    subjectivity_score,
    avg_sentence_length,
    percentage_complex_words,
    fog_index,
    avg_words_per_sentence,
    complex_word_count,
    word_count,
    syllables_per_word,
    personal_pronouns,
    avg_word_length,
]
output_data.append(metrics_row)

In [24]:
# Column headers for the result table, one per metric in each result row.
output_columns = [
    "POSITIVE SCORE",
    "NEGATIVE SCORE",
    "POLARITY SCORE",
    "SUBJECTIVITY SCORE",
    "AVG SENTENCE LENGTH",
    "PERCENTAGE OF COMPLEX WORDS",
    "FOG INDEX",
    "AVG NUMBER OF WORDS PER SENTENCE",
    "COMPLEX WORD COUNT",
    "WORD COUNT",
    "SYLLABLE PER WORD",
    "PERSONAL PRONOUNS",
    "AVG WORD LENGTH",
]


In [25]:
# Turn the collected rows into a table using the metric column headers.
output_df = pd.DataFrame(data=output_data, columns=output_columns)

# Write the table to an Excel workbook without the index column.
output_df.to_excel(excel_writer="Output Data Structure.xlsx", index=False)