In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import brown, wordnet
from collections import Counter
from nltk import RegexpParser
from nltk.tree import Tree

In [13]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ruchi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ruchi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ruchi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ruchi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [11]:
jargon_df = pd.read_csv(R"C:\Users\ruchi\Documents\dataset_1.csv") #DATASET
jargon_text_combined = ' '.join(jargon_df['Original']).lower()#CONVERT TO LOWERCASE

jargon_text_cleaned = re.sub(r'[^a-z\s]', '', jargon_text_combined) #REMOVE OTHER CHARS



jargon_word_freqs = Counter(jargon_text_cleaned.split())
print("✅ ANALYZED 'DATASET_1.CSV' FOR WORD FREQUENCIES.")

✅ ANALYZED 'DATASET_1.CSV' FOR WORD FREQUENCIES.


In [14]:
general_word_freqs = Counter(word.lower() for word in brown.words())
print("✅ GENERAL ENGLISH CORPUS ANALYSIS COMPLETE.")
print("-" * 30)

✅ GENERAL ENGLISH CORPUS ANALYSIS COMPLETE.
------------------------------


In [15]:
def calculate_jargon_score(word, jargon_corpus_freq, general_corpus_freq):

    freq_in_jargon = jargon_corpus_freq.get(word, 0) + 1 #ADD 1 TO AVOID ZERO DIVISION
    freq_in_general = general_corpus_freq.get(word, 0) + 1
    jargon_total = sum(jargon_corpus_freq.values())
    general_total = sum(general_corpus_freq.values())
    score = (freq_in_jargon / jargon_total) / (freq_in_general / general_total)
    return score

In [16]:
def extract_noun_phrase_from_definition(definition):

    tokens = nltk.word_tokenize(definition)
    tagged_tokens = nltk.pos_tag(tokens)

    # RE-CREATING PARSER LOCALLY FOR SELF-CONTAINMENT
    grammar_local = r""" NP: {<DT|JJ|NN.*>*<NN.*>} """
    cp_local = RegexpParser(grammar_local)
    parsed_definition = cp_local.parse(tagged_tokens)

    # LOGIC TO SELECT THE MOST RELEVANT NOUN PHRASE (PRIORITIZE THE FIRST ONE)
    for chunk in parsed_definition:
        if isinstance(chunk, Tree) and chunk.label() == 'NP':



            return " ".join([word for word, tag in chunk.leaves()]) # RETURN THE FIRST NOUN PHRASE FOUND AS A STRING

    # IF NO NOUN PHRASE IS FOUND, RETURN THE ORIGINAL DEFINITION
    return definition

In [48]:
def get_simplified_meaning(word):
    synsets = wordnet.synsets(word)
    if synsets:
        definition = synsets[0].definition()
        if definition:
            extracted_np = extract_noun_phrase_from_definition(definition)
            if extracted_np:
                return extracted_np
            else:
                return definition

        synonyms = set()
        for syn in synsets:
            for lemma in syn.lemmas():
                if lemma.name().lower() != word.lower() and '_' not in lemma.name():
                    synonyms.add(lemma.name())
        if synonyms:
            return list(synonyms)[0]

    return word
    # This function retrieves a simplified meaning for a given word,
    # prioritizing a noun phrase from its WordNet definition or a simple synonym.

In [17]:
grammar = r"""
  NP: {<DT|JJ|NN.*>*<NN.*>} # CHUNK DT, JJ, NN.* FOLLOWED BY NN.*
"""

In [50]:
noun_phrase_parser = RegexpParser(grammar)


def simplify_sentence_with_nltk(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = nltk.pos_tag(tokens)

    parsed_sentence = noun_phrase_parser.parse(tagged_tokens)

    simplified_tokens = []
    JARGON_SCORE_THRESHOLD = 5.0
    JARGON_POS_TAGS = lambda tag: tag.startswith('NN')

    for chunk in parsed_sentence:
        if isinstance(chunk, Tree) and chunk.label() == 'NP':
            is_jargon_np = False
            jargon_word_in_np = None
            jargon_word_index_in_chunk = -1

            np_leaves = chunk.leaves()
            for j, (np_word, np_tag) in enumerate(np_leaves):
                score = calculate_jargon_score(np_word.lower(), jargon_word_freqs, general_word_freqs)
                if score > JARGON_SCORE_THRESHOLD and JARGON_POS_TAGS(np_tag):
                    is_jargon_np = True
                    jargon_word_in_np = np_word
                    jargon_word_index_in_chunk = j
                    break

            if is_jargon_np and jargon_word_in_np:
                simple_meaning_text = get_simplified_meaning(jargon_word_in_np.lower())

                modified_np_tokens = []
                for i, (np_word, np_tag) in enumerate(np_leaves):
                    if i == jargon_word_index_in_chunk:
                         modified_np_tokens.append(simple_meaning_text)
                    else:
                        modified_np_tokens.append(np_word)

                simplified_tokens.extend(modified_np_tokens)

            else:
                simplified_tokens.extend([word for word, tag in chunk.leaves()])
        else:
            simplified_tokens.append(chunk[0])

    simplified_sentence = ""
    for i, token in enumerate(simplified_tokens):
        if simplified_sentence and not token in ('.', ',', '!', '?', ':', ';', ')') and not simplified_sentence.endswith('('):
            simplified_sentence += " "
        simplified_sentence += token

    simplified_sentence = re.sub(r'\s+([?.!,:;])', r'\1', simplified_sentence)
    simplified_sentence = re.sub(r'\(\s+', '(', simplified_sentence)
    simplified_sentence = re.sub(r'\s+\)', ')', simplified_sentence)
    simplified_sentence = simplified_sentence.replace('(', '').replace(')', '')


    simplified_sentence = simplified_sentence.replace(" subclass of ", " ").replace(" where the usage is restricted to ", " ").replace(" that is ", " ")

    return simplified_sentence
    # This function simplifies a sentence by identifying and replacing
    # jargon within noun phrases using NLTK and WordNet.

In [47]:
# --- GET INPUT FROM USER AND SIMPLIFY ---
user_sentence = input("ENTER A SENTENCE TO SIMPLIFY: ")
# SIMPLIFY THE USER'S SENTENCE
simplified_output = simplify_sentence_with_nltk(user_sentence)
print("\nSIMPLIFIED SENTENCE:")
# PRINT THE SIMPLIFIED SENTENCE
print(simplified_output)

ENTER A SENTENCE TO SIMPLIFY: i am a lawyer

SIMPLIFIED SENTENCE:
i am a a professional person


##BLEU SCORE FROM GEMINI -


In [52]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Assuming jargon_df is already loaded and contains 'Original' and 'Simplified' columns

# Prepare data for BLEU score calculation
# BLEU score requires the reference to be a list of sentences (even if there's only one reference)
# and the hypothesis to be a single sentence.
# We will treat 'Original' as the reference and 'Simplified' as the hypothesis.

references = []
hypotheses = []

for index, row in jargon_df.iterrows():
    # Check if 'Original' and 'Simplified' are valid strings before processing
    original_sentence = row['Original']
    simplified_sentence = row['Simplified']

    if isinstance(original_sentence, str) and isinstance(simplified_sentence, str):
        # Each original sentence is a reference, represented as a list containing one sentence.
        references.append([original_sentence.lower().split()]) # Convert to lowercase and split into words
        # Each simplified sentence is a hypothesis, as a list of words.
        hypotheses.append(simplified_sentence.lower().split()) # Convert to lowercase and split into words
    else:
        # Optionally, print a message or count skipped rows
        print(f"Skipping row {index} due to invalid data in 'Original' or 'Simplified' columns.")


# Calculate BLEU scores for each sentence pair
bleu_scores = []
# Using SmoothingFunction() to handle cases where there are no matching n-grams
smoothie = SmoothingFunction().method4

# Ensure there are hypotheses to calculate scores
if hypotheses:
    for ref, hyp in zip(references, hypotheses):
        score = sentence_bleu(ref, hyp, smoothing_function=smoothie)
        bleu_scores.append(score)

    # Calculate the average BLEU score
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)

    print(f"Average BLEU score for the dataset: {average_bleu_score}")
else:
    print("No valid simplified sentences found to calculate BLEU score.")

Skipping row 1135 due to invalid data in 'Original' or 'Simplified' columns.
Average BLEU score for the dataset: 0.734554566819249
