In [5]:
# Install all required packages
!pip install torch transformers datasets evaluate numpy pandas nltk sacremoses sacrebleu

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import nltk
from nltk.tokenize import sent_tokenize
import re

# Download NLTK data
nltk.download('punkt')

class TextSimplificationSystem:
    def __init__(self, bert_model, tokenizer):
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        self.complex_word_threshold = 0.7

    def split_complex_sentences(self, text):
        """Split complex sentences into simpler ones"""
        sentences = sent_tokenize(text)
        simplified_sentences = []

        for sentence in sentences:
            # Split on certain conjunctions and relative pronouns
            parts = re.split(r',\s*which\s+|,\s*and\s+|;\s*', sentence)
            simplified_sentences.extend([p.strip() for p in parts if p.strip()])

        return simplified_sentences

    def identify_complex_words(self, sentence):
        """Identify potentially complex words in the sentence"""
        tokens = self.tokenizer.tokenize(sentence)
        complex_words = []

        for token in tokens:
            if len(token) > 8 and not token.startswith('##'):  # Simple length-based heuristic
                complex_words.append(token)

        return complex_words

    def get_simpler_alternatives(self, word, context):
        """Generate simpler alternatives for a complex word"""
        masked_context = context.replace(word, self.tokenizer.mask_token)
        inputs = self.tokenizer(masked_context, return_tensors="pt")

        with torch.no_grad():
            outputs = self.bert_model(**inputs)
            predictions = outputs.logits

        mask_idx = torch.where(inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]

        top_k = 5
        probs, indices = torch.topk(predictions[0, mask_idx], top_k)

        alternatives = [self.tokenizer.decode([idx]) for idx in indices[0]]
        filtered_alternatives = [alt for alt in alternatives
                              if len(alt) < len(word) and alt.isalpha()]

        return filtered_alternatives[0] if filtered_alternatives else word

    def simplify(self, text):
        """Main simplification pipeline"""
        simple_sentences = self.split_complex_sentences(text)

        simplified_sentences = []
        for sentence in simple_sentences:
            complex_words = self.identify_complex_words(sentence)

            simplified_sentence = sentence
            for word in complex_words:
                simpler_word = self.get_simpler_alternatives(word, sentence)
                if simpler_word != word:
                    simplified_sentence = simplified_sentence.replace(word, simpler_word)

            simplified_sentences.append(simplified_sentence)

        return ' '.join(simplified_sentences)

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')

# Create simplification system
simplification_system = TextSimplificationSystem(
    bert_model=bert_model,
    tokenizer=tokenizer
)

# Test the system with multiple examples
test_texts = [
    "The economic outlook, which has been fluctuating due to various global factors, remains uncertain for the foreseeable future.",
    "The implementation of sophisticated algorithms necessitates substantial computational resources.",
    "The quantum mechanical interpretation demonstrates extraordinary complexity in theoretical physics."
]

for text in test_texts:
    print("\nOriginal:", text)
    simplified_text = simplification_system.simplify(text)
    print("Simplified:", simplified_text)

    # Calculate metrics
    references = [text]  # Using original as reference for demonstration

    def calculate_sari(original, simplified, references):
        from evaluate import load
        sari_metric = load('sari')
        results = sari_metric.compute(
            sources=[original],
            predictions=[simplified],
            references=[references]
        )
        return results['sari']

    def calculate_bleu(original, simplified):
        from evaluate import load
        bleu_metric = load('bleu')
        results = bleu_metric.compute(
            predictions=[simplified.split()],
            references=[[original.split()]]
        )
        return results['bleu']

    try:
        sari_score = calculate_sari(text, simplified_text, references)
        bleu_score = calculate_bleu(text, simplified_text)
        print(f"SARI score: {sari_score:.2f}")
        print(f"BLEU score: {bleu_score:.2f}")
    except Exception as e:
        print(f"Error calculating metrics: {e}")

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected pa

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Original: The economic outlook, which has been fluctuating due to various global factors, remains uncertain for the foreseeable future.
Simplified: The economic outlook has been fluctuating due to various global factors, remains stable for the foreseeable future.


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Error calculating metrics: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['The', 'economic', 'outlook', ..., 'the', 'foreseeable', 'future.'],
Input references: [['The', 'economic', 'outlook,', 'which', 'has', 'been', 'fluctuating', 'due', 'to', 'various', 'global', 'factors,', 'remains', 'uncertain', 'for', 'the', 'foreseeable', 'future.']]

Original: The implementation of sophisticated algorithms necessitates substantial computational resources.
Simplified: The development of such software necessitates large computing effort.
Error calculating metrics: Predictions and/or references don't match the expected format.
Expected format:
Feature opt

In [6]:
# Install required packages
!pip install -q transformers datasets nltk stanza spacy

import numpy as np
import pandas as pd
import nltk
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import wordnet as wn
from collections import defaultdict

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

class SimplificationPipeline:
    def __init__(self):
        # Initialize BERT model for word substitution
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
        self.nlp = spacy.load('en_core_web_sm')

    def simplify_text(self, text):
        """Main simplification pipeline"""
        # Split into sentences
        sentences = self.split_sentences(text)

        # Process each sentence
        simplified_sentences = []
        for sentence in sentences:
            # Split complex sentences
            split_sent = self.split_complex_sentence(sentence)

            # Delete unnecessary information
            cleaned_sent = self.delete_unnecessary_info(split_sent)

            # Identify and replace complex words
            simplified_sent = self.simplify_complex_words(cleaned_sent)

            # Reorder if necessary
            reordered_sent = self.reorder_sentence(simplified_sent)

            simplified_sentences.extend(reordered_sent)

        return ' '.join(simplified_sentences)

    def split_sentences(self, text):
        """Split text into sentences"""
        return sent_tokenize(text)

    def split_complex_sentence(self, sentence):
        """Split complex sentences into simpler ones"""
        doc = self.nlp(sentence)

        # Initialize list to store split sentences
        split_sentences = []
        current_sentence = []

        for token in doc:
            current_sentence.append(token.text)

            # Split on coordinating conjunctions between clauses
            if token.dep_ == 'cc' and token.head.pos_ == 'VERB':
                split_sentences.append(' '.join(current_sentence))
                current_sentence = []

        if current_sentence:
            split_sentences.append(' '.join(current_sentence))

        return split_sentences if split_sentences else [sentence]

    def delete_unnecessary_info(self, sentences):
        """Remove non-essential information"""
        cleaned_sentences = []

        for sentence in sentences:
            doc = self.nlp(sentence)

            # Remove parenthetical expressions
            cleaned_text = ' '.join([token.text for token in doc
                                  if not any(ancestor.dep_ == 'appos'
                                           for ancestor in token.ancestors)])

            cleaned_sentences.append(cleaned_text)

        return cleaned_sentences

    def simplify_complex_words(self, sentences):
        """Identify and replace complex words"""
        simplified_sentences = []

        for sentence in sentences:
            doc = self.nlp(sentence)
            simplified_words = []

            for token in doc:
                # Skip simple words, punctuation, and proper nouns
                if (len(token.text) < 7 or token.is_punct or token.is_stop
                    or token.pos_ == 'PROPN'):
                    simplified_words.append(token.text)
                    continue

                # Find simpler alternative
                simple_word = self.find_simpler_word(token.text, sentence)
                simplified_words.append(simple_word or token.text)

            simplified_sentences.append(' '.join(simplified_words))

        return simplified_sentences

    def find_simpler_word(self, word, context):
        """Find a simpler alternative for a complex word"""
        # Mask the word in context
        masked_text = context.replace(word, self.tokenizer.mask_token)

        # Get model predictions
        inputs = self.tokenizer(masked_text, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Get top 5 predictions
        mask_idx = torch.where(inputs['input_ids'][0] == self.tokenizer.mask_token_id)[0]
        if len(mask_idx) == 0:
            return None

        probs = outputs.logits[0, mask_idx].softmax(dim=-1)
        top_k = torch.topk(probs, 5, dim=-1)

        # Select the shortest word from top predictions that's not the same as original
        candidates = [self.tokenizer.decode([token_id]) for token_id in top_k.indices[0]]
        candidates = [c for c in candidates if c.lower() != word.lower() and len(c) < len(word)]

        return candidates[0] if candidates else None

    def reorder_sentence(self, sentences):
        """Reorder sentences in logical order"""
        # For now, just return the sentences as is
        # Future: Implement cause-effect ordering
        return sentences

class Evaluator:
    def calculate_sari(self, original, simplified, references):
        """Calculate SARI score"""
        # Tokenize all inputs
        orig_tokens = set(word_tokenize(original.lower()))
        simp_tokens = set(word_tokenize(simplified.lower()))
        ref_tokens = [set(word_tokenize(ref.lower())) for ref in references]

        # Calculate add score
        add_score = self._calculate_add_score(orig_tokens, simp_tokens, ref_tokens)

        # Calculate keep score
        keep_score = self._calculate_keep_score(orig_tokens, simp_tokens, ref_tokens)

        # Calculate delete score
        delete_score = self._calculate_delete_score(orig_tokens, simp_tokens, ref_tokens)

        # Return SARI score
        return (add_score + keep_score + delete_score) / 3

    def _calculate_add_score(self, orig_tokens, simp_tokens, ref_tokens):
        """Calculate addition score"""
        added = simp_tokens - orig_tokens
        ref_added = [ref - orig_tokens for ref in ref_tokens]

        precision = len([t for t in added if any(t in ref_add for ref_add in ref_added)]) / (len(added) or 1)
        recall = np.mean([len(added & ref_add) / (len(ref_add) or 1) for ref_add in ref_added])

        return (precision + recall) / 2 if precision + recall > 0 else 0

    def _calculate_keep_score(self, orig_tokens, simp_tokens, ref_tokens):
        """Calculate keep score"""
        kept = orig_tokens & simp_tokens
        ref_kept = [orig_tokens & ref for ref in ref_tokens]

        precision = len([t for t in kept if any(t in ref_k for ref_k in ref_kept)]) / (len(kept) or 1)
        recall = np.mean([len(kept & ref_k) / (len(ref_k) or 1) for ref_k in ref_kept])

        return (precision + recall) / 2 if precision + recall > 0 else 0

    def _calculate_delete_score(self, orig_tokens, simp_tokens, ref_tokens):
        """Calculate deletion score"""
        deleted = orig_tokens - simp_tokens
        ref_deleted = [orig_tokens - ref for ref in ref_tokens]

        precision = len([t for t in deleted if any(t in ref_del for ref_del in ref_deleted)]) / (len(deleted) or 1)
        recall = np.mean([len(deleted & ref_del) / (len(ref_del) or 1) for ref_del in ref_deleted])

        return (precision + recall) / 2 if precision + recall > 0 else 0

# Example usage
def main():
    # Initialize the pipeline
    simplifier = SimplificationPipeline()
    evaluator = Evaluator()

    # Example text
    text = "The economic outlook, which has been fluctuating due to various global factors, remains uncertain for the foreseeable future."

    # Simplify the text
    simplified = simplifier.simplify_text(text)

    # Example references for evaluation
    references = [
        "The economic outlook is uncertain because of global factors.",
        "Global factors make the economic future uncertain."
    ]

    # Calculate SARI score
    sari_score = evaluator.calculate_sari(text, simplified, references)

    print("Original:", text)
    print("Simplified:", simplified)
    print("SARI Score:", sari_score)

if __name__ == "__main__":
    main()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceCl

Original: The economic outlook, which has been fluctuating due to various global factors, remains uncertain for the foreseeable future.
Simplified: The current future , which has been uncertain due to various global events , is stable for the near future .
SARI Score: 0.3920787545787546
