In [3]:
import nltk
import numpy as np
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from collections import Counter

# Ensure the necessary NLTK data files are downloaded.
# Newer NLTK releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead of the legacy pickled 'punkt' / 'averaged_perceptron_tagger'
# packages, so both generations are requested to keep word_tokenize() and
# pos_tag() working regardless of the installed NLTK version.
for nltkPackage in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(nltkPackage)

# Input text from the user
lineIn = input("Please enter the text to be summarized: ")

# Percentage of summarization (share of the text segments to keep, 0-100)
percentageOfSummary = int(input("Enter the percentage of summarization (e.g., 70 for 70%): "))

# Fetch NLTK's English stop words and extend them with quote/bracket symbols.
# NOTE(review): nothing in the visible code reads this set; remStopWordsOur()
# filters with its own word list.
stopWords = set(stopwords.words('english'))
# Update stop words
stopWords.update(['"', "'", ':', '(', ')', '[', ']', '{', '}'])

def getTagsForWords(textLn2):
    """Tokenize ``textLn2`` and return the part-of-speech tagging of its tokens.

    The text is split with NLTK's ``word_tokenize`` and the resulting tokens
    are passed to ``pos_tag``; whatever ``pos_tag`` produces is returned
    unchanged.
    """
    return pos_tag(word_tokenize(textLn2))

# Stop words used by remStopWordsOur(), built once instead of on every call.
# Only lower-case entries are listed: each word is lower-cased before the
# membership test, so capitalised or duplicated entries could never matter.
_OUR_STOP_WORDS = frozenset({
    'i', 'a', 'and', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com',
    'for', 'from', 'how', 'in', 'is', 'it', 'not', 'of', 'on', 'or', 'that',
    'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who', 'will',
    'with', 'www', 'your', 'am', 'some', 'you',
})


def remStopWordsOur(lineIn):
    """Return ``lineIn`` with the project's own stop words removed.

    The text is split on whitespace; every word whose lower-cased form is in
    ``_OUR_STOP_WORDS`` is dropped and the remaining words are re-joined with
    single spaces (so original spacing is not preserved).
    """
    return ' '.join(w for w in lineIn.split() if w.lower() not in _OUR_STOP_WORDS)

def preprocessText(lineIn):
    """Normalise text for word counting.

    Lower-cases ``lineIn`` and turns each of the characters ``. ; , ? ! :``
    into a single space, so the result can be split on whitespace.
    """
    return re.compile(r'[.;,?!:]').sub(' ', lineIn.lower())

def getAllLines(lineIn):
    """Split ``lineIn`` into the text segments that are scored individually.

    A segment ends right after any of ``. ; , ? !`` (the punctuation mark stays
    attached to the segment) or at a newline (the newline is dropped). Empty
    segments are never returned.

    The split is done directly with a regular expression rather than by
    inserting a sentinel character and splitting on it, so no character of
    the input (previously '§') is silently treated as a delimiter and lost.
    """
    # First alternative: a run of ordinary characters closed by one
    # punctuation mark. Second alternative: a trailing run that ends at a
    # newline or at the end of the text without punctuation.
    return re.findall(r'[^.;,?!\n]*[.;,?!]|[^.;,?!\n]+', lineIn)

def getNounPositions(type, tagged, text=None):
    """Locate every occurrence of the tokens carrying a given POS tag.

    Args:
        type: POS tag to select (e.g. ``'NNP'``). The name shadows the
            builtin but is kept so existing callers keep working.
        tagged: iterable of ``(token, tag)`` pairs.
        text: text to search. When omitted, the module-level ``lineIn`` is
            used, which is what the original implementation always did.

    Returns:
        dict mapping each selected token (original casing, in order of first
        appearance) to the list of start offsets of its whole-word,
        case-insensitive matches in the text.
    """
    if text is None:
        text = lineIn
    # Lower-case the text once instead of once per token.
    loweredText = text.lower()

    nounPosi = {}
    for token, tag in tagged:
        if tag != type or token in nounPosi:
            continue
        regExpression = r'\b' + re.escape(token.lower()) + r'\b'
        nounPosi[token] = [m.start() for m in re.finditer(regExpression, loweredText)]
    return nounPosi

def getProNounPositions(tagged, text=None):
    """Locate every occurrence of the personal pronouns found by the tagger.

    Args:
        tagged: iterable of ``(token, tag)`` pairs; only tokens tagged
            ``'PRP'`` are considered.
        text: text to search. When omitted, the module-level ``lineIn`` is
            used, which is what the original implementation always did.

    Returns:
        dict mapping each pronoun (lower-cased, in order of first appearance)
        to the list of start offsets of its whole-word, case-insensitive
        matches in the text.
    """
    if text is None:
        text = lineIn
    # Lower-case the text once instead of once per pronoun.
    loweredText = text.lower()

    proNounPosi = {}
    for token, tag in tagged:
        pronoun = token.lower()
        if tag != 'PRP' or pronoun in proNounPosi:
            continue
        regExpression = r'\b' + re.escape(pronoun) + r'\b'
        proNounPosi[pronoun] = [m.start() for m in re.finditer(regExpression, loweredText)]
    return proNounPosi

def getNearestPreviousNoun(NNP, posiOfPronoun):
    """Return the noun whose occurrence most closely precedes a pronoun.

    Args:
        NNP: dict mapping a noun to the list of its start offsets.
        posiOfPronoun: start offset of the pronoun.

    Returns:
        The key of ``NNP`` owning the largest offset that is strictly smaller
        than ``posiOfPronoun``; on equal offsets the key seen first wins.
        ``''`` when no noun occurs before the pronoun.

    The search depends only on its arguments: the previous version seeded the
    minimum distance with ``len(lineIn)`` read from module scope, which made
    the function unusable without that global.
    """
    nearKey = ''
    nearestPos = None
    for keyNNP, positions in NNP.items():
        for posNoun in positions:
            if posNoun >= posiOfPronoun:
                continue
            # Strict comparison keeps the first key on ties, as before.
            if nearestPos is None or posNoun > nearestPos:
                nearestPos = posNoun
                nearKey = keyNNP
    return nearKey

def pronounReplaceWithNearNoun(lineIn, PRP, NNP):
    """Return ``lineIn`` with each pronoun replaced by its nearest preceding noun.

    Args:
        lineIn: the text in which the offsets of ``PRP`` and ``NNP`` are valid.
        PRP: dict mapping a pronoun to the list of its start offsets.
        NNP: dict mapping a noun to the list of its start offsets.

    Returns:
        A new string; ``lineIn`` itself is not modified.

    A pronoun with no noun before it is left untouched. Previously it was
    replaced by the empty string returned by ``getNearestPreviousNoun``,
    which deleted the word from the text.
    """
    replacePRP = []
    for pronoun, positions in PRP.items():
        for pos in positions:
            nearNoun = getNearestPreviousNoun(NNP, pos)
            if nearNoun:
                replacePRP.append((pos, pronoun, nearNoun))

    # Apply the substitutions from the end of the text backwards so that the
    # offsets of the replacements still to be applied stay valid.
    replacePRP.sort(key=lambda entry: entry[0], reverse=True)

    lineInReplacePronn = lineIn
    for pos, pronoun, nearNoun in replacePRP:
        lineInReplacePronn = (
            lineInReplacePronn[:pos]
            + nearNoun
            + lineInReplacePronn[pos + len(pronoun):]
        )
    return lineInReplacePronn

def obtainPriorotyOfALine(wtForLine):
    """Pair every line weight with its rank among all weights.

    Rank 0 goes to the index that comes first in the reversed ``np.argsort``
    order (i.e. the heaviest line), rank 1 to the next one, and so on.

    Returns:
        list of ``(weight, rank)`` tuples in the same order as ``wtForLine``.
    """
    heaviestFirst = np.argsort(wtForLine)[::-1]

    rankOfLine = [0] * len(wtForLine)
    for rank, lineIndex in enumerate(heaviestFirst):
        rankOfLine[lineIndex] = rank

    return [(wtForLine[idx], rankOfLine[idx]) for idx in range(len(wtForLine))]

def obtainSummary(lineForCalc, lineForExtract, percentageOfSummary):
    """Select the highest-weighted segments of a text.

    Args:
        lineForCalc: segments used to compute the weights.
        lineForExtract: segments returned in the summary; segment ``i`` is
            kept or dropped according to the weight of ``lineForCalc[i]``, so
            it must not be longer than ``lineForCalc``.
        percentageOfSummary: percentage (0-100) of segments to keep.

    Returns:
        The kept segments of ``lineForExtract`` in their original order.

    The weight of a segment is the mean of the module-level ``freqOfWords``
    values of its words (words missing from ``freqOfWords`` count as 0).
    """
    wtForLine = []
    for line in lineForCalc:
        wInL = preprocessText(line).split()
        if not wInL:
            # A segment made only of punctuation/whitespace (e.g. one dot of
            # "...") has no words; weight it 0 instead of dividing by zero.
            wtForLine.append(0.0)
            continue
        # The words are already normalised by the preprocessText() call
        # above, so they are looked up as they are.
        wtForLn = sum((freqOfWords[w] for w in wInL if w in freqOfWords), 0.0)
        wtForLine.append(wtForLn / len(wInL))

    sentWtAndPriority = obtainPriorotyOfALine(wtForLine)
    numOfLinesInSummary = int((percentageOfSummary * len(lineForCalc)) / 100)

    # Keep a segment when its rank is among the best numOfLinesInSummary.
    return [
        line
        for li, line in enumerate(lineForExtract)
        if sentWtAndPriority[li][1] < numOfLinesInSummary
    ]

def removeWeekNmMonthNm(NNP):
    """Drop month and weekday names from ``NNP`` and return it.

    Keys are compared case-insensitively against the English month and
    weekday names. The dictionary is modified in place; the same object is
    returned for convenience.
    """
    monthNames = {'january','february','march','april','may','june','july','august','september','october','november','december'}
    weekdayNames = {'monday','tuesday','wednesday','thursday','friday','saturday','sunday'}
    calendarNames = monthNames | weekdayNames

    # Iterate over a snapshot of the keys because entries are removed on the fly.
    for name in list(NNP):
        if name.lower() in calendarNames:
            NNP.pop(name)
    return NNP

# Preprocess and analyze the input text
linePreProcessed = preprocessText(lineIn)
rmdStopWordsLn = remStopWordsOur(linePreProcessed)

# Relative frequency of every remaining word.
# The words are counted exactly as obtainSummary() will look them up (the
# whitespace-separated tokens of the preprocessed text). Stripping '-' here,
# as was done before, turned e.g. "self-governance" into a key that the
# lookup could never match. Counting the token list also means an input made
# only of stop words (nt == 0) yields an empty table instead of a
# ZeroDivisionError.
wordsForFreq = rmdStopWordsLn.split()
nt = len(wordsForFreq)
freqOfWords = Counter(wordsForFreq)
for word in freqOfWords:
    freqOfWords[word] = freqOfWords[word] / nt

# Locate proper nouns (minus month/weekday names) and personal pronouns
tagged = getTagsForWords(lineIn)
NNP = getNounPositions('NNP', tagged)
NNP = removeWeekNmMonthNm(NNP)
PRP = getProNounPositions(tagged)

# Segment both the original text and the pronoun-resolved text
linesOriginal2 = getAllLines(lineIn)
lineInReplacePronn = pronounReplaceWithNearNoun(lineIn, PRP, NNP)
linesReplacedPronn2 = getAllLines(lineInReplacePronn)

# Perform the text summarization without pronoun replacement
reducedSummaryWithoutReplc = obtainSummary(linesOriginal2, linesOriginal2, percentageOfSummary)
# Perform the text summarization with pronoun replacement
# (weights come from the pronoun-resolved text, output from the original text)
reducedSummaryWithReplc = obtainSummary(linesReplacedPronn2, linesOriginal2, percentageOfSummary)

print("Summary without pronoun replacement:")
for line in reducedSummaryWithoutReplc:
    print(line)

print("\nSummary with pronoun replacement:")
for line in reducedSummaryWithReplc:
    print(line)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Please enter the text to be summarized: In 1776, the thirteen American colonies declared their independence from British rule. The Declaration of Independence, drafted by Thomas Jefferson, outlined the colonies' grievances against King George III and asserted their right to self-governance. This historic document was signed by representatives from each of the colonies and marked the beginning of the United States of America. The ensuing Revolutionary War saw the colonies fighting for their freedom, eventually leading to victory and the establishment of a new nation based on principles of liberty and democracy.
Enter the percentage of summarization (e.g., 70 for 70%): 80
Summary without pronoun replacement:
 the thirteen American colonies declared their independence from British rule.
 The Declaration of Independence,
 drafted by Thomas Jefferson,
 outlined the colonies' grievances against King George III and asserted their right to self-governance.
 This historic document was signed by