In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [14]:
# Prompt the user to enter the text to summarize 
text = input("Enter the text to summarize: ")

Enter the text to summarize: As of May 2023, the world has been recovering from the global COVID-19 pandemic that started in 2019. Vaccination efforts have been widespread, leading to a significant reduction in cases and deaths in many countries. However, some regions are still facing challenges in containing the virus and administering vaccines.  In terms of technology, artificial intelligence and machine learning continue to advance, impacting various sectors such as healthcare, finance, and transportation. The development of self-driving cars and the integration of AI into everyday life are becoming more prevalent.  Climate change remains a pressing issue, with continued efforts to address it through international agreements and initiatives. The transition to renewable energy sources and the reduction of carbon emissions are key focuses in combating climate change.  In geopolitics, there have been ongoing shifts in global power dynamics. Economic tensions between major powers, such 

In [15]:
# Prompt the user to enter the desired percentage of summary length
percentage = float(input("Enter the percentage of summary length (0-100): "))

Enter the percentage of summary length (0-100): 30


In [25]:
# Tokenizing the sentences
sentences = sent_tokenize(text)
print("\nTokenized Sentences:")
print(sentences[0])


Tokenized Sentences:
As of May 2023, the world has been recovering from the global COVID-19 pandemic that started in 2019.


In [26]:
# Clean the sentences by removing punctuation and converting to lowercase
cleaned_sentences = [re.sub(r'[^\w\s]', '', sentence).lower() for sentence in sentences]
print("\nCleaned Sentences:")
print(cleaned_sentences[0])


Cleaned Sentences:
as of may 2023 the world has been recovering from the global covid19 pandemic that started in 2019


In [28]:
# Create a TF-IDF vectorizer and fit it on the cleaned sentences
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
tfidf_vectorizer.fit(cleaned_sentences)

TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [33]:
# Perform part-of-speech tagging on the cleaned sentences
tagged_sentences = [nltk.pos_tag(word_tokenize(sentence)) for sentence in cleaned_sentences]
print("\nTagged Sentences:")
for i, tagged_sentence in enumerate(tagged_sentences):
    print(f"Sentence {i+1}:")
    print(tagged_sentence)


Tagged Sentences:
Sentence 1:
[('as', 'IN'), ('of', 'IN'), ('may', 'MD'), ('2023', 'VB'), ('the', 'DT'), ('world', 'NN'), ('has', 'VBZ'), ('been', 'VBN'), ('recovering', 'VBG'), ('from', 'IN'), ('the', 'DT'), ('global', 'JJ'), ('covid19', 'NN'), ('pandemic', 'NN'), ('that', 'WDT'), ('started', 'VBD'), ('in', 'IN'), ('2019', 'CD')]
Sentence 2:
[('vaccination', 'NN'), ('efforts', 'NNS'), ('have', 'VBP'), ('been', 'VBN'), ('widespread', 'JJ'), ('leading', 'VBG'), ('to', 'TO'), ('a', 'DT'), ('significant', 'JJ'), ('reduction', 'NN'), ('in', 'IN'), ('cases', 'NNS'), ('and', 'CC'), ('deaths', 'NNS'), ('in', 'IN'), ('many', 'JJ'), ('countries', 'NNS')]
Sentence 3:
[('however', 'RB'), ('some', 'DT'), ('regions', 'NNS'), ('are', 'VBP'), ('still', 'RB'), ('facing', 'VBG'), ('challenges', 'NNS'), ('in', 'IN'), ('containing', 'VBG'), ('the', 'DT'), ('virus', 'NN'), ('and', 'CC'), ('administering', 'VBG'), ('vaccines', 'NNS')]
Sentence 4:
[('in', 'IN'), ('terms', 'NNS'), ('of', 'IN'), ('technology

In [34]:
sentence_scores = {}
total_word_count = sum(len(sentence.split()) for sentence in sentences)
print("\nTotal Word Count:", total_word_count)


Total Word Count: 253


In [35]:
# Calculate the number of words to include in the summary based on the selected percentage
num_words = int((percentage / 100) * total_word_count)
print("Number of Words in Summary:", num_words)

Number of Words in Summary: 75


In [36]:
# Calculate sentence scores based on TF-IDF scores and part-of-speech information
for i, sentence in enumerate(cleaned_sentences):
    sentence_tokens = word_tokenize(sentence)
    sentence_tfidf_scores = [tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_.get(token, 0)] 
                             for token in sentence_tokens]
    pos_scores = [1 if tag.startswith('NN') or tag.startswith('VB') else 0 
                  for _, tag in tagged_sentences[i]]
    sentence_scores[sentence] = sum(sentence_tfidf_scores) * sum(pos_scores)

print("\nSentence Scores:")
for sentence, score in sentence_scores.items():
    print(sentence, ":", score)


Sentence Scores:
as of may 2023 the world has been recovering from the global covid19 pandemic that started in 2019 : 434.6506836925514
vaccination efforts have been widespread leading to a significant reduction in cases and deaths in many countries : 455.60956734420046
however some regions are still facing challenges in containing the virus and administering vaccines : 341.65373180327623
in terms of technology artificial intelligence and machine learning continue to advance impacting various sectors such as healthcare finance and transportation : 762.8359210393012
the development of selfdriving cars and the integration of ai into everyday life are becoming more prevalent : 415.56032880359226
climate change remains a pressing issue with continued efforts to address it through international agreements and initiatives : 401.74256121886845
the transition to renewable energy sources and the reduction of carbon emissions are key focuses in combating climate change : 687.5159276111085
in ge

In [37]:
# Select the most important sentences for the summary based on the number of words
summary_sentences = []
current_word_count = 0
for sentence in sentences:
    if current_word_count + len(sentence.split()) <= num_words:
        summary_sentences.append(sentence)
        current_word_count += len(sentence.split())
    else:
        break

summary = ' '.join(summary_sentences)

print("\nSummary:")
print(summary)


Summary:
As of May 2023, the world has been recovering from the global COVID-19 pandemic that started in 2019. Vaccination efforts have been widespread, leading to a significant reduction in cases and deaths in many countries. However, some regions are still facing challenges in containing the virus and administering vaccines. In terms of technology, artificial intelligence and machine learning continue to advance, impacting various sectors such as healthcare, finance, and transportation.
