In [1]:
text = """Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal. Unqualified, the word football generally means the form of football that is the most popular where the word is used. Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football. These various forms of football share, to varying degrees, common origins and are known as "football codes".

There are a number of references to traditional, ancient, or prehistoric ball games played in many different parts of the world. Contemporary codes of football can be traced back to the codification of these games at English public schools during the 19th century, itself an outgrowth of medieval football. The expansion and cultural power of the British Empire allowed these rules of football to spread to areas of British influence outside the directly controlled empire. By the end of the 19th century, distinct regional codes were already developing: Gaelic football, for example, deliberately incorporated the rules of local traditional football games in order to maintain their heritage. In 1888, the Football League was founded in England, becoming the first of many professional football associations. During the 20th century, several of the various kinds of football grew to become some of the most popular team sports in the world.
The various codes of football share certain common elements and can be grouped into two main classes of football: carrying codes like American football, Canadian football, Australian football, rugby union and rugby league, where the ball is moved about the field while being held in the hands or thrown, and kicking codes such as association football and Gaelic football, where the ball is moved primarily with the feet, and where handling is strictly limited.
"""
# This is the piece of text which we will be using in this project. We will perform extractive summarization of this text.

In [2]:
len(text) #We use this function to check the length of the given paragraph that is assigned to text.

2121

In [3]:
#spacy for Natural Language Processing.
#STOP_WORDS is a set of default stop words for English language model in SpaCy.
#punctuation is a pre-initialized string which will give the all sets of punctuation
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [4]:
# spacy.load is used to load a model. 
# spacy.load('en_core_web_sm') loads the model package en_core_web_sm. 
# This will return a language object nlp containing all components and data needed to process text.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Calling the nlp object on a string of text will return a processed Doc. 
# During processing, spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on.
doc = nlp(text)

In [6]:
# Each Doc consists of individual tokens, and we can iterate over them. 
# Now we will make a list of tokens called tokens.
tokens = [token.text.lower() for token in doc
          if not token.is_stop and
          not token.is_punct and
          token.text !='\n' ]

In [7]:
tokens

['football',
 'family',
 'team',
 'sports',
 'involve',
 'varying',
 'degrees',
 'kicking',
 'ball',
 'score',
 'goal',
 'unqualified',
 'word',
 'football',
 'generally',
 'means',
 'form',
 'football',
 'popular',
 'word',
 'sports',
 'commonly',
 'called',
 'football',
 'include',
 'association',
 'football',
 'known',
 'soccer',
 'australia',
 'canada',
 'south',
 'africa',
 'united',
 'states',
 'ireland',
 'new',
 'zealand',
 'australian',
 'rules',
 'football',
 'gaelic',
 'football',
 'gridiron',
 'football',
 'specifically',
 'american',
 'football',
 'arena',
 'football',
 'canadian',
 'football',
 'international',
 'rules',
 'football',
 'rugby',
 'league',
 'football',
 'rugby',
 'union',
 'football',
 'forms',
 'football',
 'share',
 'varying',
 'degrees',
 'common',
 'origins',
 'known',
 'football',
 'codes',
 '\n\n',
 'number',
 'references',
 'traditional',
 'ancient',
 'prehistoric',
 'ball',
 'games',
 'played',
 'different',
 'parts',
 'world',
 'contemporary',
 'co

In [8]:
# We can see that all the punctuation marks and special characters are included in the tokens. 
# Now we will remove them.
tokens1=[]
stopwords = list(STOP_WORDS)
allowed_pos = ['ADJ', 'PROPN', 'VERB', 'NOUN']
for token in  doc:
  if token.text in stopwords or token.text in punctuation:
    continue
  if token.pos_ in allowed_pos:
    tokens1.append(token.text)

In [9]:
tokens1

['Football',
 'family',
 'team',
 'sports',
 'involve',
 'varying',
 'degrees',
 'kicking',
 'ball',
 'score',
 'goal',
 'Unqualified',
 'word',
 'football',
 'means',
 'form',
 'football',
 'popular',
 'word',
 'Sports',
 'called',
 'football',
 'include',
 'association',
 'football',
 'known',
 'soccer',
 'Australia',
 'Canada',
 'South',
 'Africa',
 'United',
 'States',
 'Ireland',
 'New',
 'Zealand',
 'Australian',
 'rules',
 'football',
 'Gaelic',
 'football',
 'gridiron',
 'football',
 'American',
 'football',
 'arena',
 'football',
 'Canadian',
 'football',
 'International',
 'rules',
 'football',
 'rugby',
 'league',
 'football',
 'rugby',
 'union',
 'football',
 'forms',
 'football',
 'share',
 'varying',
 'degrees',
 'common',
 'origins',
 'known',
 'football',
 'codes',
 'number',
 'references',
 'traditional',
 'ancient',
 'prehistoric',
 'ball',
 'games',
 'played',
 'different',
 'parts',
 'world',
 'Contemporary',
 'codes',
 'football',
 'traced',
 'codification',
 'game

In [10]:
# Using Counter from the collections module is efficient for counting the frequency of elements, which can be helpful in text summarization for tasks.
# Like identifying the most common words or phrases. 
#Ensure you preprocess the text (e.g., tokenization) before applying it to get meaningful results.
from collections import Counter 

In [11]:
word_freq = Counter(tokens1) #number of occurences in the text.

In [12]:
word_freq

Counter({'football': 29,
         'codes': 6,
         'ball': 4,
         'rules': 4,
         'rugby': 4,
         'Gaelic': 3,
         'games': 3,
         'century': 3,
         'Football': 2,
         'team': 2,
         'sports': 2,
         'varying': 2,
         'degrees': 2,
         'kicking': 2,
         'word': 2,
         'popular': 2,
         'association': 2,
         'known': 2,
         'Australian': 2,
         'American': 2,
         'Canadian': 2,
         'league': 2,
         'union': 2,
         'share': 2,
         'common': 2,
         'traditional': 2,
         'world': 2,
         '19th': 2,
         'British': 2,
         'moved': 2,
         'family': 1,
         'involve': 1,
         'score': 1,
         'goal': 1,
         'Unqualified': 1,
         'means': 1,
         'form': 1,
         'Sports': 1,
         'called': 1,
         'include': 1,
         'soccer': 1,
         'Australia': 1,
         'Canada': 1,
         'South': 1,
         'Africa'

In [13]:
max_freq = max(word_freq.values()) # We use this function to find the max frequency of the repeating word.

In [14]:
max_freq

29

In [15]:
for word in word_freq.keys():
  word_freq[word] = word_freq[word]/max_freq #calculates the nominal value of the words.

In [16]:
word_freq

Counter({'football': 1.0,
         'codes': 0.20689655172413793,
         'ball': 0.13793103448275862,
         'rules': 0.13793103448275862,
         'rugby': 0.13793103448275862,
         'Gaelic': 0.10344827586206896,
         'games': 0.10344827586206896,
         'century': 0.10344827586206896,
         'Football': 0.06896551724137931,
         'team': 0.06896551724137931,
         'sports': 0.06896551724137931,
         'varying': 0.06896551724137931,
         'degrees': 0.06896551724137931,
         'kicking': 0.06896551724137931,
         'word': 0.06896551724137931,
         'popular': 0.06896551724137931,
         'association': 0.06896551724137931,
         'known': 0.06896551724137931,
         'Australian': 0.06896551724137931,
         'American': 0.06896551724137931,
         'Canadian': 0.06896551724137931,
         'league': 0.06896551724137931,
         'union': 0.06896551724137931,
         'share': 0.06896551724137931,
         'common': 0.06896551724137931,
       

In [17]:
# Using sent.text for sent in doc.sents is a good way to extract sentences from a text when using a library like spaCy. 
# Ensure the text is properly preprocessed (e.g., punctuation handled and language model loaded) for accurate sentence segmentation. 
# Additionally, consider handling edge cases, such as abbreviations or unusual sentence structures, which might affect sentence splitting.
sent_token = [sent.text for sent in doc.sents]

In [18]:
sent_token #List. Where each element represents a sentence from the text.

['Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.',
 'Unqualified, the word football generally means the form of football that is the most popular where the word is used.',
 'Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football.',
 'These various forms of football share, to varying degrees, common origins and are known as "football codes".\n\n',
 'There are a number of references to traditional, ancient, or prehistoric ball games played in many different parts of the world.',
 'Contemporary codes of football can be traced back to the codification of these games at English public schools during the 19th c

In [19]:
sent_score = {} # Empty dictionary which will store the scores of each sentences.
for sent in sent_token: #Iterates and contains each sentence in the token.
  for word in sent.split():
    if word.lower() in word_freq.keys():
      if sent not in sent_score.keys():
        sent_score[sent] = word_freq[word.lower()]
      else:
        sent_score[sent] += word_freq[word.lower()]
    print(word)
# Calculate and store scores for each sentence based on word frequencies, iterating through sentences and their words.

Football
is
a
family
of
team
sports
that
involve,
to
varying
degrees,
kicking
a
ball
to
score
a
goal.
Unqualified,
the
word
football
generally
means
the
form
of
football
that
is
the
most
popular
where
the
word
is
used.
Sports
commonly
called
football
include
association
football
(known
as
soccer
in
Australia,
Canada,
South
Africa,
the
United
States,
and
sometimes
in
Ireland
and
New
Zealand);
Australian
rules
football;
Gaelic
football;
gridiron
football
(specifically
American
football,
arena
football,
or
Canadian
football);
International
rules
football;
rugby
league
football;
and
rugby
union
football.
These
various
forms
of
football
share,
to
varying
degrees,
common
origins
and
are
known
as
"football
codes".
There
are
a
number
of
references
to
traditional,
ancient,
or
prehistoric
ball
games
played
in
many
different
parts
of
the
world.
Contemporary
codes
of
football
can
be
traced
back
to
the
codification
of
these
games
at
English
public
schools
during
the
19th
century,
itself
an
outgrowt

In [20]:
sent_score # `sent_score` stores the cumulative frequency-based scores of each sentence, calculated using word frequencies from `word_freq`.

{'Football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.': 1.482758620689655,
 'Unqualified, the word football generally means the form of football that is the most popular where the word is used.': 2.275862068965518,
 'Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football.': 3.999999999999999,
 'These various forms of football share, to varying degrees, common origins and are known as "football codes".\n\n': 1.2758620689655171,
 'There are a number of references to traditional, ancient, or prehistoric ball games played in many different parts of the world.': 0.4482758620689655,
 'Contemporary codes of football 

In [21]:
# Import pandas as pd imports the pandas library, commonly used for data manipulation and analysis, with pd as its alias.
import pandas as pd 

In [22]:
# This code converts the `sent_score` dictionary into a pandas DataFrame with two columns: 'Sentence' (keys) and 'Score' (values).
pd.DataFrame(list(sent_score.items()), columns=['Sentence', 'Score']) 

Unnamed: 0,Sentence,Score
0,Football is a family of team sports that invol...,1.482759
1,"Unqualified, the word football generally means...",2.275862
2,Sports commonly called football include associ...,4.0
3,"These various forms of football share, to vary...",1.275862
4,There are a number of references to traditiona...,0.448276
5,Contemporary codes of football can be traced b...,1.586207
6,The expansion and cultural power of the Britis...,1.448276
7,"By the end of the 19th century, distinct regio...",1.827586
8,"In 1888, the Football League was founded in En...",2.137931
9,"During the 20th century, several of the variou...",1.310345


In [23]:
# `from heapq import nlargest` imports the `nlargest` function, which is used to retrieve the top `n` largest elements from a dataset efficiently.
from heapq import nlargest 

In [24]:
# Select the top `num_sentences` (3 in this case) sentences with the highest scores from `sent_score` using the `nlargest` function.
num_sentences = 3
n = nlargest(num_sentences, sent_score, key=sent_score.get) #Executes top 3 sentences based on their score.

In [25]:
n

['The various codes of football share certain common elements and can be grouped into two main classes of football: carrying codes like American football, Canadian football, Australian football, rugby union and rugby league, where the ball is moved about the field while being held in the hands or thrown, and kicking codes such as association football and Gaelic football, where the ball is moved primarily with the feet, and where handling is strictly limited.\n',
 'Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football.',
 'Unqualified, the word football generally means the form of football that is the most popular where the word is used.']

In [26]:
" ".join(n) # Performs extractive text summarization.

'The various codes of football share certain common elements and can be grouped into two main classes of football: carrying codes like American football, Canadian football, Australian football, rugby union and rugby league, where the ball is moved about the field while being held in the hands or thrown, and kicking codes such as association football and Gaelic football, where the ball is moved primarily with the feet, and where handling is strictly limited.\n Sports commonly called football include association football (known as soccer in Australia, Canada, South Africa, the United States, and sometimes in Ireland and New Zealand); Australian rules football; Gaelic football; gridiron football (specifically American football, arena football, or Canadian football); International rules football; rugby league football; and rugby union football. Unqualified, the word football generally means the form of football that is the most popular where the word is used.'

In [27]:
# tkinter and components: Imports modules for creating a GUI (tkinter, scrolledtext, messagebox) to enable text input/output and user interaction.
# Summarization code: Imports necessary libraries for natural language processing (spacy, STOP_WORDS, punctuation, Counter, nlargest).
# Done to process text and calculate summaries efficiently.

import tkinter as tk
from tkinter import scrolledtext
from tkinter import messagebox
from tkinter import END
#Importing the summarization code
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest

In [None]:
# Install required packages
!apt-get install xvfb  # Install X Virtual Frame Buffer
import os
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
import tkinter as tk
from tkinter import scrolledtext

# Load spaCy language model
nlp = spacy.load("en_core_web_sm")
def summarize():
    """Summarize the input text and display the result."""
    text = input_box.get("1.0", tk.END).strip()  # Get text from input box
    if not text:
        result_box.delete("1.0", tk.END)
        result_box.insert(tk.END, "Please enter some text to summarize.")
        return
    # Tokenization and text processing
    doc = nlp(text)
    tokens = [token.text for token in doc]
    
    # Calculate word frequencies, excluding stop words and punctuation
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in STOP_WORDS and word.text not in punctuation:
            word_frequencies[word.text] = word_frequencies.get(word.text, 0) + 1
    # Normalize word frequencies
    max_frequency = max(word_frequencies.values(), default=1)
    word_frequencies = {word: freq / max_frequency for word, freq in word_frequencies.items()}
    # Score sentences based on word importance
    sentence_scores = {}
    for sent in doc.sents:
        for word in sent:
            if word.text in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word.text]
    # Extract the top sentences for the summary
    summary_length = max(1, int(len(list(doc.sents)) * 0.3))  # Use 30% of sentences
    summarized_sentences = nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    # Combine the selected sentences into the final summary
    final_summary = " ".join([sent.text for sent in summarized_sentences])
    # Display the summary in the result box
    result_box.delete("1.0", tk.END)
    result_box.insert(tk.END, final_summary)
# Initialize the main window
root = tk.Tk()
root.title("Text Summarizer")
# Input box for text
input_box = scrolledtext.ScrolledText(root, width=50, height=10, wrap=tk.WORD)
input_box.pack(pady=10)
# Button to trigger summarization
summarize_button = tk.Button(root, text="Summarize", command=summarize)
summarize_button.pack(pady=10)
# Result box for output
result_box = scrolledtext.ScrolledText(root, width=50, height=5, wrap=tk.WORD)
result_box.pack(pady=10)

#Runs application
root.mainloop()

'apt-get' is not recognized as an internal or external command,
operable program or batch file.
