In [2]:
import re
import nltk
import math
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords


nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def tokenize_sentence(text):
    return sent_tokenize(text)

def tokenize_word(text):
    words = [word for word in word_tokenize(text) if not is_stop_word(word)]
    return words

def lemmatize(word):
    lemmatizer=WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def is_stop_word(word):
    stop_words=set(stopwords.words('english'))
    return word.lower() in stop_words

In [4]:

def calculate_tf(word,sentence):
    words=tokenize_word(sentence)
    return words.count(word)/len(words)

def calculate_idf(word,sentences):
    no=sum(1 for sentence in sentences if word in tokenize_word(sentence))
    return math.log(len(sentences)/(no+1))

def calculate_tf_idf(sentence,sentences):
    words=set(tokenize_word(sentence))
    tf_idf_scores=0
    for word in words:
        tf=calculate_tf(word,sentence)
        idf=calculate_idf(word,sentences)
        tf_idf_scores+=tf*idf
    return tf_idf_scores

In [5]:
def find_max_sentence(scores):
    max_score=float('-inf')
    max_sentence=None
    for sentence,score in scores.items():
        if(score>max_score):
            max_score=score
            max_sentence=sentence
    return max_sentence

def n_largest(scores,n):
    sentences=[]
    for i in range(n):
        max_sentence=find_max_sentence(scores)
        sentences.append(max_sentence)
        del scores[max_sentence]
    return sentences

def summarize_text(text,length):
    sentences=tokenize_sentence(text)
    sentence_scores={sentence:calculate_tf_idf(sentence,sentences) for sentence in sentences}
    selected_sentences=n_largest(sentence_scores,length)
    summary=' '.join(selected_sentences)
    return summary

In [6]:
text = "Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. statistical and, most recently, neural network-based) machine learning approaches. The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.Natural language processing has its roots in the 1940s.[1] Already in 1940, Alan Turing published an article titled Computing Machinery and Intelligence which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence"
summary = summarize_text(text,5)
print(summary)

[1] Already in 1940, Alan Turing published an article titled Computing Machinery and Intelligence which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.Natural language processing has its roots in the 1940s. Natural language processing (NLP) is an interdisciplinary subfield of computer science and linguistics.It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic (i.e. The goal is a computer capable of understanding the contents of documents, including the contextual nuances of the language within them.
