In [30]:
import re
import heapq
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [31]:

stop_words = set(stopwords.words('english'))
def lsaSummarize(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Create a list of stopwords
    stop_words = stopwords.words('english')

    # Create a list of cleaned sentences
    clean_sentences = [sent.lower() for sent in sentences]

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    tfidf_matrix = vectorizer.fit_transform(clean_sentences)

    svd = TruncatedSVD(n_components=5, random_state=0)
    svd_matrix = svd.fit_transform(tfidf_matrix)

    # Calculate the sentence scores based on the SVD matrix
    scores = np.sum(svd_matrix, axis=1)

    # Sort the sentences by score and select the top 3
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    summary = ' '.join([ranked_sentences[i][1] for i in range(3)])
    
    return summary

In [32]:
def summarizer(text):
    article_text = text.replace("\n", " ")
    article_text = re.sub(r"\[[0-9]*\]", " ", article_text)
    article_text = re.sub(r"\s+", " ", article_text)

    # Remove special characters and digits
    formatted_article_text = re.sub("[^a-zA-Z]", " ", article_text)
    formatted_article_text = re.sub(r"\s+", " ", formatted_article_text)

    sentence_list = nltk.sent_tokenize(article_text)
    stopwords = nltk.corpus.stopwords.words("english")
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / maximum_frequency

    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(" ")) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = " ".join(summary_sentences)
    return summary

In [33]:
def summarizer_3(text):
    from transformers import T5Tokenizer, T5ForConditionalGeneration

    # Load the tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('t5-base')

    # Encode the input text
    input_ids = tokenizer.encode(text, return_tensors='pt')

    # Generate the summary
    summary_ids = model.generate(input_ids, max_length=50, num_beams=4, length_penalty=2.0, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Print the summary
    return summary


In [34]:
def compare_summaries(text):
    summary_1 = lsaSummarize(text)
    summary_2 = summarizer(text)
    summary_3 = summarizer_3(text)
    summaries = [summary_1, summary_2, summary_3]
    scores = {}
    for i in range(3):
        for j in range(i+1,3):
            # Tokenize the summaries
            summary1_tokens = word_tokenize(summaries[i])
            summary2_tokens = word_tokenize(summaries[j])

            # Calculate the Jaccard similarity score
            intersection = set(summary1_tokens).intersection(summary2_tokens)
            union = set(summary1_tokens).union(summary2_tokens)
            jaccard_score = len(intersection) / len(union)

            # Add the score to the dictionary
            scores[(i,j)] = jaccard_score

    # Find the maximum score
    max_score = max(scores.values())

    # Find the summaries with the maximum score
    best_summaries = []
    for key, value in scores.items():
        if value == max_score:
            best_summaries.append(key[0])
            best_summaries.append(key[1])

# Return the best summary
    return summaries[best_summaries[0]]

In [35]:
text = "The sun had set over the horizon, casting a warm orange glow across the sky. As the night fell, the stars slowly emerged, twinkling like diamonds in the sky. A gentle breeze blew through the trees, rustling the leaves and carrying the sweet scent of flowers. In the distance, a lone wolf howled, adding to the eerie beauty of the night. A stream flowed nearby, its gentle gurgling filling the air with a soothing melody. The ground was covered in a soft layer of dew, making it glisten in the moonlight. A fire crackled nearby, providing warmth and comfort to those gathered around it. It was a peaceful night, a night to be cherished and remembered for years to come."

# Call the function and print the result
summarize = compare_summaries(text)
print(summarize)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


A gentle breeze blew through the trees, rustling the leaves and carrying the sweet scent of flowers. The sun had set over the horizon, casting a warm orange glow across the sky. The ground was covered in a soft layer of dew, making it glisten in the moonlight.


In [40]:
import requests
def translate(summary):    
    translated_text = ""
    length = len(summary)
    parts = [summary[i:i+500] for i in range(0, length, 500)]
    for part in parts:
        url = "https://api.mymemory.translated.net/get"
        params = {
            "q": part,
            "langpair": "en|ne",
        }
        response = requests.get(url, params=params)
        data = response.json()
        translated_text += data["responseData"]["translatedText"]

    return translated_text

In [37]:
text = "The sun had set over the horizon, casting a warm orange glow across the sky. As the night fell, the stars slowly emerged, twinkling like diamonds in the sky. A gentle breeze blew through the trees, rustling the leaves and carrying the sweet scent of flowers. In the distance, a lone wolf howled, adding to the eerie beauty of the night. A stream flowed nearby, its gentle gurgling filling the air with a soothing melody. The ground was covered in a soft layer of dew, making it glisten in the moonlight. A fire crackled nearby, providing warmth and comfort to those gathered around it. It was a peaceful night, a night to be cherished and remembered for years to come."

# Call the function and print the result
summarize = compare_summaries(text)
print(summarize)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


A gentle breeze blew through the trees, rustling the leaves and carrying the sweet scent of flowers. The sun had set over the horizon, casting a warm orange glow across the sky. The ground was covered in a soft layer of dew, making it glisten in the moonlight.


In [41]:
translated = translate(summarize)
print(translated)

मन्द हावा रूखहरूबाट बगिरहेको थियो, पातहरू र फूलहरूको मीठो सुगन्ध बोकेको थियो। सूर्य क्षितिजमा अस्ताएको थियो, आकाशमा न्यानो सुन्तलाको चमक फ्याँकिएको थियो। जमिन शीतको नरम तहले ढाकिएको थियो, चन्द्रमामा चम्किरहेको थियो।
