In [1]:
!pip install sumy
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer

# Fetch the NLTK resources used below: the stop-word corpus and the "punkt"
# tokenizer models needed by word_tokenize / sent_tokenize.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

def calculate_word_frequencies(text):
    """Count how often each non-stop-word token occurs in ``text``.

    The text is split with NLTK's ``word_tokenize``; each token is lowercased
    and skipped if it appears in NLTK's English stop-word list.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict mapping each remaining lowercased token to its occurrence count.
    """
    english_stopwords = set(stopwords.words("english"))
    counts = {}

    for token in (raw.lower() for raw in word_tokenize(text)):
        if token not in english_stopwords:
            counts[token] = counts.get(token, 0) + 1

    return counts





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def calculate_sentence_scores(text, freqTable):
    """Score each sentence of ``text`` by the frequencies of the words it contains.

    The text is split into sentences with ``sent_tokenize``. For every sentence,
    the frequency of each distinct word that appears in it (and is a key of
    ``freqTable``) is added to the sentence's score.

    Matching is done on whole, lowercased tokens rather than by substring
    search, so a short word such as "art" no longer scores inside "start".

    Args:
        text: The raw text to split into sentences.
        freqTable: Dict mapping lowercased words to their frequency.

    Returns:
        A dict mapping sentence text to its accumulated score. Sentences that
        contain none of the words in ``freqTable`` are omitted. Identical
        sentence strings share one entry and their scores are summed.
    """
    sentenceValue = {}

    for sentence in sent_tokenize(text):
        # Distinct tokens only: each word contributes its frequency once per
        # sentence, however many times it occurs in that sentence.
        tokens = {token.lower() for token in word_tokenize(sentence)}
        matched = [freqTable[token] for token in tokens if token in freqTable]
        if matched:
            sentenceValue[sentence] = sentenceValue.get(sentence, 0) + sum(matched)

    return sentenceValue

In [3]:
def calculate_average_score(sentenceValue):
    """Return the mean sentence score, truncated to an integer.

    Args:
        sentenceValue: Dict mapping sentences to their numeric scores.

    Returns:
        ``int(sum(scores) / count)``, or 0 when ``sentenceValue`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    if not sentenceValue:
        return 0
    return int(sum(sentenceValue.values()) / len(sentenceValue))

def generate_summary(sentences, sentenceValue, average):
    """Build a summary from the sentences that score well above average.

    A sentence is kept when it has an entry in ``sentenceValue`` and its score
    is strictly greater than ``1.2 * average``. Kept sentences stay in their
    original order and are joined with a single space, so consecutive
    sentences do not run together.

    Args:
        sentences: Sentences in document order.
        sentenceValue: Dict mapping sentence text to its score.
        average: The average sentence score used as the baseline.

    Returns:
        The summary string; empty when no sentence passes the threshold.
    """
    threshold = 1.2 * average
    selected = [
        sentence
        for sentence in sentences
        if sentence in sentenceValue and sentenceValue[sentence] > threshold
    ]
    return ' '.join(selected)


In [4]:
def solve(text):
    """Produce a frequency-based extractive summary of ``text``.

    Steps: build the word-frequency table, score every sentence with it,
    compute the average score, then keep the sentences that
    ``generate_summary`` selects against that average.

    Args:
        text: The raw text to summarise.

    Returns:
        The summary string. An empty string is returned when no sentence
        receives a score (for example, empty input), because there is no
        average to compare against.
    """
    freqTable = calculate_word_frequencies(text)
    sentenceValue = calculate_sentence_scores(text, freqTable)
    if not sentenceValue:
        # Nothing was scored; averaging would divide by zero.
        return ''

    average = calculate_average_score(sentenceValue)
    sentences = sent_tokenize(text)
    return generate_summary(sentences, sentenceValue, average)

# Read the cleaned input text. The encoding is explicit because the text
# contains non-ASCII symbols and the platform default may not be UTF-8.
with open('/content/CleanedTextWords.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input file is closed, then show the result.
result = solve(text)
print(result)

# Persist the frequency-based summary.
with open('/content/solve_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)





In [5]:
def sumy_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's LexRank summarizer.

    Args:
        text: The plain text to summarise.
        sentence_count: Number of sentences requested from the summarizer
            (default 2, the previously hard-coded value).
        language: Language name passed to sumy's ``Tokenizer``
            (default "english").

    Returns:
        The selected sentences converted to strings and joined with a space.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned input text. The encoding is explicit because the text
# contains non-ASCII symbols and the platform default may not be UTF-8.
with open('/content/CleanedTextWords.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input file is closed, then show the result.
result = sumy_method(text)
print(result)

# Persist the LexRank summary.
with open('/content/sumy_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


Chula ac th sentences to be included in the summary Text summarization can be divided into 2 approaches The first approach is the extractive summarization which relies on a method for extracting words and searching for keywords from the original document The second approach is the abstractive summarization which analyzes words by linguistic principles with transcription or interpretation from the original document This approach implies more effective and accurate summary than the extractive methods However with the lack of Thai corpus we chose to apply an extractive summarization method for Thai text summarization This research focused on the sentence extraction function based on keyword score calculation then selecting important sentences based on the Generic Sentence Relevance score 𝑅𝑅𝑟𝑟𝑟𝑟𝑟𝑟 and both terms are non negative as shown in Eq 2 and Eq 3 4 𝑖𝑖 1 𝑤𝑤𝑤𝑤𝑤𝑤𝑤𝑤ℎ𝑡𝑡 𝐻𝐻𝑖𝑖 𝑛𝑛𝑞𝑞 1 𝐻𝐻𝑖𝑖𝑖𝑖 𝑟𝑟 𝑝𝑝 1 𝑛𝑛𝑞𝑞 1 𝐻𝐻𝑝𝑝𝑝𝑝 5 The 𝑤𝑤𝑤𝑤𝑤𝑤𝑤𝑤ℎ𝑡𝑡 𝐻𝐻𝑖𝑖 is the relative relevance of the ith semantic feature 

In [6]:
def luhn_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's Luhn summarizer.

    Args:
        text: The plain text to summarise.
        sentence_count: Number of sentences requested from the summarizer
            (default 2, the previously hard-coded value).
        language: Language name passed to sumy's ``Tokenizer``
            (default "english").

    Returns:
        The selected sentences converted to strings and joined with a space.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_luhn = LuhnSummarizer()
    summary = summarizer_luhn(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned input text. The encoding is explicit because the text
# contains non-ASCII symbols and the platform default may not be UTF-8.
with open('/content/CleanedTextWords.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input file is closed, then show the result.
result = luhn_method(text)
print(result)

# Persist the Luhn summary.
with open('/content/luhn_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


Daily newspaper has abundant of data that users do not have enough time for reading them It is difficult to identify the relevant information to satisfy the information needed by users Automatic summarization can reduce the problem of information overloading and it has been proposed previously in English and other languages However there were only a few research results in Thai text summarization due to the lack of corpus in Thai language and the complicated grammar Text Summarization 1 is a technique for summarizing the content of the documents It consists of three steps 1 create an intermediate representation of the input text 2 calculate score for the sentences based on the concepts and 3 choose important Permission to make digital or hard copies of all or part of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page Co

In [7]:
def lsa_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's LSA summarizer.

    Args:
        text: The plain text to summarise.
        sentence_count: Number of sentences requested from the summarizer
            (default 2, the previously hard-coded value).
        language: Language name passed to sumy's ``Tokenizer``
            (default "english").

    Returns:
        The selected sentences converted to strings and joined with a space.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_lsa = LsaSummarizer()
    summary = summarizer_lsa(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned input text. The encoding is explicit because the text
# contains non-ASCII symbols and the platform default may not be UTF-8.
with open('/content/CleanedTextWords.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input file is closed, then show the result.
result = lsa_method(text)
print(result)

# Persist the LSA summary.
with open('/content/lsa_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


score hence we used Cutkum for this step Table 1 Comparison of Thai word tokenization programs Tools can be found by solving the optimization problem as follows where𝑊𝑊𝑗𝑗𝑗𝑗 0 𝐻𝐻𝑖𝑖𝑖𝑖 0 𝑚𝑚 𝑛𝑛 𝑟𝑟 𝑗𝑗 1 𝑖𝑖 1 𝑙𝑙 1 2 𝑚𝑚𝑚𝑚𝑚𝑚
