In [1]:
!pip install sumy
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer

nltk.download('stopwords')
nltk.download('punkt')

def calculate_word_frequencies(text):
    stopwords_set = set(stopwords.words("english"))
    words = word_tokenize(text)
    freqTable = {}

    for word in words:
        word = word.lower()
        if word in stopwords_set:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def calculate_sentence_scores(text, freqTable):
    sentences = sent_tokenize(text)
    sentenceValue = {}

    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
                if sentence in sentenceValue:
                    sentenceValue[sentence] += freq
                else:
                    sentenceValue[sentence] = freq

    return sentenceValue

In [3]:
def calculate_average_score(sentenceValue):
    sumValues = sum(sentenceValue.values())
    average = int(sumValues / len(sentenceValue))
    return average

def generate_summary(sentences, sentenceValue, average):
    summary = ''
    for sentence in sentences:
        if sentence in sentenceValue and sentenceValue[sentence] > (1.2 * average):
            summary += sentence
    return summary


In [4]:
def solve(text):
    freqTable = calculate_word_frequencies(text)
    sentenceValue = calculate_sentence_scores(text, freqTable)
    average = calculate_average_score(sentenceValue)
    sentences = sent_tokenize(text)
    summary = generate_summary(sentences, sentenceValue, average)

    return summary

with open('/content/paperThailand.txt', 'r') as file:
    text = file.read()
    result = solve(text)
    print(result)

with open('/content/solve_method_result.txt', 'w') as file:
    file.write(result)


Extractive Text Summarization for Thai Travel News
Based on Keyword Scored in Thai Language
Sarunya Nathonghor

Duangdao Wichadakul

Department of Computer Engineering
Chulalongkorn University
Bangkok, Thailand

Department of Computer Engineering
Chulalongkorn University
Bangkok, Thailand

Sarunya.N@Student.Chula.ac.th
ABSTRACT

In recent years, people are seeking for a solution to improve text
summarization for Thai language.Although several solutions such
as PageRank, Graph Rank, Latent Semantic Analysis (LSA)
models, etc., have been proposed, research results in Thai text
summarization were restricted due to limited corpus in Thai
language with complex grammar.This paper applied a text
summarization system for Thai travel news based on keyword
scored in Thai language by extracting the most relevant sentences
from the original document.From these experiments, we concluded that keyword
scored calculation by LSA with sentence selection by GRS is the
best algorithm for summarizing Thai 

In [5]:
def sumy_method(text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 2)
    summary_sentences = [str(sentence) for sentence in summary]
    final_sentence = ' '.join(summary_sentences)
    return final_sentence

with open('/content/paperThailand.txt', 'r') as file:
    text = file.read()
    result = sumy_method(text)
    print(result)

with open('/content/sumy_method_result.txt', 'w') as file:
    file.write(result)


In this paper, we applied LSA and NMF on the Thai Travel News dataset for calculating the semantic weights, which represented the relationship between sentences and words in order to select the representative sentences for summarization. Thai text summarization efficiency of 5 models Figure 2 shows the Thai text summarization efficiency of 5 models: (1) NMF with GRS, (2) NMF with K-means, (3) SVD with sentence score by Gong, Y. et al., (4) SVD with K-means, and (5) SVD with sentence score by Murray, G. et al. applied to 400 Thai travel news, divided into 5 sets of 80 news each, with the varied compression rates of 20%, 30% and 40%.


In [6]:
def luhn_method(text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_luhn = LuhnSummarizer()
    summary = summarizer_luhn(parser.document, 2)
    summary_sentences = [str(sentence) for sentence in summary]
    final_sentence = ' '.join(summary_sentences)
    return final_sentence

with open('/content/paperThailand.txt', 'r') as file:
    text = file.read()
    result = luhn_method(text)
    print(result)

with open('/content/luhn_method_result.txt', 'w') as file:
    file.write(result)


The original matrix A can be separated into three matrices, where U is the m x r (words x extracted concept) matrix, V is the n x r (sentences x extracted concepts) matrix, and Σ is the r x r diagonal matrix, which can be reconstructed to find the original matrix A. Thai text summarization efficiency of 5 models Figure 2 shows the Thai text summarization efficiency of 5 models: (1) NMF with GRS, (2) NMF with K-means, (3) SVD with sentence score by Gong, Y. et al., (4) SVD with K-means, and (5) SVD with sentence score by Murray, G. et al. applied to 400 Thai travel news, divided into 5 sets of 80 news each, with the varied compression rates of 20%, 30% and 40%.


In [7]:
def lsa_method(text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_lsa = LsaSummarizer()
    summary = summarizer_lsa(parser.document, 2)
    summary_sentences = [str(sentence) for sentence in summary]
    final_sentence = ' '.join(summary_sentences)
    return final_sentence

with open('/content/paperThailand.txt', 'r') as file:
    text = file.read()
    result = lsa_method(text)
    print(result)

with open('/content/lsa_method_result.txt', 'w') as file:
    file.write(result)


Therefore, we collected 400 Thai travel news from Thairath and Manager online newspapers to be used as datasets for our experiments. From this experiment, the best model based on keyword score for Thai travel news summarization was SVD with sentence selection by Murray, G. et al.
