In [1]:
import nltk
from nltk import tokenize, stem, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK resources used by the steps below.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Read text from TXT file.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, and this document contains non-ASCII characters.
with open('/content/paperThailand.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text into sentences
sentences = tokenize.sent_tokenize(text)
print('Total sentences in the given text:', len(sentences))
print(sentences)

# Tokenize the text into words
words = tokenize.word_tokenize(text)
print('Total words in the given text:', len(words))
print(words)

# Stemming using PorterStemmer
stemmer = stem.PorterStemmer()
stem_words = [stemmer.stem(word) for word in words]
print('After stemming:', stem_words)

# Lemmatization using WordNetLemmatizer.
# lemmatize() is called without a part-of-speech argument, so the
# lemmatizer's default part of speech is applied to every token.
lemmatizer = WordNetLemmatizer()
lemma_words = [lemmatizer.lemmatize(word) for word in words]
print('After lemmatization:', lemma_words)

# Remove stopwords (case-insensitive match against the English list)
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print('After removing stopwords:', filtered_words)

# Part-of-speech tagging on the original (unfiltered) tokens
pos_tags = pos_tag(words)
print('POS tags:', pos_tags)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Total sentences in the given text: 179
['Extractive Text Summarization for Thai Travel News\nBased on Keyword Scored in Thai Language\nSarunya Nathonghor\n\nDuangdao Wichadakul\n\nDepartment of Computer Engineering\nChulalongkorn University\nBangkok, Thailand\n\nDepartment of Computer Engineering\nChulalongkorn University\nBangkok, Thailand\n\nSarunya.N@Student.Chula.ac.th\nABSTRACT\n\nIn recent years, people are seeking for a solution to improve text\nsummarization for Thai language.', 'Although several solutions such\nas PageRank, Graph Rank, Latent Semantic Analysis (LSA)\nmodels, etc., have been proposed, research results in Thai text\nsummarization were restricted due to limited corpus in Thai\nlanguage with complex grammar.', 'This paper applied a text\nsummarization system for Thai travel news based on keyword\nscored in Thai language by extracting the most relevant sentences\nfrom the original document.', 'We compared LSA and Non-negative\nMatrix Factorization (NMF) to find the

In [2]:
import nltk
from nltk import tokenize, stem, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import heapq

# Download the NLTK resources used by the steps below.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Read text from TXT file.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, and this document contains non-ASCII characters.
with open('/content/paperThailand.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Preprocess the text: sentence/word tokens plus the shared normalisation
# objects that preprocess_word() reads as globals.
sentences = tokenize.sent_tokenize(text)
words = tokenize.word_tokenize(text)
stemmer = stem.PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_word(word):
    """Normalise a single token for word-frequency counting.

    The token is lower-cased. Tokens that contain no letter or digit
    (pure punctuation/symbols) and English stopwords are dropped; every
    other token is lemmatized and then Porter-stemmed.

    Args:
        word: A token string, e.g. one produced by ``tokenize.word_tokenize``.

    Returns:
        The normalised token, or ``None`` when the token should be ignored.
    """
    word = word.lower()
    # The word tokenizer emits punctuation as separate tokens. They are not
    # words, and counting them lets ',' or '.' dominate the frequency table
    # that the sentence scores are normalised against.
    if not any(ch.isalnum() for ch in word):
        return None
    if word in stop_words:
        return None
    return stemmer.stem(lemmatizer.lemmatize(word))

def preprocess_sentence(sentence):
    """Tokenize a sentence and return its normalised tokens.

    Each token from ``tokenize.word_tokenize`` is passed through
    ``preprocess_word``; tokens for which that returns a falsy value
    (``None`` or an empty string) are left out.

    Args:
        sentence: The raw sentence text.

    Returns:
        A list of normalised token strings, in their original order.
    """
    candidates = (preprocess_word(token) for token in tokenize.word_tokenize(sentence))
    return [term for term in candidates if term]

preprocessed_sentences = [preprocess_sentence(sentence) for sentence in sentences]

# Extractive summarization by normalised word frequency.
# Note: this is not TF-IDF. There is no inverse-document-frequency term;
# each word is weighted by its count divided by the highest count.
SUMMARY_SENTENCE_COUNT = 3  # number of sentences to keep in the summary

word_frequencies = {}
for sentence in preprocessed_sentences:
    for word in sentence:
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

# default=1 keeps max() from raising ValueError when no word survives
# preprocessing (e.g. an empty input file).
maximum_frequency = max(word_frequencies.values(), default=1)
word_frequencies = {
    word: count / maximum_frequency for word, count in word_frequencies.items()
}

# A sentence's score is the sum of the weights of its words. Every word in
# preprocessed_sentences was counted above, so no membership check is needed.
sentence_scores = {}
for i, sentence in enumerate(preprocessed_sentences):
    score = 0
    for word in sentence:
        score += word_frequencies[word]
    sentence_scores[i] = score

# Indices of the highest-scoring sentences, best first.
summary_sentences = heapq.nlargest(
    SUMMARY_SENTENCE_COUNT, sentence_scores, key=sentence_scores.get
)
summary = [sentences[i] for i in summary_sentences]

# Print the summary
print("Summary:")
for sentence in summary:
    print(sentence)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Summary:
Thai text summarization efficiency of 5 models
Figure 2 shows the Thai text summarization efficiency of 5 models:
(1) NMF with GRS, (2) NMF with K-means, (3) SVD with sentence
score by Gong, Y. et al., (4) SVD with K-means, and (5) SVD with
sentence score by Murray, G. et al.
From
their experiments, the summarization of the industrial news got
60% precision, 44% recall, and 50.9% F-measure, the general news
got the 51.8% precision, 38.5% recall, and 43.1% F-measure while
the fashion magazines got 53.0% precision, 33.0% recall, and
40.4% F-measure.
S1

3.6

A∙B
=
n
n
||A|| ||B||
�∑i=1
A2i �∑i=1
Bi2

(6)

Mr.Yontas
ak

1

0

0

0

0

0

0

0

0

Supason

1

0

0

0

0

0

0

0

0

Tourism
Authority
of Thailand

1

0

0

0

0

0

0

0

0

…

…

…

…

…

…

…

…

…

…

Table 3 demonstrates an example of a matrix 𝐴𝐴, constructed from
word count by sentence of a Thai travel news.


In [4]:
import nltk
from nltk import tokenize, stem, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import heapq

# Download the NLTK resources used by the steps below.
for resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Read text from TXT file.
# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, and this document contains non-ASCII characters.
with open('/content/paperThailand.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Preprocess the text: sentence/word tokens plus the shared normalisation
# objects that preprocess_word() reads as globals.
sentences = tokenize.sent_tokenize(text)
words = tokenize.word_tokenize(text)
stemmer = stem.PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_word(word):
    """Normalise a single token for word-frequency counting.

    The token is lower-cased. Tokens that contain no letter or digit
    (pure punctuation/symbols) and English stopwords are dropped; every
    other token is lemmatized and then Porter-stemmed.

    Args:
        word: A token string, e.g. one produced by ``tokenize.word_tokenize``.

    Returns:
        The normalised token, or ``None`` when the token should be ignored.
    """
    word = word.lower()
    # The word tokenizer emits punctuation as separate tokens. They are not
    # words, and counting them lets ',' or '.' dominate the frequency table
    # that the sentence scores are normalised against.
    if not any(ch.isalnum() for ch in word):
        return None
    if word in stop_words:
        return None
    return stemmer.stem(lemmatizer.lemmatize(word))

def preprocess_sentence(sentence):
    """Tokenize a sentence and return its normalised tokens.

    Each token from ``tokenize.word_tokenize`` is passed through
    ``preprocess_word``; tokens for which that returns a falsy value
    (``None`` or an empty string) are left out.

    Args:
        sentence: The raw sentence text.

    Returns:
        A list of normalised token strings, in their original order.
    """
    candidates = (preprocess_word(token) for token in tokenize.word_tokenize(sentence))
    return [term for term in candidates if term]

preprocessed_sentences = [preprocess_sentence(sentence) for sentence in sentences]

# Extractive summarization by normalised word frequency.
# Note: this is not TF-IDF. There is no inverse-document-frequency term;
# each word is weighted by its count divided by the highest count.
SUMMARY_SENTENCE_COUNT = 3  # number of sentences to keep in the summary

word_frequencies = {}
for sentence in preprocessed_sentences:
    for word in sentence:
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

# default=1 keeps max() from raising ValueError when no word survives
# preprocessing (e.g. an empty input file).
maximum_frequency = max(word_frequencies.values(), default=1)
word_frequencies = {
    word: count / maximum_frequency for word, count in word_frequencies.items()
}

# A sentence's score is the sum of the weights of its words. Every word in
# preprocessed_sentences was counted above, so no membership check is needed.
sentence_scores = {}
for i, sentence in enumerate(preprocessed_sentences):
    score = 0
    for word in sentence:
        score += word_frequencies[word]
    sentence_scores[i] = score

# Indices of the highest-scoring sentences, best first.
summary_sentences = heapq.nlargest(
    SUMMARY_SENTENCE_COUNT, sentence_scores, key=sentence_scores.get
)
summary = [sentences[i] for i in summary_sentences]

# Save preprocessed text and summary to new files (relative to the working
# directory).
preprocessed_filename = 'preprocessed_text.txt'
summary_filename = 'summary.txt'

# Save preprocessed text: one sentence per line, tokens separated by spaces.
# UTF-8 is requested explicitly so non-ASCII tokens cannot raise
# UnicodeEncodeError under a non-UTF-8 locale.
with open(preprocessed_filename, 'w', encoding='utf-8') as file:
    for sentence in preprocessed_sentences:
        file.write(' '.join(sentence))
        file.write('\n')

# Save summary: the selected original sentences, one after another.
with open(summary_filename, 'w', encoding='utf-8') as file:
    for sentence in summary:
        file.write(sentence)
        file.write('\n')

print("Preprocessed text saved to:", preprocessed_filename)
print("Summary saved to:", summary_filename)


Preprocessed text saved to: preprocessed_text.txt
Summary saved to: summary.txt


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
