In [1]:
!pip install sumy
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer

# Fetch the NLTK data packages this notebook needs before any tokenising:
#   - 'stopwords': the corpus read via stopwords.words("english") below.
#   - 'punkt': tokenizer data (the download log shows it unpacked under
#     tokenizers/); presumably what word_tokenize / sent_tokenize load.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

def calculate_word_frequencies(text):
    """Count how often each meaningful word occurs in ``text``.

    The text is split with ``word_tokenize`` and every token is lower-cased
    before counting, so ``"Cat"`` and ``"cat"`` share one entry.

    Args:
        text: Raw document text.

    Returns:
        dict mapping each lower-cased token to its occurrence count, in
        first-seen order. English stopwords and tokens that contain no
        letter or digit (pure punctuation such as ``"."`` or ``","``) are
        left out.
    """
    stopwords_set = set(stopwords.words("english"))
    freqTable = {}

    for token in word_tokenize(text):
        word = token.lower()
        if word in stopwords_set:
            continue
        # Punctuation tokens are not stopwords, so without this check "." and
        # "," would be counted as words and distort every score derived from
        # this table.
        if not any(ch.isalnum() for ch in word):
            continue
        freqTable[word] = freqTable.get(word, 0) + 1

    return freqTable



Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels fo

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
def calculate_sentence_scores(text, freqTable):
    """Score every sentence of ``text`` by the frequencies of the words in it.

    A sentence's score is the sum of ``freqTable[word]`` over each table word
    that appears in the sentence as a whole token (case-insensitive). Each
    distinct word contributes once per sentence, however often it repeats
    inside that sentence.

    Args:
        text: Raw document text; split into sentences with ``sent_tokenize``.
        freqTable: dict mapping lower-cased words to their frequency.

    Returns:
        dict mapping sentence text to its score. Sentences containing none of
        the table's words get no entry. If the exact same sentence text occurs
        more than once, its scores accumulate under the one key.
    """
    sentenceValue = {}

    for sentence in sent_tokenize(text):
        # Compare against whole tokens rather than the raw string: a substring
        # test would let "art" match inside "party". Tokenising once per
        # sentence also avoids re-lowercasing it for every table entry.
        sentence_tokens = {token.lower() for token in word_tokenize(sentence)}
        matched = [freq for word, freq in freqTable.items() if word in sentence_tokens]
        if matched:
            sentenceValue[sentence] = sentenceValue.get(sentence, 0) + sum(matched)

    return sentenceValue

In [3]:
def calculate_average_score(sentenceValue):
    """Return the mean sentence score, truncated to an integer.

    Args:
        sentenceValue: dict mapping sentence text to its numeric score.

    Returns:
        ``int(sum of scores / number of sentences)``, or ``0`` when the
        mapping is empty (no sentence was scored), instead of raising
        ``ZeroDivisionError``.
    """
    if not sentenceValue:
        return 0
    return int(sum(sentenceValue.values()) / len(sentenceValue))

def generate_summary(sentences, sentenceValue, average, threshold=1.2):
    """Build the summary from the sentences that score well above average.

    Args:
        sentences: Sentences of the document, in the order they should
            appear in the summary.
        sentenceValue: dict mapping sentence text to its score; sentences
            missing from it are never selected.
        average: Average sentence score used as the baseline.
        threshold: Multiplier applied to ``average``; a sentence is kept only
            when its score is strictly greater than ``threshold * average``.
            Defaults to 1.2.

    Returns:
        The selected sentences joined by single spaces, or ``''`` when no
        sentence qualifies.
    """
    cutoff = threshold * average
    selected = [
        sentence
        for sentence in sentences
        if sentence in sentenceValue and sentenceValue[sentence] > cutoff
    ]
    # Join with a space: plain concatenation runs sentences together
    # ("...end.Next sentence...").
    return ' '.join(selected)


In [4]:
def solve(text):
    """Summarise ``text`` with the frequency-based extractive pipeline.

    Steps: build the word-frequency table, score each sentence against it,
    compute the average score, then keep the sentences that
    ``generate_summary`` selects relative to that average.

    Args:
        text: Raw document text.

    Returns:
        The summary string produced by ``generate_summary``, or ``''`` when
        no sentence received a score (for example, empty input).
    """
    freqTable = calculate_word_frequencies(text)
    sentenceValue = calculate_sentence_scores(text, freqTable)

    # With nothing scored there is nothing to select, and averaging an empty
    # table would divide by zero.
    if not sentenceValue:
        return ''

    average = calculate_average_score(sentenceValue)
    sentences = sent_tokenize(text)
    return generate_summary(sentences, sentenceValue, average)

# Read the cleaned document. The encoding is stated explicitly so the result
# does not depend on the platform's default text encoding.
with open('/content/CleanedText.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input handle is closed; only the read needs the file.
result = solve(text)
print(result)

# Persist the frequency-based summary next to the input.
with open('/content/solve_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)





In [5]:
def sumy_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's ``LexRankSummarizer``.

    Args:
        text: Raw document text.
        sentence_count: Number of sentences requested from the summarizer.
            Defaults to 2.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``.

    Returns:
        The sentences returned by the summarizer, each converted with
        ``str`` and joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned document. The encoding is stated explicitly so the result
# does not depend on the platform's default text encoding.
with open('/content/CleanedText.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input handle is closed; only the read needs the file.
result = sumy_method(text)
print(result)

# Persist the LexRank summary next to the input.
with open('/content/sumy_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


Extractive Text Summarization for Thai Travel News Based on Keyword Scored in Thai Language Sarunya Nathonghor Duangdao Wichadakul Department of Computer Engineering Chulalongkorn University Bangkok Thailand Department of Computer Engineering Chulalongkorn University Bangkok Thailand Sarunya N Student Chula ac th ABSTRACT In recent years people are seeking for a solution to improve text summarization for Thai language Although several solutions such as PageRank Graph Rank Latent Semantic Analysis LSA models etc have been proposed research results in Thai text summarization were restricted due to limited corpus in Thai language with complex grammar This paper applied a text summarization system for Thai travel news based on keyword scored in Thai language by extracting the most relevant sentences from the original document We compared LSA and Non negative Matrix Factorization NMF to find the algorithm that is suitable with Thai travel news The suitable compression rates for Generic Sent

In [6]:
def luhn_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's ``LuhnSummarizer``.

    Args:
        text: Raw document text.
        sentence_count: Number of sentences requested from the summarizer.
            Defaults to 2.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``.

    Returns:
        The sentences returned by the summarizer, each converted with
        ``str`` and joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_luhn = LuhnSummarizer()
    summary = summarizer_luhn(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned document. The encoding is stated explicitly so the result
# does not depend on the platform's default text encoding.
with open('/content/CleanedText.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input handle is closed; only the read needs the file.
result = luhn_method(text)
print(result)

# Persist the Luhn summary next to the input.
with open('/content/luhn_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


Extractive Text Summarization for Thai Travel News Based on Keyword Scored in Thai Language Sarunya Nathonghor Duangdao Wichadakul Department of Computer Engineering Chulalongkorn University Bangkok Thailand Department of Computer Engineering Chulalongkorn University Bangkok Thailand Sarunya N Student Chula ac th ABSTRACT In recent years people are seeking for a solution to improve text summarization for Thai language Although several solutions such as PageRank Graph Rank Latent Semantic Analysis LSA models etc have been proposed research results in Thai text summarization were restricted due to limited corpus in Thai language with complex grammar This paper applied a text summarization system for Thai travel news based on keyword scored in Thai language by extracting the most relevant sentences from the original document We compared LSA and Non negative Matrix Factorization NMF to find the algorithm that is suitable with Thai travel news The suitable compression rates for Generic Sent

In [7]:
def lsa_method(text, sentence_count=2, language="english"):
    """Summarise ``text`` with sumy's ``LsaSummarizer``.

    Args:
        text: Raw document text.
        sentence_count: Number of sentences requested from the summarizer.
            Defaults to 2.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``.

    Returns:
        The sentences returned by the summarizer, each converted with
        ``str`` and joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_lsa = LsaSummarizer()
    summary = summarizer_lsa(parser.document, sentence_count)
    return ' '.join(str(sentence) for sentence in summary)

# Read the cleaned document. The encoding is stated explicitly so the result
# does not depend on the platform's default text encoding.
with open('/content/CleanedText.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Summarise after the input handle is closed; only the read needs the file.
result = lsa_method(text)
print(result)

# Persist the LSA summary next to the input.
with open('/content/lsa_method_result.txt', 'w', encoding='utf-8') as file:
    file.write(result)


Extractive Text Summarization for Thai Travel News Based on Keyword Scored in Thai Language Sarunya Nathonghor Duangdao Wichadakul Department of Computer Engineering Chulalongkorn University Bangkok Thailand Department of Computer Engineering Chulalongkorn University Bangkok Thailand Sarunya N Student Chula ac th ABSTRACT In recent years people are seeking for a solution to improve text summarization for Thai language Although several solutions such as PageRank Graph Rank Latent Semantic Analysis LSA models etc have been proposed research results in Thai text summarization were restricted due to limited corpus in Thai language with complex grammar This paper applied a text summarization system for Thai travel news based on keyword scored in Thai language by extracting the most relevant sentences from the original document We compared LSA and Non negative Matrix Factorization NMF to find the algorithm that is suitable with Thai travel news The suitable compression rates for Generic Sent