# Workshop: Extractive Text Summarization - viral news online

## 0. Preface & Load Packages

In [1]:
# !pip install beautifulsoup4
# !pip install requests
import requests
from bs4 import BeautifulSoup

# !pip install nltk
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

## 1. Word Frequency Table

In [2]:
def _create_frequency_table(text_string) -> dict:
    """Count how often each stemmed, non-stopword token occurs in the text.

    The text is split with NLTK's ``word_tokenize``. English stopwords are
    dropped and every remaining token is reduced to its Porter stem.

    Args:
        text_string: The full document text.

    Returns:
        A dict mapping each stem to its number of occurrences.
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    freqTable = dict()
    for word in word_tokenize(text_string):
        # Filter on the lower-cased surface form *before* stemming: the
        # stopword list holds unstemmed words, so stems such as "thi"
        # (from "this") would never match it and would slip through.
        if word.lower() in stopWords:
            continue
        stem = ps.stem(word)
        freqTable[stem] = freqTable.get(stem, 0) + 1

    return freqTable

## 2. Tokenize Sentences

In [3]:
# sent_tokenize(text_string)

## 3. Score the Sentences - Term Frequency

In [4]:
def _score_sentences(sentences, freqTable) -> dict:
    """Score each sentence by the frequencies of the table entries it contains.

    A table entry counts towards a sentence when it occurs as a substring of
    the lower-cased sentence. The summed frequencies are floor-divided by the
    sentence's token count so long sentences are not favoured.

    Args:
        sentences: Iterable of sentence strings.
        freqTable: Dict mapping a term to its frequency in the document.

    Returns:
        A dict keyed by the first 10 characters of each sentence, holding
        that sentence's integer score. Sentences that contain no table entry
        (or no tokens) get no key. If several sentences share the same
        10-character prefix, the last one scored wins.
    """
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = len(word_tokenize(sentence))
        if word_count_in_sentence == 0:
            # Nothing to normalise by; avoids a ZeroDivisionError.
            continue

        lowered = sentence.lower()  # hoisted: invariant across table entries
        matched = [freq for term, freq in freqTable.items() if term in lowered]
        if not matched:
            # No entry to normalise; the lookup here used to raise KeyError.
            continue

        # Each sentence is scored from scratch so that a prefix collision
        # cannot leak an earlier sentence's score into this one.
        sentenceValue[sentence[:10]] = sum(matched) // word_count_in_sentence

    return sentenceValue

## 4. Create a Threshold using Average Score

In [5]:
def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = int(sumValues / len(sentenceValue))

    return average

## 5. Generate Summary

In [6]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

## 6. Comparing the Original Text versus Summary

In [7]:
# you may use any list, just check that webpage can be read by 'requests'
url_list = ['https://www.theonlinecitizen.com/2021/01/15/transgender-student-shares-distressing-experience-of-moe-blocking-her-hrt-treatment/',
'https://www.theonlinecitizen.com/2021/01/27/sporean-youth-detained-under-isa-over-alleged-plans-to-attack-muslims-at-two-mosques-netizens-call-for-rehabilitation-instead-of-retributive-punishment/',
'https://www.theonlinecitizen.com/2021/02/09/netizens-suggest-students-should-be-afforded-choice-to-opt-out-from-high-element-activities-in-school-camps/']

# Multiplier applied to the average sentence score: raise it for a shorter
# summary, lower it for a longer one.
threshold_factor = 1.5

for url in url_list:
    # 0 read each url; the timeout keeps one unresponsive site from hanging
    # the whole run, and a failed request skips only that url
    try:
        news = requests.get(url, timeout=30)
        news.raise_for_status()
    except requests.RequestException as err:
        print('Skipping ' + url + ': ' + str(err) + '\n')
        continue
    soup = BeautifulSoup(news.text, 'lxml')

    # 0 get title (soup.title is None when the page has no <title> tag)
    title = soup.title
    title_text = title.get_text() if title is not None else ''

    # 0 get text: one line per <p> element
    text = ''.join(p.get_text() + '\n' for p in soup.find_all('p'))

    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    if sentence_scores:
        # 4 Find the threshold
        threshold = _find_average_score(sentence_scores)

        # 5 Important Algorithm: Generate the summary
        summary = _generate_summary(sentences, sentence_scores, threshold_factor * threshold)
    else:
        # No scored sentences (e.g. a page without paragraph text): there is
        # nothing to average, so report an empty summary.
        threshold = 0
        summary = ''

    print('<<Title>>')
    print(title_text + '\n')
    print('<<Original text>>')
    print(text + '\n')
    print('<<Summary>>')
    print(summary + '\n')

<<Title>>
Transgender student shares distressing experience of MOE allegedly blocking her HRT treatment - The Online Citizen Asia

<<Original text>>
Using a throwaway account on reddit/r/SGExams, one transgender student recalled her difficult experience in school, specifically with the Ministry of Education (MOE) allegedly interfering her hormone replacement therapy (HRT) treatment that was prescribed by her doctor after she was diagnosed with gender dysphoria.
The student recalled her experience in an all-boys-primary school, describing it as “the worse period of my life”.
She wrote, “I couldn’t fit in and constantly got bullied because I was ‘too soft’ and ‘needed to man up to the bullies’.
Things started looking up when she started attending a co-ed secondary school where she made more friends and started better understand her identity.
It was then that she was taken to the gender clinic at the Institute of Mental Health (IMH). There, she was diagnosed with gender dysphoria.
When sh

#### Comments: Text summarization is a cool technique for quick reading. Do compare the summaries with the original posts:

   [Transgender student shares distressing experience of MOE allegedly blocking her HRT treatment](https://www.theonlinecitizen.com/2021/01/15/transgender-student-shares-distressing-experience-of-moe-blocking-her-hrt-treatment/)
    
   [S’porean youth detained under ISA over alleged plans to attack Muslims at two mosques; netizens call for rehabilitation instead of retributive punishment](https://www.theonlinecitizen.com/2021/01/27/sporean-youth-detained-under-isa-over-alleged-plans-to-attack-muslims-at-two-mosques-netizens-call-for-rehabilitation-instead-of-retributive-punishment/)

   [Netizens suggest students should be afforded choice to opt out from high-element activities in school camps](https://www.theonlinecitizen.com/2021/02/09/netizens-suggest-students-should-be-afforded-choice-to-opt-out-from-high-element-activities-in-school-camps/)
   
#### In part 6, the user should experiment with different multipliers of the threshold (1.5 was used in this example).

#### Special thanks to Akash Panchal for his post.
   [NLP — Text summarization in 5 steps using NLTK: WordFrequency Algorithm](https://becominghuman.ai/text-summarization-in-5-steps-using-nltk-65b21e352b6)  