In [36]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

In [5]:
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1 RuxitSynthetic/1.0 v1355348020 t3296535826494656701 smf=0'
}

home_link = 'https://pcoo.gov.ph/presidential-speech/'

response = requests.get(home_link, headers=headers)
response.status_code

200

# Corpus Making

# 1. Speeches Information Extraction

>Traversing body through container

In [6]:
bf = BeautifulSoup(response.text, 'lxml')
body = bf.find('div', class_='col-xs-12 col-sm-12 nopadding-left')

In [47]:
speeches = {}

num_page = 21

#we're going to use this header cause the default python one is blocked by the site
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1 RuxitSynthetic/1.0 v1355348020 t3296535826494656701 smf=0'
}

#traverse through diffent pages in the site from 1 to num_page
for i in range(num_page):
    #structure of the url(just changing the page number)
    lnk = f'https://pcoo.gov.ph/presidential-speech/page/{i + 1}/' 
    
    #we're requesting the link
    response = requests.get(lnk, headers=headers)
    
    #check if the response is 'ok'
    if response.status_code == 200:
        
        #parsing the html using lxml parser
        html = BeautifulSoup(response.text, 'lxml')
        
        #retrieving just the container of the speeches
        body = html.find('div', class_='col-xs-12 col-sm-12 nopadding-left')
        
        #traversing through speeches within container and extracting title, date, and url of the speech
        for speech in body.findAll('div', class_='focus-feature row'):
            title = speech.h3.a.text.strip()
            date = speech.small.time.text.strip()
            url = speech.h3.a['href']
            speeches[date] = {'title': title, 'url': url}

## 1.1 Dataframe Transformation

In [4]:
df = pd.read_csv('Duterte\'s Speeches.csv', index_col='Unnamed: 0')

In [48]:
df = pd.DataFrame(speeches).T

In [6]:
df

Unnamed: 0,title,url
"May 26, 2020",Excerpts from Speech of President Rodrigo Roa ...,https://pcoo.gov.ph/presidential-speech/excerp...
"May 25, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...
"May 22, 2020",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...
"May 19, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...
"May 12, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...
...,...,...
"April 5, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...
"April 3, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...
"April 2, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...
"March 26, 2018",Speech Of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...


# 2. Read/Download all speeches
>In this stage we are going to extract the transcript of the speech in each page. However some page do not have the transcipt but instead provide a pdf file that has it. So we're going to crawl each page and scrape those that has the transcript or download the pdf for those hasn't.  In next stage we're going to process those pdfs.

>We are also going to extract the event and locaion of the speech to add in our dataset.

In [159]:
def extract_transcript(html, transcript):
    """For extracting the transcript in the page"""
    lt_name = ['PRESIDENT RODRIGO ROA DUTERTE:\xa0', 'PRESIDENT DUTERTE:\xa0', 
               'PRESIDENT DUTERTE:', 'PRESIDENT RODRIGO ROA DUTERTE:']
    #the whole transcript is contained in div with release-content class
    body = html.find('div', class_='release-content')
            
    #traverse through body and extract the content of p(except line break)
    lt_p = [p for p in body if p != '\n']

    #discard paragraphs that are not dialogue of President Duterte
    isDuterte = True
    dut_text_lt = []
    for p in lt_p:
        if p.strong:
            if any(nm in p.strong.text for nm in lt_name):
                isDuterte = True
            else:
                isDuterte= False

        if isDuterte:
            dut_text_lt.append(p.text)

    #concatenate every word into a single string
    single_str = ', '.join(dut_text_lt)
    #append the transcript as single string
    transcript.append(single_str)
    

def download_pdf(html, transcript, date, headers, cur_lnk):
    """For downloading the pdf file from the page"""
    
    #the download link is contained in ul tag with class download-media
    ul = html.find('ul', class_='download-media')
    
    #within ul there are two list either the pdf or the audio
    #sometimes there is no audio thus try-catch
    li = ul.findAll('li')
    try:
        pdf_lnk = li[1].a['href']
    except:
        pdf_lnk = li[0].a['href']
        
    try:
        #we're going to name the pdf file using date of speech
        with open(f'duterte_pdf\\{date}.pdf', 'wb') as fl:

            #sometimes instead of pdf link they will give you an extension url of a page that contains the pdf url
            #so check if the last 4 characters of the link ends with .pdf
            if pdf_lnk[-4:] =='.pdf':
                response2 = requests.get(pdf_lnk, headers=headers)
                fl.write(response2.content)
            else:
                #we have to parse again this another page and location the pdf link and download it
                concat_lnk = cur_lnk + pdf_lnk.lower()
                response2 = requests.get(concat_lnk, headers=headers)
                html_temp = BeautifulSoup(response2.text, 'lxml')

                #in this page the link is in a tag of a p tag with class attachment
                pdf_lnk2 = html_temp.find('p', class_='attachment').a['href']

                response3 = requests.get(pdf_lnk2, headers=headers)
                fl.write(response3.content)
    except:
        print(date)
        
    #for now we're going to represent those transcripts in pdf as nan
    transcript.append(np.nan)

In [111]:
event = []
location = []
transcript = []

#traverse through the speeches
for i in range(df.shape[0]):
    
    #passing url in dataframe and headers as parameters
    response = requests.get(df.iloc[i].url, headers=headers)
    date = df.iloc[i].name
    url = df.iloc[i].url
    if response.status_code == 200:
        html = BeautifulSoup(response.text, 'lxml')
        
        #scraping the event and location(some do not have this info, thus try-except)
        speech_info = html.findAll('span', class_='speech-specifics')
        try:
            event.append(speech_info[0].text)
        except:
            event.append(np.nan)
        try: 
            location.append(speech_info[1].text)
        except:
            location.append(np.nan)
            
        #the trancript is stored within div tag with class release-content
        rel_cont = html.find('div', class_='release-content')
        
        #check if it contains p tag(meaning the page contains the transcript)
        if rel_cont.p:
            extract_transcript(html, transcript)
        else:
            download_pdf(html, transcript, date, headers, url)

May 11, 2019
May 8, 2019


>To identify if a page contain the transcript or just the pdf, we have to find a div tag with 'release-content' class. Then we're going to check if this contain p tag, if it does then this page has the transcript otherwise just the pdf. If it contains the transcript then we're going to extract only President Duerte's part (some contains dialogue of other people). Now if it just contain the pdf then we're going to download the pdf and further process it in next section. In some cases the pdf url(href in a tag) is just an extension for the current url, that will redirect you to a page that contains the pdf url. We will further process this data in following sections.

>Now that we have extracted/downloaded the transcript let's added it into out dataframe.

## 2.1 Add to DataFrame

In [164]:
df['event'] = event
df['location'] = location
df['transcript'] = transcript

df

Unnamed: 0,title,url,event,location,transcript
"May 26, 2020",Excerpts from Speech of President Rodrigo Roa ...,https://pcoo.gov.ph/presidential-speech/excerp...,Meeting with Philippine Army (PA) and Philippi...,"Malago Clubhouse, Malacañang Park, Manila",
"May 25, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...,On Coronavirus Disease 2019 (COVID-19),Malago Clubhouse in Malacañang,PRESIDENT RODRIGO ROA DUTERTE: I remember dist...
"May 22, 2020",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...,Commencement Exercsies of the Philippine Milit...,"Malago Clubhouse, Malacañang Park, Manila","Kindly sit down. [May upuan sila? Okay.], Defe..."
"May 19, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...,On Coronavirus Disease 2019 (COVID-19),Malago Clubhouse in Malacañang,PRESIDENT RODRIGO ROA DUTERTE: Good evening my...
"May 12, 2020",Talk to the People of President Rodrigo Roa Du...,https://pcoo.gov.ph/presidential-speech/talk-t...,On Coronavirus Disease 2019 (COVID-19),Malago Clubhouse in Malacañang,"PRESIDENT DUTERTE: Sir, one question. Itong op..."
...,...,...,...,...,...
"April 5, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...,"Awarding of Outstanding Farmers, Fisherfolks a...","Rizal Hall, Malacañan Palace",
"April 3, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...,Inauguration of Lisap Bridge Project,"Bongabong, Oriental Mindoro",
"April 2, 2018",Speech of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...,Distribution of Certificate of Land Ownership ...,"Provincial Capitol Gymnasium, Isulan, Sultan K...",
"March 26, 2018",Speech Of President Rodrigo Roa Duterte during...,https://pcoo.gov.ph/presidential-speech/speech...,Handover of loose firearms and presentation of...,"Multi-purpose Gym, Capitol, Patikul Sulu",


>As expected there are nan values in our transcript and event. The next thing we have to do now is to extract the transcript from the pdf file. In this time of writing the most recent speech in the website is on March 26, 2020 and the oldest one is on March 23, 2018.

# 3. Process transcipt in PDF file
>To process the PDF file we'll use the pdfminer and python-docx module. Since the content of PDF does not have layout and just pure text, I have to find pattern to extract President Duterte's dialogue . It starts with the header and title and in most PDF it follows a particular structure. The main body of the speech is right after a particular pattern which is the Location and Date of the speech  enclosed by brackets like [Malago Clubhouse, Malacañang Park, Manila | 26 May 2020] then followed by the transcript(there are 3 PDFs that doesn't have this pattern). So I used regular expression to find the first occurence of this pattern and extract everything after it.

>In my obervation there are 2 types of transcript. The first one is when it only contains President Duterte's dialogue second is when there are dialogue from other people(basically a conversation).  So I have extract only President Duterte's dialogue. For this type of transcript I have to convert the PDF file to word document for me to determine which are President Duterte's dialogue. 

>This section is divided into two:
 1. Extract transcripts only(discard heading, date, and so on...)
 2. Extract Duterte's Dialogue only(transcripts that contains dialogue of multiple people)

In [None]:
import pdfminer 
from pdfminer.high_level import extract_text
import os

>Extract text from PDFs

In [4]:
pdf_txt_dict = {}
for fl_name in os.listdir('duterte_pdf'):
    try:
        pdf_txt_dict[fl_name] = extract_text(f'duterte_pdf\\{fl_name}')
    except:
        print(fl_name)


April 25, 2018.pdf


>In here I found out that the PDF I downloaded for April 25, 2018 fail to load and so I checked the url for this PDF. It does not work also. So we have to exclude this from our dataset.

## 3.1 Extract Transcript only

In [3]:
import re
regex = re.compile('\\[(.*?)\]')
count= 0
pdf_txt_dict_trimmed = {}
for key in pdf_txt_dict.keys():
    try:
        span = re.search(regex, pdf_txt_dict[key].replace('\n', '')).span()
        pdf_txt_dict_trimmed[key] = pdf_txt_dict[key].replace('\n', '')[span[1]:]
    except:
        pdf_txt_dict_trimmed[key] = pdf_txt_dict[key].replace('\n', '')
        print(key)

June 22, 2019.pdf
May 21, 2019.pdf
September 4, 2018.pdf


>So these are the PDFs that doesn't have those pattern. I checked them manually and found out that the PDF in June 22, 2019 does not contain dialogue of President Duterte but just the summary of what orccur on that event so I have to remove this. So far we have to remove April 25, 2018 and June 22, 2019. As for the other two they don't have the pattern so I have to extract them manually in the following section(I copied ot to a .txt file so you can just read it from there).

## 3.2 Extract only President Duterte's Dialogue
>Only a few PDF contains dialogue of another person other than President Duterte, so I have to identify these PDFs. Unlike an html where we can identify if there is a conversation by searching for strong tag and then a name. In PDF there is no such thing, so for us to know if there is a conversation is if it contains one of these strings 'PRESIDENT RODRIGO DUTERTE:', 'PRESIDENT DUTERTE:', 'PRESIDENT RODRIGO ROA DUTERTE:'(include the colon) after trimming it.

>We're going to traverse through the transcripts and identify if it contains one of these strings.

In [2]:
lt_names= ['PRESIDENT RODRIGO DUTERTE:', 'PRESIDENT DUTERTE:', 'PRESIDENT RODRIGO ROA DUTERTE:']

lt_docx = []
for key in pdf_txt_dict_trimmed.keys():
    if any(name in pdf_txt_dict_trimmed[key] for name in lt):
        lt_docx.append(key)     
print(lt_docx)

['April 13, 2018.pdf', 'April 13, 2020.pdf', 'April 29, 2018.pdf', 'April 8, 2020.pdf', 'April 9, 2018.pdf', 'August 20, 2019.pdf', 'August 30, 2019.pdf', 'December 10, 2019.pdf', 'December 26, 2018.pdf', 'December 4, 2019.pdf', 'December 5, 2019.pdf', 'July 28, 2019.pdf', 'June 2, 2018.pdf', 'June 8, 2019.pdf', 'March 7, 2019.pdf', 'May 21, 2019.pdf', 'May 28, 2020.pdf', 'November 1, 2018.pdf', 'November 15, 2018.pdf', 'November 20, 2018.pdf', 'November 25, 2019.pdf', 'November 6, 2018.pdf', 'October 1, 2019.pdf', 'October 18, 2019.pdf', 'October 2, 2019.pdf', 'October 3, 2019.pdf', 'October 31, 2019.pdf', 'October 6, 2019.pdf', 'September 11, 2018.pdf', 'September 13, 2018.pdf', 'September 16, 2018.pdf', 'September 17, 2018.pdf', 'September 18, 2018.pdf', 'September 2, 2018.pdf']


>So these are the PDFs that has dialogue other than Presiden Duterte. I have to convert these manually to docx file and store them to separate directory(you can just load the from duterte_docx file).

>Now to extract Duterte's Dialogue from these docx.

In [None]:
import docx

transcript_docx = {}

#traverse through different pdf
for fl in lt_docx:
    
    doc = docx.Document(f'duterte_docx\\{fl[:-4]}.docx')
    encountered = False
    isDuterte = True
    trans = []
    
    #tranverse through the context of transcript
    for paragraph in doc.paragraphs:
        if encountered:
            for run in paragraph.runs:
                # if the text is bold we ask if it is one of the lt_names
                if run.bold:
                    if any(nm in paragraph.text for nm in lt_names):
                        isDuterte = True
                    else:
                        isDuterte = False
            
            #we'll only include President Duterte's dialogue in the list
            if isDuterte:
                if paragraph.text:
                    trans.append(paragraph.text)


        #if we encounter the pattern we'll start extracting
        if re.search(regex, paragraph.text):
            encountered = True
            
    
    single_str = ' '.join(trans)
    
    for nm in lt_names:
        single_str = single_str.replace(nm, ' ')
        
    transcript_docx[fl] = single_str 

>Now that we've extracted the dialogue of President Duterte from those docx. Let's delete May 21, 2019 and September 4, 2018 in both dictionaries because they are those that do not contain the pattern. In the dictionary they are either empty or contains dialogue from another person. We'll add the actual dialogue of President Duterte in this date later.

>We'll merge the two dictionaries.

In [5]:
del pdf_txt_dict_trimmed['May 21, 2019.pdf']
del transcript_docx['May 21, 2019.pdf']
del pdf_txt_dict_trimmed['September 4, 2018.pdf']
transcript_from_pdf = pdf_txt_dict_trimmed
transcript_from_pdf.update(pdf_txt_dict_trimmed)
list(transcript_from_pdf.keys())[:10]

['April 10, 2018.pdf', 'April 11, 2019.pdf', 'April 12, 2018.pdf', 'April 13, 2018.pdf', 'April 13, 2019.pdf', 'April 13, 2020.pdf', 'April 15, 2018.pdf', 'April 16, 2019.pdf', 'April 16, 2020.pdf', 'April 17, 2018.pdf']


>So these are the first 10 keys in our dictionary let's remove the extension .pdf and store it to a variable *all_transcript_from_pdf*.

In [6]:
all_transcript_from_pdf = {}
for key, val in zip(transcript_from_pdf.keys(), transcript_from_pdf.values()):
    all_transcript_from_pdf[key[:-4]] = val

['April 10, 2018', 'April 11, 2019', 'April 12, 2018', 'April 13, 2018', 'April 13, 2019', 'April 13, 2020', 'April 15, 2018', 'April 16, 2019', 'April 16, 2020', 'April 17, 2018']


>Now let's add the actualy content of May 21, 2019 and September 4, 2018.

In [None]:
with open('duterte_pdf\\May 21, 2019.txt', 'r') as fl:
    my_21_19 = fl.read()

with open('duterte_pdf\\September 4, 2018.txt', 'r') as fl:
    sp_04_18 = fl.read()

all_transcript_from_pdf['May 21, 2019'] = my_21_19.replace('\n', ' ')
all_transcript_from_pdf['September 4, 2018'] = sp_04_18.replace('\n', ' ')

>Let's add the dictionary to our data frame/corpus

In [None]:
from_pdf_ser = pd.Series(all_transcript_from_pdf)
df = pd.merge(df.reset_index(), from_pdf_ser.to_frame().T.reset_index(), how='outer').set_index('index')

>So far we have one transcript that does not work which is the speech on April 25, 2018(the pdf url does not work), and we also have a datat that is not a transcript but the summary which is on June 22, 2019. So we'll remove this from our dataframe.

In [None]:
df = df.drop(index=['June 22, 2019', 'April 25, 2018'])

>Now let's save the the corpus to a file

In [None]:
df.to_csv('Duterte_Speech.csv')