In [20]:
import requests
from bs4 import BeautifulSoup
import pickle

In [2]:
def url_to_transcript(url, container_class="paragraph", timeout=30):
    '''Fetch a web page and return the text of its paragraphs.

    The page at ``url`` is downloaded, parsed with the lxml parser, and the
    text of every ``<p>`` tag nested inside the first element whose CSS class
    is ``container_class`` is collected.

    Parameters
    ----------
    url : str
        Address of the page to download.
    container_class : str, optional
        CSS class of the element that wraps the paragraphs
        (defaults to "paragraph", the value this function always used).
    timeout : float, optional
        Seconds to wait for the server before giving up; without a timeout
        ``requests.get`` can block indefinitely.

    Returns
    -------
    list of str
        One string per ``<p>`` tag, in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no element with class ``container_class``.
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of scraping its HTML.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find(class_=container_class)
    if container is None:
        raise ValueError(
            "No element with class %r found at %s" % (container_class, url)
        )
    text = [p.text for p in container.find_all('p')]
    # Echo the URL so the notebook shows which page was processed.
    print(url)
    return text

In [3]:
# Address of the page to scrape (an article under focustaiwan.tw/politics).
url = 'https://focustaiwan.tw/politics/202001110014'
# Download the page and keep the result of url_to_transcript for the next steps.
raw_data = url_to_transcript(url)

https://focustaiwan.tw/politics/202001110014


In [4]:
# Cache the scraped data on disk with pickle.
# NOTE(review): the target is an absolute, machine-specific path and the
# content is binary pickle data despite the ".txt" extension.
speech_path = "/Users/nouhourouteonsa/Desktop/Tsai_Ying_Wen/" + "speech" + ".txt"
with open(speech_path, mode="wb") as out_file:
    pickle.dump(raw_data, out_file)
    

In [9]:
# Reload the pickled speech from disk and store it under the speaker's name.
data = {}
speech_path = "/Users/nouhourouteonsa/Desktop/Tsai_Ying_Wen/" + "speech" + ".txt"
with open(speech_path, mode="rb") as in_file:
    tsai_speech = pickle.load(in_file)
data["President_Tsai"] = tsai_speech

In [58]:
# Quick sanity check of what was loaded.
# NOTE: a notebook cell only renders the value of its last expression, so the
# result of data.keys() is evaluated and then discarded.
data.keys()
# Show the first two entries stored for "President_Tsai".
data["President_Tsai"][:2]

['Friends from the domestic and international media, thank you for your patience.',
 'To begin, I would like to thank everyone who voted today. Regardless of how you voted, by taking part in this election you have put democratic values into practice. With each presidential election, Taiwan is showing the world how much we cherish our free, democratic way of life, and how much we cherish our nation: the Republic of China (Taiwan).']

In [59]:
# Iterating a dict yields its keys, so this shows the first key in `data`.
next(iter(data))


'President_Tsai'

In [60]:
def combine_text(text):
    '''Join an iterable of strings into a single string.

    The pieces are concatenated in order with one space between
    consecutive items; an empty iterable yields an empty string.
    '''
    separator = ' '
    return separator.join(text)



In [63]:
# For every speaker, merge the paragraphs into one string and wrap it in a
# single-element list.
data_combined = {
    speaker: [combine_text(paragraphs)]
    for speaker, paragraphs in data.items()
}
#data_combined

In [64]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Let long text cells show up to 150 characters when displayed.
pd.set_option('max_colwidth', 150)

# Build the frame with one row per dictionary key and a single 'speech'
# column, then order the rows by their index labels.
data_df = (
    pd.DataFrame.from_dict(data_combined, orient='index', columns=['speech'])
    .sort_index()
)
data_df

Unnamed: 0,speech
President_Tsai,"Friends from the domestic and international media, thank you for your patience. To begin, I would like to thank everyone who voted today. Regardle..."


In [65]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Apply the first round of text cleaning.

    Steps, in order:
      1. lowercase the text;
      2. delete anything enclosed in square brackets (non-greedy, same line);
      3. delete every ASCII punctuation character (``string.punctuation``);
      4. delete every word that contains at least one digit.

    Characters are deleted, not replaced, so runs of spaces may remain.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[ \w \d) from being treated as
    # invalid string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def round1(x):
    '''Run the first cleaning round on x by delegating to clean_text_round1.'''
    return clean_text_round1(x)

In [66]:
# Run the first cleaning round over the 'speech' column and keep the result
# as a one-column DataFrame.
clean_data = data_df['speech'].apply(round1).to_frame()
clean_data['speech']

President_Tsai    friends from the domestic and international media thank you for your patience to begin i would like to thank everyone who voted today regardless o...
Name: speech, dtype: object

In [67]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Apply the second round of text cleaning.

    Removes the typographic characters that ``string.punctuation`` does not
    cover (curly single/double quotes and the ellipsis character) and turns
    each newline into a space.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # Replace newlines with a space rather than deleting them: deleting would
    # glue the words on either side of the line break into one bogus token.
    text = re.sub('\n', ' ', text)
    return text

def round2(x):
    '''Run the second cleaning round on x by delegating to clean_text_round2.'''
    return clean_text_round2(x)

In [68]:
# Run the second cleaning round over the 'speech' column; the result replaces
# the previous clean_data frame.
clean_data = clean_data['speech'].apply(round2).to_frame()
clean_data

Unnamed: 0,speech
President_Tsai,friends from the domestic and international media thank you for your patience to begin i would like to thank everyone who voted today regardless o...


In [69]:
# Save data_df to a pickle file; the relative path resolves against the
# current working directory.
data_df.to_pickle("corpus_Tsai.pkl")

In [70]:
from sklearn.feature_extraction.text import CountVectorizer

# Count word occurrences per document, ignoring English stop words.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(clean_data.speech)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per document, one column per vocabulary term.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = clean_data.index
data_dtm

Unnamed: 0,abandon,able,accept,achievements,added,administration,affairs,ahead,answer,areas,...,want,way,willing,winning,witness,words,work,worked,world,years
President_Tsai,1,1,1,1,1,4,1,1,1,1,...,6,2,2,1,1,1,6,1,1,6


In [71]:
# Save the document-term matrix to a pickle file (relative to the current
# working directory) so later steps can reload it.
data_dtm.to_pickle("dtm_tsai.pkl")

In [72]:
# Also pickle the cleaned data (before it was put in document-term matrix
# format) and the CountVectorizer object.
clean_data.to_pickle('clean_data_tsai.pkl')
# Open the file in a with-block so the handle is flushed and closed as soon as
# the dump finishes, instead of being left open.
with open("cv_tsai.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)