## Analyse all house of commons speeches by current MPs

In [1]:
import spacy
import pandas as pd
from bs4 import BeautifulSoup
from collections import Counter
import requests

# Store TheyWorkForYou API key in separate config file
from config import TWFY_API_KEY

In [2]:
# Get MP -> theyworkforyou_id map
# Probably need to modify after general election to point to previous batch of MPs
mps = pd.read_csv("https://www.theyworkforyou.com/mps/?f=csv")

In [3]:
def get_mp_speeches(mp_id):
    """Get speeches of a particular MP based on TheyWorkForYou id and convert data into long format pandas data frame.
    Each row represents one speech at a particular date and time"""
    all_speeches = pd.DataFrame()
    rows = [1]
    page_no=1
    while len(rows) > 0:
        t = requests.get("https://www.theyworkforyou.com/api/getDebates?key={api_key}&\
                     type=commons&person={person}&results_per_page=1000&num={num}&page={page}&output=js".format(api_key=TWFY_API_KEY,
                                                                                                               person=mp_id,
                                                                                                               num=1000,
                                                                                                               page=page_no))
        rows = t.json()["rows"]
        speeches = []
        # Loop over each row
        for row in rows:
            speeches.append({
                    'speech_id':row["gid"],
                    'speech_url':row["listurl"],
                    'mp_name':row["speaker"]["name"],
                    'mp_constituency':row["speaker"]["constituency"],
                    'mp_party':row["speaker"]["party"],
                    'mp_id':row["person_id"],
                    'date':pd.to_datetime(row["hdate"], format="%Y-%m-%d"),
                    'time':row["htime"],
                    'section_id':row["section_id"],
                    'subsection_id':row["subsection_id"],
                    'debate_title':row["parent"]["body"],
                    'body':BeautifulSoup(row["body"], "html5lib").get_text()
                })
        speeches = pd.DataFrame(speeches)

        # Concatenate onto complete speeches dataframe
        all_speeches = pd.concat([all_speeches, speeches], ignore_index=True)
        # Increment page_counter
        page_no += 1
    
    print("Got speeches for MP {0}".format(mp_id))
    return all_speeches

In [4]:
## Download all MP speeches if this is set to True
if False:
    # Parallelise downloading of MP speeches
    from multiprocessing import Pool

    # Number of threads to use to fetch
    NUM_THREADS = 8
    # Make list of mp ids
    list_of_mp_ids = list(mps["Person ID"])

    # Create pool of threads
    pool = Pool(NUM_THREADS)
    # Use pool.map to download speeches mp by mp
    results = pool.map(get_mp_speeches, list_of_mp_ids)
    pool.close()
    pool.join()

    # Concatenate all mps into one dataframe
    all_mp_speeches = pd.concat(results)
    
    # Write data to a file to save it
    all_mp_speeches.to_hdf("mp_speeches.h5", "speeches")
else:
    ## Read in mp speeches that have been previously downloaded
    all_mp_speeches = pd.read_hdf("mp_speeches.h5", "speeches")

In [5]:
all_mp_speeches["time_"] = pd.to_datetime(all_mp_speeches.time, format="%H:%M:%S", errors="coerce")

In [6]:
all_mp_speeches["time_hour"] = all_mp_speeches.time.str.split(":", expand=True).get(0)

In [7]:
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

### Run data through NLP

In [8]:
import pyLDAvis
import pyLDAvis.gensim

In [9]:
# Load english language model from spacy
nlp = spacy.load("en")


Interpreting naive datetime as local 2017-05-04 15:28:21.353920. Please add timezone info to timestamps.



In [34]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import itertools
import codecs
import os


Interpreting naive datetime as local 2017-05-04 15:41:21.487053. Please add timezone info to timestamps.



In [42]:
# Directory to store Phrase models
intermediate_directory = os.path.join('.', 'intermediate')


Interpreting naive datetime as local 2017-05-04 15:47:09.640687. Please add timezone info to timestamps.



In [11]:
all_mp_speeches_sample = all_mp_speeches.sample(frac=0.1)


Interpreting naive datetime as local 2017-05-04 15:28:29.354134. Please add timezone info to timestamps.



In [12]:
del all_mp_speeches


Interpreting naive datetime as local 2017-05-04 15:28:29.490591. Please add timezone info to timestamps.



In [39]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])


Interpreting naive datetime as local 2017-05-04 15:44:47.180140. Please add timezone info to timestamps.



In [72]:
#small_sample=all_mp_speeches_sample.sample(frac=0.1)


Interpreting naive datetime as local 2017-05-04 16:01:33.927268. Please add timezone info to timestamps.



In [134]:
# Save speeches to txt file first
speeches_filepath = os.path.join(intermediate_directory, "speeches.txt")
if True:
    with codecs.open(speeches_filepath, "w", encoding="utf_8") as f:
        for speech in all_mp_speeches_sample["body"]:
            f.write(speech + "\n")


Interpreting naive datetime as local 2017-05-04 16:25:32.293472. Please add timezone info to timestamps.



In [135]:
unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')


Interpreting naive datetime as local 2017-05-04 16:25:32.295095. Please add timezone info to timestamps.



In [136]:
%%time
## Lemmatize all speeches and store them in text file
# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(speeches_filepath):
            f.write(sentence + '\n')


Interpreting naive datetime as local 2017-05-04 16:25:32.296659. Please add timezone info to timestamps.



CPU times: user 14min 21s, sys: 11.2 s, total: 14min 32s
Wall time: 6min 43s


In [137]:
unigram_sentences = LineSentence(unigram_sentences_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.298083. Please add timezone info to timestamps.



In [138]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')


Interpreting naive datetime as local 2017-05-04 16:25:32.299430. Please add timezone info to timestamps.



In [139]:
%%time

## Convert unigrams to bigrams
# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if True:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.301933. Please add timezone info to timestamps.



CPU times: user 29.7 s, sys: 208 ms, total: 30 s
Wall time: 31.7 s


In [140]:
bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')


Interpreting naive datetime as local 2017-05-04 16:25:32.302600. Please add timezone info to timestamps.



In [141]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f: 
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')


Interpreting naive datetime as local 2017-05-04 16:25:32.303915. Please add timezone info to timestamps.


For a faster implementation, use the gensim.models.phrases.Phraser class



CPU times: user 49.1 s, sys: 128 ms, total: 49.2 s
Wall time: 49.3 s


In [142]:
bigram_sentences = LineSentence(bigram_sentences_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.305201. Please add timezone info to timestamps.



In [143]:
trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')


Interpreting naive datetime as local 2017-05-04 16:25:32.306532. Please add timezone info to timestamps.



In [144]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if True:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.307914. Please add timezone info to timestamps.



CPU times: user 22.6 s, sys: 232 ms, total: 22.9 s
Wall time: 22.7 s


In [145]:
trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')


Interpreting naive datetime as local 2017-05-04 16:25:32.310267. Please add timezone info to timestamps.



In [146]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')


Interpreting naive datetime as local 2017-05-04 16:25:32.311820. Please add timezone info to timestamps.


For a faster implementation, use the gensim.models.phrases.Phraser class



CPU times: user 57 s, sys: 308 ms, total: 57.3 s
Wall time: 57.3 s


In [147]:
trigram_sentences = LineSentence(trigram_sentences_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.313263. Please add timezone info to timestamps.



In [148]:
trigram_speeches_filepath = os.path.join(intermediate_directory, 'trigram_transformed_speeches_all.txt')


Interpreting naive datetime as local 2017-05-04 16:25:32.314820. Please add timezone info to timestamps.



In [149]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if True:
    with codecs.open(trigram_speeches_filepath, 'w', encoding='utf_8') as f:  
        for parsed_speech in nlp.pipe(line_review(speeches_filepath),
                                      batch_size=10000, n_threads=4):
            # lemmatize the text, removing punctuation and whitespace
            unigram_speech = [token.lemma_ for token in parsed_speech
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_speech = bigram_model[unigram_speech]
            trigram_speech = trigram_model[bigram_speech]
            
            # remove any remaining stopwords
            trigram_speech = [term for term in trigram_speech
                              if term not in spacy.en.language_data.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_speech = u' '.join(trigram_speech)
            f.write(trigram_speech + '\n')


Interpreting naive datetime as local 2017-05-04 16:25:32.316440. Please add timezone info to timestamps.


For a faster implementation, use the gensim.models.phrases.Phraser class



CPU times: user 16min 10s, sys: 10.9 s, total: 16min 21s
Wall time: 9min 11s


In [150]:
print(u'Original:')

for review in itertools.islice(line_review(speeches_filepath), 30, 33):
    print(review)

print(u'Transformed:')

with codecs.open(trigram_speeches_filepath, encoding='utf_8') as f:
    for review in itertools.islice(f, 30, 33):
        print(review)

Original:
I was amused when the Minister talked about the amending report following the error on Derbyshire police pensions. I wondered whether it was another example of problems with the Home Office fax machine. Was it another example of a document going AWOL between the Government Actuary and the Minister's office?

We should not just do what the Minister has just done—talk about serious issues, discussions and meeting people, making the points that Ministers always make—but should look at the background to the debate. A year ago, the position was different: with the Government's having come into office saying how tough they would be on crime, the Minister of State, Home Office, the right hon. Member for Brent, South (Mr. Boateng), said at the Dispatch Box that it was "sterile and simplistic" to talk about police numbers. He told us that there was no link between detection of crime and police numbers.

A year later, the job of the police has become even more difficult; the number of 


Interpreting naive datetime as local 2017-05-04 16:25:32.318079. Please add timezone info to timestamps.



### LDA Topic Modelling

In [151]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings


Interpreting naive datetime as local 2017-05-04 16:25:32.320138. Please add timezone info to timestamps.



In [152]:
trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')


Interpreting naive datetime as local 2017-05-04 16:25:32.321524. Please add timezone info to timestamps.



In [153]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if True:
    trigram_speeches = LineSentence(trigram_speeches_filepath)

    # learn the dictionary by iterating over all of the speeches
    trigram_dictionary = Dictionary(trigram_speeches)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.323115. Please add timezone info to timestamps.



CPU times: user 12.4 s, sys: 20 ms, total: 12.4 s
Wall time: 13.8 s


In [154]:
trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')


Interpreting naive datetime as local 2017-05-04 16:25:32.324590. Please add timezone info to timestamps.



In [155]:
def trigram_bow_generator(filepath):
    """
    generator function to read speeches from a file
    and yield a bag-of-words representation
    """
    
    for speech in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(speech)


Interpreting naive datetime as local 2017-05-04 16:25:32.326055. Please add timezone info to timestamps.



In [156]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if True:
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_speeches_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.327552. Please add timezone info to timestamps.



CPU times: user 15.9 s, sys: 160 ms, total: 16 s
Wall time: 16.2 s


In [157]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')


Interpreting naive datetime as local 2017-05-04 16:25:32.328970. Please add timezone info to timestamps.



In [158]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if True:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=8)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)


Interpreting naive datetime as local 2017-05-04 16:25:32.330317. Please add timezone info to timestamps.



CPU times: user 2min 54s, sys: 30.5 s, total: 3min 25s
Wall time: 4min 56s


In [159]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print('{:20} {}'.format('term', 'frequency'))

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print('{:20} {:.3f}'.format(term, round(frequency, 3)))


Interpreting naive datetime as local 2017-05-04 16:25:32.331906. Please add timezone info to timestamps.



In [160]:
LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)


Interpreting naive datetime as local 2017-05-04 16:25:32.333434. Please add timezone info to timestamps.



In [161]:
pyLDAvis.display(LDAvis_prepared)


Interpreting naive datetime as local 2017-05-04 16:25:32.340611. Please add timezone info to timestamps.

