## Analyse all House of Commons speeches by current MPs

In [46]:
import spacy
import pandas as pd
import json
from bs4 import BeautifulSoup
from collections import Counter
import requests

from config import TWFY_API_KEY

  new_obj[k] = extract_dates(v)


In [None]:
# Get MP -> theyworkforyou_id map from TheyWorkForYou's CSV listing of MPs.
# The "Person ID" column of this table is what the download step below feeds to the API.
# Probably need to modify after general election to point to previous batch of MPs
mps = pd.read_csv("https://www.theyworkforyou.com/mps/?f=csv")

In [2]:
def get_mp_speeches(mp_id):
    """Get all Commons speeches of one MP from the TheyWorkForYou API.

    Pages through the ``getDebates`` endpoint, 1000 results at a time, until
    a page comes back with no rows.

    Args:
        mp_id: TheyWorkForYou person id of the MP.

    Returns:
        Long format pandas data frame in which each row represents one speech
        at a particular date and time. Columns: speech_id, speech_url,
        mp_name, mp_constituency, mp_party, mp_id, date, time, section_id,
        subsection_id, debate_title and body (speech text with HTML removed).
        The frame is empty if the API returns no rows for this MP.

    Raises:
        requests.HTTPError: if the API responds with an HTTP error status.
    """
    api_url = "https://www.theyworkforyou.com/api/getDebates"
    speeches = []
    page_no = 1
    while True:
        # Let requests build and escape the query string. A hand-built URL
        # continued across lines with a backslash inside the string literal
        # embeds the next line's indentation in the query.
        response = requests.get(
            api_url,
            params={
                "key": TWFY_API_KEY,
                "type": "commons",
                "person": mp_id,
                "results_per_page": 1000,
                "num": 1000,
                "page": page_no,
                "output": "js",
            },
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page
        response.raise_for_status()
        rows = response.json()["rows"]
        # An empty page means every speech has been fetched
        if not rows:
            break

        for row in rows:
            speeches.append({
                'speech_id': row["gid"],
                'speech_url': row["listurl"],
                'mp_name': row["speaker"]["name"],
                'mp_constituency': row["speaker"]["constituency"],
                'mp_party': row["speaker"]["party"],
                'mp_id': row["person_id"],
                'date': pd.to_datetime(row["hdate"], format="%Y-%m-%d"),
                'time': row["htime"],
                'section_id': row["section_id"],
                'subsection_id': row["subsection_id"],
                'debate_title': row["parent"]["body"],
                # Strip the HTML markup and keep only the spoken text
                'body': BeautifulSoup(row["body"], "html5lib").get_text(),
            })
        page_no += 1

    print("Got speeches for MP {0}".format(mp_id))
    # Build the frame once from all records rather than concatenating a new
    # data frame for every page.
    return pd.DataFrame(speeches)

In [3]:
## Download all MP speeches if DOWNLOAD_SPEECHES is set to True,
## otherwise read the copy previously saved to mp_speeches.h5
DOWNLOAD_SPEECHES = False
if DOWNLOAD_SPEECHES:
    # Parallelise downloading of MP speeches
    from multiprocessing import Pool

    # Number of pool workers used to fetch (multiprocessing.Pool starts
    # worker processes, not threads)
    NUM_THREADS = 8
    # Make list of mp ids
    list_of_mp_ids = list(mps["Person ID"])

    # Create pool of workers
    pool = Pool(NUM_THREADS)
    try:
        # Use pool.map to download speeches mp by mp; blocks until all are done
        results = pool.map(get_mp_speeches, list_of_mp_ids)
    finally:
        # Always shut the workers down, even if one of the downloads raised
        pool.close()
        pool.join()

    # Concatenate all mps into one dataframe
    all_mp_speeches = pd.concat(results)

    # Write data to a file to save it (key passed by keyword: newer pandas
    # releases no longer accept it positionally)
    all_mp_speeches.to_hdf("mp_speeches.h5", key="speeches")
else:
    ## Read in mp speeches that have been previously downloaded
    all_mp_speeches = pd.read_hdf("mp_speeches.h5", key="speeches")

In [4]:
# Parse the "HH:MM:SS" time strings into datetimes; values that do not match the format become NaT (errors="coerce")
all_mp_speeches["time_"] = pd.to_datetime(all_mp_speeches.time, format="%H:%M:%S", errors="coerce")

In [5]:
# Hour of each speech: the part of the time string before the first ":" (kept as text, not converted to a number)
all_mp_speeches["time_hour"] = all_mp_speeches.time.str.split(":", expand=True).get(0)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

In [None]:
# Load the English language model from spaCy, using the "en" model name
nlp = spacy.load("en")

### Run data through NLP

In [7]:
import pyLDAvis
import pyLDAvis.sklearn

In [8]:
# textacy is used from this cell onwards but is not imported with the other
# dependencies at the top of the notebook, so import it before first use
import textacy

# Create new corpus object to store all text data
corpus = textacy.Corpus("en")

  new_obj[k] = extract_dates(v)


In [9]:
# Work on a random 10% of the speeches.
# NOTE(review): no random_state is passed, so a different sample is drawn on every run.
all_mp_speeches_sample = all_mp_speeches.sample(frac=0.1)

  new_obj[k] = extract_dates(v)


In [10]:
# Drop the reference to the full data frame; the cells below only use all_mp_speeches_sample
del all_mp_speeches

  new_obj[k] = extract_dates(v)


In [14]:
# Keep every column except the speech text as per-speech metadata.
# `axis` is passed by keyword: recent pandas releases no longer accept it positionally.
all_mp_speeches_metadata = all_mp_speeches_sample.drop("body", axis=1)
# Add the speech texts to the corpus
corpus.add_texts(texts=all_mp_speeches_sample["body"], n_threads=8)
del all_mp_speeches_sample # To save memory

  new_obj[k] = extract_dates(v)


In [33]:
# Build a TF-IDF weighted document-term matrix (and the id -> term lookup) from
# the terms of every document in the corpus, leaving out named entities,
# stop words, punctuation and numbers.
# The keyword is spelt `normalize`, as in textacy's API; the British spelling
# `normalise` is not one of its arguments.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
    (doc.to_terms_list(named_entities=False, as_strings=True, normalize="lemma",
                       filter_stops=True, filter_punct=True, filter_nums=True)
     for doc in corpus),
    weighting='tfidf', normalize=True, smooth_idf=True, min_df=3, max_df=0.95, max_n_terms=100000)

  new_obj[k] = extract_dates(v)


In [44]:
# Inspect the terms extracted from the first document (this time including named entities).
# The keyword is spelt `normalize`, as in textacy's API.
list(corpus[0].to_terms_list(named_entities=True, as_strings=True, normalize="lemma",
                             filter_stops=True, filter_punct=True, filter_nums=True))

  new_obj[k] = extract_dates(v)


['house',
 'speaker',
 '14-minute',
 'back - bench',
 'house',
 'order',
 'remind',
 'house',
 'mr.',
 'speaker',
 'place',
 '14-minute',
 'time',
 'limit',
 'bench',
 'speech',
 'describe',
 'hon',
 'member',
 'late',
 'surge',
 'interest',
 'participate',
 'debate',
 'touch',
 'generous',
 'hope',
 'hon',
 'member',
 'bear',
 'mind',
 'addres',
 'house',
 'mr. speaker',
 '14-minute time',
 'time limit',
 'bench speech',
 'late surge',
 'remind the house',
 'house that mr.',
 'speaker have place',
 'place a 14-minute',
 '14-minute time limit',
 'describe as hon',
 'surge of interest',
 'interest in participate',
 'hope that hon',
 'member will bear',
 'addres the house']

In [35]:
# Fit an LDA topic model with 30 topics (max_iter=30) on the document-term matrix
model = textacy.tm.TopicModel('lda', n_topics=30, max_iter=30)
model.fit(doc_term_matrix)

  new_obj[k] = extract_dates(v)


In [36]:
# Transform the document-term matrix with the fitted topic model
# (presumably one row of topic weights per document — confirm against textacy docs)
doc_topic_matrix = model.transform(doc_term_matrix)

  new_obj[k] = extract_dates(v)


In [37]:
# Gather the inputs pyLDAvis.prepare needs from the fitted model and the corpus
top_term_matrix = model.model.components_
doc_lengths = [len(doc) for doc in corpus]
# Order the vocabulary by term id so it lines up with the matrix columns,
# whatever order the id2term mapping happens to iterate in
vocab = [term for _, term in sorted(id2term.items())]
term_frequency = textacy.vsm.get_term_freqs(doc_term_matrix)

vis_data = pyLDAvis.prepare(
    # pyLDAvis expects every topic row to be a distribution summing to 1;
    # the raw components_ may be unnormalised weights, so normalise each row
    # (a no-op if the rows already sum to 1)
    top_term_matrix / top_term_matrix.sum(axis=1, keepdims=True),
    doc_topic_matrix,
    doc_lengths,
    vocab,
    term_frequency,
)

  new_obj[k] = extract_dates(v)


In [38]:
# Switch pyLDAvis to notebook mode so the prepared visualisation is shown inline
pyLDAvis.enable_notebook()

  new_obj[k] = extract_dates(v)


In [39]:
# Display the prepared topic-model visualisation as the cell output
vis_data

  new_obj[k] = extract_dates(v)
