# <font color="#49699E" size=40>Exploratory Text Analysis</font>
# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION
## Package Imports

In [None]:
# Standard library and general-purpose data/plotting packages.
import pickle
from pprint import pprint
import pandas as pd
# Render DataFrames as plain text instead of HTML tables in notebook output.
pd.set_option("display.notebook_repr_html", False)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Project-local helpers from the dcss package.
from dcss.text import bigram_process, preprocess, bow_to_df
from dcss.plotting import format_axes_commas, custom_seaborn
from dcss.utils import sparse_groupby
# NOTE(review): presumably applies the project's seaborn styling -- confirm in dcss.plotting.
custom_seaborn()

# spaCy's small English pipeline; it is passed to preprocess() later in this file.
import spacy
nlp = spacy.load('en_core_web_sm')

# scikit-learn / SciPy tools for building and transforming document-term matrices.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import scipy

# SCALING UP: PROCESSING POLITICAL SPEECHES


In [None]:
# Read only the columns this analysis needs, then discard any row that is
# missing a party, a speaker name, or the speech text itself.
hansard_csv = "../data/british_hansards/hansard-speeches-v301.csv"
columns = ['speech', 'speakername', 'party', 'constituency', 'year']
uk_df = pd.read_csv(hansard_csv, usecols=columns).dropna(
    subset=['party', 'speakername', 'speech']
)

# Keep speeches whose year is greater than 2016, then show speeches per party.
uk_df = uk_df[uk_df['year'] > 2016]
uk_df['party'].value_counts()

In [None]:
# Parties retained for the rest of the analysis. The order matters: it is
# reused as the row order of the sample-size summary table.
parties_keep = [
    "Conservative",
    "Labour",
    "Scottish National Party",
    "Labour (Co-op)",
    "Liberal Democrat",
    "Democratic Unionist Party",
    "Plaid Cymru",
    "Green Party",
]

# Keep only the rows whose party is in parties_keep and renumber them from 0.
in_kept_party = uk_df['party'].isin(parties_keep)
party_subset = uk_df.loc[in_kept_party].reset_index(drop=True)

# Speeches per party before any sampling.
total_speech_counts = party_subset['party'].value_counts()
total_speech_counts

In [None]:
# Stratified sample: draw 30% of each party's speeches without replacement.
# The fixed random_state makes the draw repeatable.
speeches_by_party = party_subset.groupby('party')
sampled_speeches = speeches_by_party.sample(frac=.3,
                                            replace=False,
                                            random_state=23)

sampled_speeches.shape[0]

In [None]:
# Save the sampled speeches to disk as a pickle.
sampled_speeches_pickle = '../data/pickles/sampled_british_hansard_speeches.pkl'
with open(sampled_speeches_pickle, 'wb') as handle:
    pickle.dump(sampled_speeches, handle)

In [None]:
# Speeches per party in the 30% sample.
sampled_speech_counts = sampled_speeches['party'].value_counts()

# Side-by-side table of total vs. sampled speech counts for each party.
# Building the frame from the two Series aligns the counts on their party
# labels; zipping the values and then attaching parties_keep as the index would
# only be correct if value_counts() happened to return the parties in exactly
# the hand-written parties_keep order. reindex() keeps that row order.
sample_sizes = pd.DataFrame(
    {'Total': total_speech_counts, 'Sample': sampled_speech_counts}
).reindex(parties_keep)

In [None]:
def _count_space_separated(text):
    """Return how many pieces *text* yields when split on single spaces."""
    return len(text.split(" "))


# Rough token count for every sampled speech.
sampled_speeches['speech_len'] = sampled_speeches['speech'].map(_count_space_separated)

In [None]:
# Group the sampled speeches by party so each party's rows can be fetched by name.
parties = sampled_speeches.groupby(by='party')

def party_subplot(subgroup, title, position):
    """Draw a filled density curve of speech lengths on one subplot.

    subgroup: rows to plot; the 'speech_len' column supplies the x values.
    title: text shown above the panel.
    position: the matplotlib Axes to draw on.
    """
    kde_style = dict(log_scale=True, fill=True, alpha=.5, linewidth=0, color='black')
    sns.kdeplot(data=subgroup, x='speech_len', ax=position, **kde_style)
    position.set(title=title, xlabel='Number of tokens (log scale)')
    
# One panel per party on a shared 2x4 grid. ax.flat walks the grid row by row,
# so the panels are filled in the order the parties are listed here.
panel_parties = (
    'Conservative',
    'Labour',
    'Scottish National Party',
    'Labour (Co-op)',
    'Liberal Democrat',
    'Democratic Unionist Party',
    'Plaid Cymru',
    'Green Party',
)

fig, ax = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(10, 4))
for panel, party_name in zip(ax.flat, panel_parties):
    party_subplot(parties.get_group(party_name), party_name, panel)

plt.tight_layout()
plt.show()

In [None]:
# Median speech length within each party.
median_speech_len = parties['speech_len'].median()
median_speech_len

## From Rule-Based Chunks and Triplets to Statistically Dependent n-grams


In [None]:
# Run the project's preprocess() helper over the raw speech text using the
# spaCy pipeline loaded earlier. It returns a bigram model and the processed
# speeches. NOTE(review): the exact processing steps live in dcss.text -- confirm there.
speech_texts = sampled_speeches['speech']
bigram_model, preprocessed = preprocess(
    speech_texts,
    nlp=nlp,
    bigrams=True,
    detokenize=True,
    n_process=4,
)
len(preprocessed)

In [None]:
# Save the preprocessed speeches and the bigram model so they can be reloaded
# later without calling preprocess() again.
preprocessing_outputs = (
    ('../data/pickles/processed_sample_british_party_subset_hansards.pkl', preprocessed),
    ('../data/pickles/sample_british_party_subset_hansard_bigram_model.pkl', bigram_model),
)
for pickle_path, payload in preprocessing_outputs:
    with open(pickle_path, 'wb') as fp:
        pickle.dump(payload, fp)

In [None]:
# Reload the preprocessed speeches from disk.
# pickle.load can execute arbitrary code, so only load pickles you created yourself.
preprocessed_pickle = '../data/pickles/processed_sample_british_party_subset_hansards.pkl'
with open(preprocessed_pickle, 'rb') as handle:
    preprocessed = pickle.load(handle)

In [None]:
# Look at the raw text of one sampled speech (row position 700).
sampled_speeches['speech'].iloc[700]

In [None]:
# Store the processed text next to the raw text, then view the processed
# version of the speech at row position 700 for comparison with the raw one.
sampled_speeches['preprocessed'] = preprocessed
sampled_speeches['preprocessed'].iloc[700]

# CREATING DTMs WITH SKLEARN


## Count Vectorization


In [None]:
# Vocabulary pruning and normalisation settings for the count-based DTM.
count_vectorizer = CountVectorizer(
    strip_accents='ascii',  # fold accented characters to their ASCII form
    min_df=3,               # drop tokens that appear in fewer than 3 documents
    max_df=.1,              # drop tokens that appear in more than 10% of documents
)

In [None]:
# Learn the vocabulary and build the sparse document-term matrix in one pass.
count_matrix = count_vectorizer.fit_transform(preprocessed)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    # .tolist() keeps vocabulary a plain list of str, as the old method returned.
    vocabulary = count_vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = count_vectorizer.get_feature_names()

count_matrix.shape

In [None]:
# Save the document-term matrix and its vocabulary to disk as pickles.
dtm_outputs = (
    ('../data/pickles/brit_hansards_sample_party_subset_count_matrix.pkl', count_matrix),
    ('../data/pickles/brit_hansards_sample_party_subset_vocabulary.pkl', vocabulary),
)
for pickle_path, payload in dtm_outputs:
    with open(pickle_path, 'wb') as fp:
        pickle.dump(payload, fp)

### Comparing Token Frequencies and Proportions


In [None]:
# Wrap the sparse count matrix in a DataFrame with one column per vocabulary
# term, then label every row with the party of the speech it came from.
count_data = pd.DataFrame.sparse.from_spmatrix(count_matrix, columns=vocabulary)
count_data.index = sampled_speeches['party']

count_data.shape

In [None]:
# memory_usage() reports bytes; 1048576 bytes = 1 MiB.
sparse_megabytes = count_data.memory_usage().sum() / 1048576
print(f'sparse size: {sparse_megabytes!s}MB')
print(f'sparse density : {count_data.sparse.density!s}')

In [None]:
# Aggregate the per-speech token counts by party with the project's
# sparse_groupby helper, then divide each row by its total so the values
# become within-party proportions.
party_counts = sparse_groupby(sampled_speeches['party'], count_matrix, vocabulary)
row_totals = party_counts.sum(axis=1)
results = party_counts.div(row_totals, axis='index')

# Transpose so tokens are rows and parties are columns, then spot-check 20 rows.
results_t = results.transpose()
results_t.sample(20, random_state=10061986)

In [None]:
# Rank the parties by how large a share of their tokens is the search term.
search_term = 'scotland'
term_share_by_party = results_t.loc[search_term]
term_share_by_party.sort_values(ascending=False)

In [None]:
n_top_words = 5

# For every party (a column of results_t), keep its n_top_words
# highest-proportion tokens as (token, proportion) pairs.
top_words_per_party = {
    party_name: list(results_t[party_name].nlargest(n_top_words).items())
    for party_name in results_t.columns
}

# Print each party's name in capitals followed by its top tokens.
for party_name, ranked_tokens in top_words_per_party.items():
    print(party_name.upper())
    for token_and_share in ranked_tokens:
        print(token_and_share)
    print('\n')

In [None]:
# Difference of proportions per token. Positive values mean the token takes up
# a larger share of Conservative speech; negative values mean a larger share
# of SNP speech. Sorted from most Conservative-leaning to most SNP-leaning.
proportion_gap = results_t['Conservative'] - results_t['Scottish National Party']
diff_con_snp = proportion_gap.sort_values(ascending=False)

In [None]:
# The 20 tokens with the largest positive difference, i.e. used
# proportionally more by the Conservatives than by the SNP.
con_not_snp = diff_con_snp.iloc[:20]
con_not_snp

In [None]:
# The 20 tokens with the most negative difference, i.e. used proportionally
# more by the SNP than by the Conservatives.
# NOTE: despite the `lab_` prefix, this holds SNP-leaning tokens, not Labour ones.
lab_not_snp = diff_con_snp.iloc[-20:]
lab_not_snp

In [None]:
# Stack both ends of the ranking into one Series of difference-of-proportion
# values: the most Conservative-leaning tokens first, then the most SNP-leaning.
both_extremes = [con_not_snp, lab_not_snp]
dop = pd.concat(both_extremes)

In [None]:
# One point per token: x is the difference of proportions, y is the token.
fig, ax = plt.subplots(figsize=(6, 6))
sns.swarmplot(x=dop, y=dop.index, size=4, color='black', ax=ax)

# Vertical reference line at zero separates SNP-leaning (left) from
# Conservative-leaning (right) tokens; the grid makes values easier to read.
ax.axvline(0)
ax.grid()

ax.set(title='Difference of Proportions',
       ylabel='',
       xlabel=r'($\longleftarrow$ Scottish National Party)        (Conservative Party $\longrightarrow$)')
fig.tight_layout()
plt.show()

# CONCLUSION
## Key Points 
