# <font color="#49699E" size=40>Text Similarity and Latent Semantic Space</font>
# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION
## Package Imports

In [None]:
# Standard library
import pickle
from pprint import pprint
# Core data and plotting stack
import pandas as pd
# Show DataFrames as plain text instead of HTML tables in notebook output.
pd.set_option("display.notebook_repr_html", False)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Project helpers from the dcss package
from dcss.text import bigram_process, preprocess, bow_to_df, get_topic_word_scores
from dcss.plotting import format_axes_commas, custom_seaborn
from dcss.utils import sparse_groupby
# NOTE(review): presumably applies the project's seaborn styling — confirm in dcss.plotting.
custom_seaborn()

import spacy
# Load spaCy's small English pipeline with the named-entity recognizer switched off.
nlp = spacy.load('en_core_web_sm', disable=['ner'])

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

## TF-IDF Vectorization


In [None]:
# Read the pickled, already-preprocessed documents back into memory.
# NOTE: unpickling can run arbitrary code; only load files you created or trust.
with open('../data/pickles/processed_sample_british_party_subset_hansards.pkl', mode='rb') as pickle_file:
    preprocessed = pickle.load(pickle_file)

In [None]:
# TF-IDF model that drops terms found in more than 10% of the documents
# (max_df as a proportion) or in fewer than 3 documents (min_df as a count),
# and folds accented characters to their ASCII equivalents.
tfidf_vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    min_df=3,
    max_df=0.1,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed)

# Show the dimensions of the resulting document-term matrix.
tfidf_matrix.shape

In [None]:
# Load the pickled term-count matrix.
# NOTE: unpickling can run arbitrary code; only load files you created or trust.
with open('../data/pickles/brit_hansards_sample_party_subset_count_matrix.pkl', 'rb') as fp:
    count_matrix = pickle.load(fp)

# Sum every column to get one total per term, then divide each vector by its
# L2 norm so the TF-IDF totals and the raw counts sit on comparable scales.
tfidf_scores = np.ravel(tfidf_matrix.sum(0))
tfidf_scores = tfidf_scores/np.linalg.norm(tfidf_scores)
term_counts = np.ravel(count_matrix.sum(0))
term_counts = term_counts/np.linalg.norm(term_counts)

# `get_feature_names()` was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate it.
vocabulary = list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

# NOTE(review): assumes count_matrix has the same columns, in the same order,
# as tfidf_matrix — confirm against the code that produced the pickle.
# Built and sorted in one expression (no inplace mutation) so re-running the
# cell always yields the same frame: highest TF-IDF total first.
df = (
    pd.DataFrame({'Term': vocabulary, 'TFIDF': tfidf_scores, 'Count': term_counts})
    .sort_values(by='TFIDF', ascending=False)
)

In [None]:
# Joint histogram of normalized count against normalized TF-IDF total,
# limited to the first 5,000 rows of df.
sns.jointplot(x='Count', y='TFIDF', kind='hist', data=df.iloc[:5000])
plt.show()

In [None]:
# Replace the earlier vectorizer and matrix: no document-frequency cut-offs
# this time, and term frequencies are scaled sublinearly (1 + log(tf)).
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='ascii',
)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed)

# Show the dimensions of the new document-term matrix.
tfidf_matrix.shape

## Computing Semantic Similarity and Clustering Documents


In [None]:
# Load the pickled speech table.
# NOTE: unpickling can run arbitrary code; only load files you created or trust.
with open('../data/pickles/sampled_british_hansard_speeches.pkl', 'rb') as fp:
    speech_df = pickle.load(fp)

party_names = speech_df['party']

# `get_feature_names()` was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate it.
# list(...) keeps the value a plain list, as the old method returned.
tfidf_vocabulary = list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

# NOTE(review): sparse_groupby presumably aggregates the rows of tfidf_matrix
# per party label, with the vocabulary as column names — confirm in dcss.utils.
party_scores = sparse_groupby(party_names, tfidf_matrix, tfidf_vocabulary)

In [None]:
# Number of entries in party_names.
len(party_names)

In [None]:
# Rescale every row of party_scores to unit L2 length
# ('l2' is Normalizer's default norm, spelled out here for the reader).
normalize = Normalizer(norm='l2')
party_scores_n = normalize.fit_transform(party_scores)

In [None]:
# The rows were scaled to unit length, so the product of the matrix with its
# transpose holds the cosine similarity of every pair of rows.
sim_matrix = party_scores_n @ party_scores_n.T

# Wrap the sparse result in a DataFrame, then densify it.
sim_df = pd.DataFrame.sparse.from_spmatrix(sim_matrix)
sim_df = sim_df.sparse.to_dense()

In [None]:
# Keep only the strict upper triangle: blank the diagonal (self-similarity)
# and the lower triangle (mirror-image duplicates) with NaN.
# Writing through `sim_df.values` only works while pandas hands back a
# writable view; under Copy-on-Write the array is read-only and the write
# raises. `where` does the masking through the public API and is safe to re-run.
sim_df = sim_df.where(np.triu(np.ones(sim_df.shape, dtype=bool), k=1))

In [None]:
# Label both axes of the similarity table with the row labels of party_scores.
sim_df.columns = sim_df.index = party_scores.index

In [None]:
# Flatten the table to one value per (row, column) pair and show the three highest.
most_similar = sim_df.stack().nlargest(3)
print(most_similar)

In [None]:
# Flatten the table to one value per (row, column) pair and show the three lowest.
least_similar = sim_df.stack().nsmallest(3)
print(least_similar)

In [None]:
# Wrap the normalized scores in a labelled sparse DataFrame: rows carry the
# labels of party_scores, columns carry the vectorizer's vocabulary.
# `get_feature_names()` was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate it.
party_scores_df = pd.DataFrame.sparse.from_spmatrix(
    party_scores_n,
    index=party_scores.index,
    columns=list(
        tfidf_vectorizer.get_feature_names_out()
        if hasattr(tfidf_vectorizer, 'get_feature_names_out')
        else tfidf_vectorizer.get_feature_names()
    ),
)

# Print the ten highest-scoring terms for a handful of parties.
for party in ['Labour','Liberal Democrat', 'Democratic Unionist Party', 'Plaid Cymru']:
    print(party + '\n')
    print(party_scores_df.loc[party].nlargest(10))
    print('\n')

# EXPLORING LATENT SEMANTIC SPACE WITH MATRIX DECOMPOSITION


## Latent Semantic Analysis (LSA) with Singular Value Decomposition (SVD)


### LSA via SVD in sklearn


In [None]:
# Truncated SVD for LSA: keep 100 components, run 6 solver iterations,
# and pin the random seed so the decomposition is reproducible.
lsa = TruncatedSVD(
    n_components=100,
    n_iter=6,
    random_state=12,
)

In [None]:
# Fit the decomposition on the TF-IDF matrix and rebind lsa to the value fit() returns.
lsa = lsa.fit(tfidf_matrix)

In [None]:
# Take the first 20 singular values of the fitted decomposition and display them.
svs = lsa.singular_values_[:20]
svs

In [None]:
# components_ has one row per component; transpose so WORDS are in the rows
# and components ("topics") are in the columns.
word_topics = pd.DataFrame(lsa.components_).T

# Derive the labels from the actual number of components instead of
# hard-coding 100, so changing n_components cannot cause a length mismatch.
column_names = [f'Topic {c}' for c in range(1, word_topics.shape[1] + 1)]
word_topics.columns = column_names

# Show the dimensions of the word-by-topic table.
word_topics.shape

In [None]:
# Label the rows of word_topics with the vectorizer's vocabulary.
# `get_feature_names()` was removed in scikit-learn 1.2. Use its replacement
# when available and fall back to the old name only on releases that predate it.
terms = list(
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)
word_topics.index = terms

# The 20 terms with the highest values in the 'Topic 2' column.
word_topics.sort_values(by='Topic 2', ascending = False)['Topic 2'].head(20)

In [None]:
# Row labels of word_topics whose topic rankings are placed side by side.
compare_terms = ['england', 'scotland', 'wale', 'ireland']

# For each term, rank its topic scores from highest to lowest. The integer key
# holds the ranked topic labels and the term key holds the matching scores.
ranked_columns = {}
for position, term in enumerate(compare_terms):
    ranked = word_topics.loc[term].sort_values(ascending=False)
    ranked_columns[position] = ranked.index
    ranked_columns[term] = ranked.values

# Assemble all columns in one go, in insertion order.
compare_df = pd.DataFrame(ranked_columns)

In [None]:
# Show the first rows of compare_df.
compare_df.head()

In [None]:
# All values in the 'scotland' row of word_topics, highest first.
word_topics.loc['scotland'].sort_values(ascending=False)

In [None]:
# NOTE(review): presumably returns the 10 strongest words for 'Topic 8' from
# word_topics — confirm the argument meanings in dcss.text.get_topic_word_scores.
get_topic_word_scores(word_topics, 10, 'Topic 8')

# CONCLUSION
## Key Points 
