# <font color="#49699E" size=40>Can We Model Meaning? Contextual Representation and Neural Word Embeddings</font>
# LEARNING OBJECTIVES
# LEARNING MATERIALS
# INTRODUCTION


# CAN WE MODEL MEANING?
## The Distributional Hypothesis


# WHAT ARE NEURAL WORD EMBEDDINGS?


## Learning Embeddings with Word2Vec


# CULTURAL CARTOGRAPHY: GETTING A FEEL FOR VECTOR SPACE

## King - Man + Woman $\neq$ Queen

In [None]:
# Word-vector helpers from whatlies, backed by a spaCy model.
from whatlies import Embedding
from whatlies.embeddingset import EmbeddingSet
from whatlies.language import SpacyLanguage
# `lang` is indexed by word in later cells (e.g. lang['king']) to get embeddings.
lang = SpacyLanguage('en_core_web_md')

import pandas as pd
# Turn off the HTML notebook representation of pandas objects.
pd.set_option("display.notebook_repr_html", False)
from dcss.utils import list_files, IterSents, mp_disk
from dcss.text import bigram_process

import gensim
from multiprocessing import Process, Manager
from gensim.utils import simple_preprocess

import matplotlib.pyplot as plt
from dcss.plotting import custom_seaborn
# presumably applies the project's plotting style — confirm in dcss.plotting
custom_seaborn()

In [None]:
# Derived vectors are drawn first, in light gray, with show_ops=True.
derived_vectors = (
    lang['queen'] - lang['king'],
    lang['king'] + lang['woman'] - lang['man'],
)
for derived in derived_vectors:
    derived.plot(kind='arrow', color='lightgray', show_ops=True)

# Base words follow in the same order as before: the man/woman pair in
# crimson, then the king/queen pair in black.
base_words = (
    ('man', 'crimson'),
    ('woman', 'crimson'),
    ('king', 'black'),
    ('queen', 'black'),
)
for word, colour in base_words:
    lang[word].plot(kind='arrow', color=colour)

plt.axis('off')
plt.show()

In [None]:
# Each entry is (label to print, first word, second word); the distance is
# measured from the first word's embedding to the second's.
word_pairs = [
    ("Queen and King: ", 'queen', 'king'),
    ("Man and Woman: ", 'man', 'woman'),
    ("Man and King: ", 'man', 'king'),
    ("Woman and King: ", 'woman', 'king'),
]

for label, first, second in word_pairs:
    print(label + str(lang[first].distance(lang[second])))

In [None]:
# Build the analogy vector king + woman - man and report how far it lies from
# 'king' and from 'queen'.
king_woman_no_man = lang['king'] + lang['woman'] - lang['man']

# Both labels end in ": " so the printed number is separated from the label,
# matching the other distance printouts in this notebook.
print("King and combo-vector: " + str(lang['king'].distance(king_woman_no_man)))
print("Queen and combo-vector: " + str(lang['queen'].distance(king_woman_no_man)))

In [None]:
# Point the combination vector's `orig` at its own name; per the original
# note, the plot would otherwise pick up the original ('man') instead.
king_woman_no_man.orig = king_woman_no_man.name

# The four base words, followed by the combination vector.
analogy_members = [lang[word] for word in ('king', 'queen', 'man', 'woman')]
analogy_members.append(king_woman_no_man)
king_queen_man_woman_plus = EmbeddingSet(*analogy_members)

# Position every member by its cosine similarity to 'king' (x) and 'queen' (y).
king_queen_man_woman_plus.plot_interactive(
    x_axis=lang["king"],
    y_axis=lang["queen"],
    axis_metric='cosine_similarity',
)

In [None]:
print("Woman and Queen: " + str(lang['woman'].distance(lang['queen'])))
print("Woman and Queen without man: " + str((lang['woman']-lang['man']).distance(lang['queen'])))

In [None]:
print("Woman and Queen without man: " + str(Embedding('halfway', lang['woman'].vector-lang['man'].vector*0.5).distance(lang['queen'])))

# LEARNING EMBEDDINGS WITH GENSIM


## Data


In [None]:
# Ask list_files for the 'csv' files under the LiPaD Hansard directory.
datasets = list_files("../data/canadian_hansards/lipad/", 'csv')
# Notebook display: number of files found.
len(datasets)

In [None]:
def get_sentences(dataset):
    """Convert a batch of Hansard CSV files into one newline-joined text blob.

    Reads every CSV in ``dataset``, collects the ``speechtext`` column,
    replaces line breaks inside each speech with spaces, runs the speeches
    through ``bigram_process`` and puts the resulting sentences, joined with
    newlines, on the module-level queue ``q``.

    Parameters
    ----------
    dataset : iterable of str
        Paths of CSV files that each contain a ``speechtext`` column.

    Notes
    -----
    Nothing is returned. The result is delivered through the global ``q``,
    which must be defined before this function is called.
    """
    speeches = []
    for csv_path in dataset:
        # Read one file at a time so only a single DataFrame is alive at once.
        speeches.extend(pd.read_csv(csv_path)['speechtext'].tolist())

    # str.replace matches its argument literally, so a combined '\n|\r'
    # pattern never matches anything; replace each line-break character
    # explicitly instead.
    # NOTE(review): missing speechtext values pass through str() and end up
    # as the text 'nan' — confirm whether they should be dropped.
    speeches = [str(s).replace('\n', ' ').replace('\r', ' ') for s in speeches]

    _, sentences = bigram_process(speeches, n_process = 1)

    # One string per batch, one sentence per line.
    q.put('\n'.join(sentences))

In [None]:
# get_sentences publishes its output on the global queue `q`, so the queue
# has to exist before mp_disk starts calling it.
m = Manager()
q = m.Queue()
# presumably mp_disk runs get_sentences over `datasets` and writes what
# arrives on `q` to the given text file — confirm in dcss.utils
mp_disk(datasets, get_sentences, '../data/txt_files/can_hansard_speeches.txt', q)

In [None]:
# Load the exported corpus; the file is only needed for the read itself.
with open('../data/txt_files/can_hansard_speeches.txt') as file:
    data = file.read()

# Whitespace-separated token count for the whole corpus.
words = data.split()
print(len(words))

In [None]:
# Feed the exported corpus file to Word2Vec through IterSents.
sentences = IterSents('../data/txt_files/can_hansard_speeches.txt')

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names (gensim 4
# uses `vector_size` and `epochs`) — confirm the installed gensim version.
model = gensim.models.Word2Vec(sentences, size = 300, window = 4, iter = 5, 
                               sg = 0, min_count = 10, negative = 5, workers = 4)

In [None]:
# Alphabetically sorted list of every token the model kept.
vocabulary = sorted(model.wv.vocab)

# Write one token per line. The encoding is fixed to UTF-8 so the file does
# not depend on the platform's default locale encoding.
with open('../models/model_vocabulary.txt', 'w', encoding='utf-8') as f:
    f.writelines(v + '\n' for v in vocabulary)

# Persist the full trained model next to the vocabulary file.
model.save('../models/word2vec.model')

In [None]:
# Reload the saved model and keep only its `.wv` attribute; from here on
# `model` refers to that object rather than the full Word2Vec instance.
model = gensim.models.Word2Vec.load('../models/word2vec.model')
model = model.wv

# COMPARING EMBEDDINGS
## Imports

In [None]:
from twec.twec import TWEC
from gensim.models.word2vec import Word2Vec
import pandas as pd
from dcss.utils import list_files, mp_disk

from tok import Tokenizer
from gensim.utils import simple_preprocess
from multiprocessing import Process, Manager
import re

## Aligning Your Vector Spaces!
## Step 1: Train the Compass


In [None]:
# The compass is trained on the text file exported from the full set of CSVs.
compass_path = '../data/txt_files/can_hansard_speeches.txt'

In [None]:
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings("ignore")

In [None]:
# NOTE(review): siter/diter look like the iteration counts for the compass
# and for the slices, and ns like the negative-sample count — confirm
# against the TWEC documentation.
aligner = TWEC(size = 300, siter = 5, diter = 5, window = 10, sg = 0, min_count = 10, ns = 5, workers = 4)
# presumably overwrite=False reuses a previously trained compass — TODO confirm
aligner.train_compass(compass_path, overwrite=False)

## Step 2: Train a Series of Aligned Embedding Models
### Research on Cultural Change with Temporal Embeddings


In [None]:
# Ask list_files for the 'csv' files under the LiPaD Hansard directory.
datasets = list_files("../data/canadian_hansards/lipad/", 'csv')
# Notebook display: number of files found.
len(datasets)

In [None]:
# One list of CSV paths per decade, filled from the per-year directories.
canadian_1990s = []
canadian_2000s = []
canadian_2010s = []

decade_buckets = [
    (1990, canadian_1990s),
    (2000, canadian_2000s),
    (2010, canadian_2010s),
]

for decade_start, bucket in decade_buckets:
    # range() excludes its stop value, so the stop must be the first year of
    # the *next* decade for the final year (1999, 2009, 2019) to be included.
    # NOTE(review): what happens for a year with no directory depends on
    # list_files — confirm it yields an empty list in that case.
    for year in range(decade_start, decade_start + 10):
        year_data = '../data/canadian_hansards/lipad/' + str(year) + '/'
        bucket.extend(list_files(year_data, 'csv'))

In [None]:
# Export one text file per decade. get_sentences writes to the global `q`,
# so a fresh manager and queue are bound before each mp_disk call.
m = Manager()
q = m.Queue()
mp_disk(canadian_1990s, get_sentences, '../data/txt_files/1990s_speeches.txt', q)

In [None]:
m = Manager()
q = m.Queue()
mp_disk(canadian_2000s, get_sentences, '../data/txt_files/2000s_speeches.txt', q)

In [None]:
m = Manager()
q = m.Queue()
mp_disk(canadian_2010s, get_sentences, '../data/txt_files/2010s_speeches.txt', q)

In [None]:
# Train one model per decade text file with the shared aligner.
# NOTE(review): save=True presumably writes each model into the aligner's
# output directory — confirm that it matches the '../models/' paths loaded
# at the end of this group of cells.
model_1990s = aligner.train_slice('../data/txt_files/1990s_speeches.txt', save=True)

In [None]:
model_2000s = aligner.train_slice('../data/txt_files/2000s_speeches.txt', save=True)

In [None]:
model_2010s = aligner.train_slice('../data/txt_files/2010s_speeches.txt', save=True)

In [None]:
# Reload the decade models from disk, rebinding the names assigned above.
model_1990s = Word2Vec.load('../models/1990s_speeches.model')
model_2000s = Word2Vec.load('../models/2000s_speeches.model')
model_2010s = Word2Vec.load('../models/2010s_speeches.model')

In [None]:
# Query each decade's model for the 10 entries most similar to
# 'climate_change'; each cell displays its result in the notebook.
model_1990s.wv.most_similar(positive = 'climate_change', topn = 10)

In [None]:
model_2000s.wv.most_similar(positive = 'climate_change', topn = 10)

In [None]:
model_2010s.wv.most_similar(positive = 'climate_change', topn = 10)

### Cross-sectional Comparisons: Political Parties on Climate Change


In [None]:
# Party-name fragments used to select speeches by the speaker's party.
# Each list is passed as `filter_terms` to get_sentences_by_party.
liberal = ['Liberal']
conservative = [
    'Conservative',
    'Canadian Alliance',
    'Progressive Conservative',
    'Reform',
]
ndp = ['New Democratic Party']

In [None]:
def get_sentences_by_party(dataset, filter_terms):
    """Export the speeches of selected parties from a batch of Hansard CSVs.

    For every CSV in ``dataset``, rows without a ``speakerparty`` value are
    dropped and only rows whose ``speakerparty`` contains at least one of
    ``filter_terms`` as a substring are kept. The ``speechtext`` of the kept
    rows has its line breaks replaced by spaces, is run through
    ``bigram_process``, and the resulting sentences are put on the
    module-level queue ``q`` as a single newline-joined string.

    Parameters
    ----------
    dataset : iterable of str
        Paths of CSV files with ``speakerparty`` and ``speechtext`` columns.
    filter_terms : iterable of str
        Party-name fragments; a row matches if any of them occurs in its
        ``speakerparty`` value.

    Notes
    -----
    Nothing is returned. The result is delivered through the global ``q``,
    which must be defined before this function is called.
    """
    speeches = []

    for csv_path in dataset:
        with_party = pd.read_csv(csv_path).dropna(subset = ['speakerparty'])
        mask = with_party['speakerparty'].apply(
            lambda label: any(term in label for term in filter_terms)
        )
        matched = with_party[mask]
        # Files with no matching rows contribute nothing.
        if len(matched) > 0:
            speeches.extend(matched['speechtext'].tolist())

    # str.replace matches its argument literally, so a combined '\n|\r'
    # pattern never matches anything; replace each line-break character
    # explicitly instead. The text is converted to str but not lowercased.
    speeches = [str(s).replace('\n', ' ').replace('\r', ' ') for s in speeches]

    # bigram_process is imported directly from dcss.text; no module alias
    # `u` exists in this notebook.
    _, sentences = bigram_process(speeches)

    # One string per batch, one sentence per line.
    q.put('\n'.join(sentences))

In [None]:
# Export one text file per party group. get_sentences_by_party writes to the
# global `q`, so a fresh manager and queue are bound before each mp_disk call.
# The party list is passed as the trailing argument — presumably mp_disk
# forwards it as `filter_terms`; confirm in dcss.utils.
m = Manager()
q = m.Queue()

mp_disk(datasets, get_sentences_by_party, '../data/txt_files/liberal_speeches.txt', q, liberal)

In [None]:
m = Manager()
q = m.Queue()

mp_disk(datasets, get_sentences_by_party, '../data/txt_files/conservative_speeches.txt', q, conservative)

In [None]:
m = Manager()
q = m.Queue()

mp_disk(datasets, get_sentences_by_party, '../data/txt_files/ndp_speeches.txt', q, ndp)

In [None]:
# Train one model per party text file with the shared aligner.
# NOTE(review): save=True presumably writes each model into the aligner's
# output directory — confirm that it matches the '../models/' paths loaded
# at the end of this group of cells.
model_liberal = aligner.train_slice('../data/txt_files/liberal_speeches.txt', save=True)

In [None]:
model_conservative = aligner.train_slice('../data/txt_files/conservative_speeches.txt', save=True)

In [None]:
model_ndp = aligner.train_slice('../data/txt_files/ndp_speeches.txt', save=True)

In [None]:
# Reload the party models from disk, rebinding the names assigned above.
model_liberal = Word2Vec.load('../models/liberal_speeches.model')
model_conservative = Word2Vec.load('../models/conservative_speeches.model')
model_ndp = Word2Vec.load('../models/ndp_speeches.model')

In [None]:
# Query each party's model for the 10 entries most similar to
# 'climate_change'; each cell displays its result in the notebook.
model_liberal.wv.most_similar(positive = 'climate_change', topn = 10)

In [None]:
model_conservative.wv.most_similar(positive = 'climate_change', topn = 10)

In [None]:
model_ndp.wv.most_similar(positive = 'climate_change', topn = 10)

# CONCLUSION
## Key Points 
