# Spacy

## Part of Speech Tagging

In [None]:
# Load the `en_core_web_sm` spaCy model package; `nlp` is the pipeline object
# that the cells below call to parse text.
import en_core_web_sm
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_sm.load()

In [None]:
import pandas as pd

# Parse one sentence and tabulate the per-token annotations produced by `nlp`.
doc = nlp(u"Steve Jobs and Apple is looking at buying U.K. startup for $1 billion")

# One row per token: surface text, lemma, coarse part of speech, fine-grained
# tag, dependency label, orthographic shape, and two boolean flags.
rows = [
    (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
     token.shape_, token.is_alpha, token.is_stop)
    for token in doc
]

# The seventh column holds `token.is_alpha` (alphabetic characters only), so it
# is labelled "is_alphabetic" rather than the misleading "is_alphanumeric".
data = pd.DataFrame(rows, columns=["text", "lemma", "part_of_speech", "tag", "dependency", "shape", "is_alphabetic", "is_stopword"])
data.head()

### Named Entity Recognition

In [None]:
# Imports and the model load come first so this cell also runs on a fresh
# kernel; previously `nlp` was called before this cell had defined it.
import en_core_web_sm
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_sm.load()

doc = nlp(u"Steve Jobs and Apple is looking at buying U.K. startup for $1 billion")

# Print each named entity with its start/end character offsets and its label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# Visualize the entities found in `doc` with displaCy: style="ent" selects the
# entity view and jupyter=True asks for output rendered inside the notebook.
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

# Word Embeddings (word2vec Introduction)

## Continuous Bag of Words (Use Context to Predict Target Word)
![alt text](images/word2vec_cbow.png "Logo Title Text 1")

## Softmax
![alt text](images/softmax.png "Logo Title Text 1")

## Skipgram
![alt text](images/skipgram.png "Logo Title Text 1")

## Word Embedding Clusters
![alt text](images/wordembedding_cluster.png "Logo Title Text 1")

In [None]:
# Reload the spaCy model and import scipy's cosine distance, both used by the
# word-vector comparison in the next cell.
# NOTE(review): spaCy's "sm" models reportedly ship without static word
# vectors - confirm whether an "md"/"lg" model is needed for meaningful scores.
import en_core_web_sm
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_sm.load()

In [None]:
# Compare every ordered pair of distinct tokens by the cosine similarity of
# their vectors.
tokens = nlp(u'dog cat Beijing sad depressed couch sofa canine China Chinese France Paris banana')

for left in tokens:
    for right in tokens:
        if left == right:
            continue  # skip a token paired with itself
        # scipy's `cosine` is a distance, so similarity = 1 - distance
        similarity = 1 - cosine(left.vector, right.vector)
        print(f" {left} - {right}: {similarity}")

# Finding Most Similar Words (Using Our Old Methods)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate with no arguments so the notebook echoes the estimator, letting
# us inspect the default settings for CountVectorizer.
CountVectorizer()

In [None]:
# Read the review file line by line. The context manager closes the file
# handle; the previous bare open(...).readlines() left it open.
with open("poor_amazon_toy_reviews.txt") as review_file:
    reviews = review_file.readlines()

# Bag-of-words counts over single words only (ngram_range=(1, 1)), with English
# stop words removed, the vocabulary capped at 500 features, and tokens limited
# to runs of two or more ASCII letters.
vectorizer = CountVectorizer(ngram_range=(1, 1),
                             stop_words="english",
                             max_features=500,token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b')
X = vectorizer.fit_transform(reviews)

# Densify the sparse count matrix: one row per input line, one column per
# vocabulary word.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the installed version.
data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
data.head()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Build the word-by-word similarity matrix: transposing `data` turns each row
# into one word's count vector, and cosine_similarity compares every pair of rows.
vocabulary = vectorizer.get_feature_names()
word_vectors = data.T.values
similarity_matrix = pd.DataFrame(
    cosine_similarity(word_vectors),
    index=vocabulary,
    columns=vocabulary,
)

In [None]:
# Reshape the square matrix into a long table: clear both axis names, stack the
# columns into the index, then reset_index so the two word labels and the value
# become ordinary columns.
similarity_table = similarity_matrix.rename_axis(None).rename_axis(None, axis=1).stack().reset_index()

In [None]:
# Name the three columns of the long table; the trailing expression displays
# its (rows, columns) shape.
similarity_table.columns = ["word1", "word2", "similarity"]
similarity_table.shape

In [None]:
# Keep only rows whose similarity is below 0.99, then show the new shape.
# NOTE(review): presumably meant to drop each word's comparison with itself;
# the threshold also removes distinct word pairs scoring >= 0.99 - confirm
# that is intended (a word1 != word2 filter would be stricter about intent).
similarity_table = similarity_table[similarity_table["similarity"] < 0.99]
similarity_table.shape

In [None]:
# Show the 10 highest-scoring rows. drop_duplicates keeps only the first row
# for each distinct similarity value.
# NOTE(review): presumably this collapses the mirrored (a, b) / (b, a) pairs;
# it would also hide unrelated pairs that happen to share a score - confirm.
similarity_table.sort_values(by="similarity", ascending=False).drop_duplicates(
    subset="similarity", keep="first").head(10)

In [None]:
# Keep the vectorizer's vocabulary (capped by max_features=500 above) for the
# word-embedding exercise that follows.
top_500_words = vectorizer.get_feature_names()

# Exercise: Similar Words Using Word Embeddings

In [None]:
# Join the top 500 words into one space-separated string and parse it with
# spaCy so the words can be compared as tokens.

tokens = nlp(" ".join(top_500_words))

In [None]:
from itertools import product

# Score every ordered pair of tokens (self-pairs included) with spaCy's
# token-level similarity, one (word1, word2, score) tuple per pair.
similarity_tuples = [
    (first, second, first.similarity(second))
    for first, second in product(tokens, repeat=2)
]

similarities = pd.DataFrame(similarity_tuples, columns=["word1","word2", "score"])


In [None]:
# Find similar words: keep pairs scoring below 1, sort from most to least
# similar, keep one row per distinct score, and show the top 5.
# NOTE(review): `score < 1` presumably removes self-pairs; floating-point
# self-similarity may not be exactly 1.0, so confirm no self-pairs slip through.
similarities[similarities["score"] < 1].sort_values(
    by="score", ascending=False).drop_duplicates(
    subset="score", keep="first").head(5)

# Finding Most Similar Sentences

In [None]:
# get vectors for each review





# Distributions

In [None]:
%matplotlib inline
import seaborn as sns
import numpy as np





## Optimization Techniques

### Subsampling

What do we do with highly frequent words like `the` or `of`? We don't gain a ton of meaning from training on these words, and they become computationally expensive since they appear so frequently:

![alt text](images/subsampling.png "http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/")
In the image above, $z(w_i)$ is the frequency of that particular word divided by the total number of words in the entire corpus. For instance, if a corpus of text has 50 words, and the word `dog` appears 3 times, $z(w_{dog}) = 0.06$.

In [None]:
import numpy as np
# write subsampling function



# plot this function:
import matplotlib.pyplot as plt





## Negative Sampling

If you have 50,000 words in your vocabulary, you need to make 2 x 50,000 updates to your model for each context word for each target word! This is an incredibly costly calculation. For the most part, we don't need to make frequent updates to the majority of the corpus. For instance, if our context word is `midterm` and our target word is `study`, do we really need to spend CPU time computing the gradients for `elephant`?

In practice, we will only sample 4-5 negative samples (where the target output is 0).

## Limitations of Word Embeddings

#### How to handle **Out Of Vocabulary (OOV)** words?
Although **word2vec** and **FastText** include a significant vocabulary size, there will inevitably be words that are not included. For instance, if you are analyzing text conversations using word embeddings pretrained on Wikipedia text (which typically has more formal vocabulary than everyday language), how will you account for the following words?

- DM
- ROFLMAO
- bae
- 😃
- #10YearChallenge
- wut

#### Potential solution: use word embeddings if they are available, and otherwise initialize the weights to random.

```python
import numpy as np
def vectorize_word(input_word: str, D: int = 50):
    """
    Return the embedding vector for input_word, or a random vector if the word is unknown.

    input_word: the word to look up.
    D: an integer that represents the length (dimensionality) of the word embeddings. It is
    only used to size the random fallback vector.
    word_embeddings: NOT a parameter. It is a dictionary that must already exist at module
    level, with the string word as the key and the embedding vector of length D as the value.
    For instance, word_embeddings["cat"] will return [2.3, 4.5, 6.1, -2.2, ...]

    Returns the stored vector for a known word. For an out-of-vocabulary word it returns a
    freshly drawn NumPy array of length D with values sampled uniformly from [0, 1), so
    repeated calls for the same unknown word give different vectors.
    """
    # Test membership on the dict itself; going through .keys() adds nothing.
    if input_word in word_embeddings:
        return word_embeddings[input_word]
    return np.random.rand(D)
```

##### Should we update the word embedding matrices during the model training step?
- Ideally, you'd only want to update the specific weights that were randomly initialized (since the rest of the weights are by definition pre-trained and are already pretty good). However, most deep learning libraries do not allow you to easily select which specific weight elements to apply backpropagation to: you either update all weights or you update none. In practice, most data scientists will "freeze" the word embedding layer:

In Keras:
```python
word_embedding_layer.trainable = False # by default, trainable is set to true in Keras
```
In Tensorflow:
```python
import tensorflow as tf
N = 300 # number of words
D = 50 # number of dimensions in each embedding
# Placeholder values standing in for real pretrained embeddings.
# NOTE(review): only 8 values are supplied for an [N, D] = [300, 50] tensor -
# confirm the TensorFlow version in use accepts this; real code would pass a
# full N x D array.
initial_word_embeddings = [0, 1, 2, 3, 4, 5, 6, 7]
# Stored as a tf.constant - presumably so the embeddings stay fixed during training.
tensor = tf.constant(initial_word_embeddings, shape=[N, D])
```

- Ambiguity around **Domain-specific words**: using a generic pre-trained word embedding will not capture the semantic meaning of the word **sack** when it is used in the context of American football:
![sack](images/football-bag-sack-diff.png)

# Gensim

In [None]:
# from https://radimrehurek.com/gensim/models/word2vec.html






In [None]:
from gensim.models import Word2Vec







In [None]:
# Load the entire set of Google News word embedding vectors from a local file
# stored in the binary word2vec format (hence binary=True).
from gensim.models import KeyedVectors
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# word analogies




In [None]:
# get the most similar words for a target word





# FastText

### When to use?

- Traditionally, each individual word is trained as its own, independent word embedding.
- In many languages (including English), many words are morphologically derived from one another.
- use case when your corpus contains high-value, morphologically diverse, rare words (`photosynthesis`, `transcendentalism`)

In [None]:
import fasttext
# Train a skip-gram model on the Shakespeare text file; the second argument is
# presumably the output name for the saved model - confirm.
# NOTE(review): `fasttext.skipgram` looks like the older Python binding API;
# newer releases appear to use fasttext.train_unsupervised(..., model="skipgram").
model = fasttext.skipgram('complete-shakespeare.txt', 'model')

In [None]:
# Compare the trained vectors for two words.
# NOTE(review): `cosine_similarity` looks like an older fasttext binding
# method - confirm it exists in the installed version.
model.cosine_similarity("woman", "man")

### FastText Hyperparameters (From [Tutorial Notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb))
- **model**: Training architecture. Allowed values: `cbow`, `skipgram` (Default `cbow`)
- **size**: Size of embeddings to be learnt (Default 100)
- **alpha**: Initial learning rate (Default 0.025)
- **window**: Context window size (Default 5)
- **min_count**: Ignore words with number of occurrences below this (Default 5)
- **loss**: Training objective. Allowed values: `ns`, `hs`, `softmax` (Default `ns`)
- **sample**: Threshold for downsampling higher-frequency words (Default 0.001)
- **negative**: Number of negative words to sample, for `ns` (Default 5)
- **iter**: Number of epochs (Default 5)
- **sorted_vocab**: Sort vocab by descending frequency (Default 1)
- **threads**: Number of threads to use (Default 12)

Hyperparameters unique to `fasttext`:
- **min_n**: min length of char ngrams (Default 3)
- **max_n**: max length of char ngrams (Default 6)
- **bucket**: number of buckets used for hashing ngrams (Default 2000000)

In [None]:
from gensim.models import FastText

In [None]:
import pandas as pd
from nltk import word_tokenize

# Pull the "text" column of the BBC CSV into a plain Python list of strings.
articles = pd.read_csv("bbc-text.csv")
text = list(articles["text"].values)

# Tokenize every story into its own list of word tokens.
new_text = list(map(word_tokenize, text))

In [None]:
# Configure FastText: 200-dimensional embeddings, a context window of 4, and
# words occurring fewer than 2 times are ignored.
model = FastText(size=200, window=4, min_count=2)  # change the size of the windows
model.build_vocab(sentences=new_text)
# Train on the same tokenized stories the vocabulary was built from; the
# previous `common_texts` was never defined in this notebook.
model.train(sentences=new_text, total_examples=len(new_text), epochs=10)

In [None]:
# get corpus total count
model.corpus_count

In [None]:
# get word vector for dog
model.wv["dog"]

In [None]:
# get length of word embeddings
# Index through `model.wv`, matching the "dog" lookup above; indexing the model
# object directly is deprecated in gensim.
len(model.wv["king"])

In [None]:
# Show the nearest neighbours of each query word. Results are printed because a
# notebook cell only displays its last expression, so the first two lookups
# were previously discarded. Lookups go through `model.wv`, where gensim keeps
# the trained vectors; "transc" is a word fragment used to exercise FastText's
# subword (character n-gram) vectors.
for query in ("france", "dog", "transc"):
    print(query, model.wv.most_similar(query))