In [29]:
import numpy as np
import pandas as pd
import string
from tqdm import tqdm
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

## Read Data


In [30]:
# Load the stop-word list as a flat list of whitespace-separated tokens.
# The encoding is given explicitly so the result does not depend on the
# platform default (the training text is opened as UTF-8 as well).
with open('stopwords.txt', encoding='utf-8') as f:
    # split() with no argument splits on any whitespace, newlines included
    stopwords = f.read().split()

In [31]:
# Read the raw corpus; all cleaning happens after the file has been closed.
with open('training_text.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Join lines with a space (not the empty string) so the last word of one line
# is not fused with the first word of the next line.
raw_text = raw_text.replace('\n', ' ')
print(raw_text)

# Delete ASCII punctuation, digits and curly quotes in a single pass, then
# lower-case and split on whitespace to obtain the token list used downstream.
chars_to_drop = string.punctuation + string.digits + '”“’'
text = raw_text.translate(str.maketrans('', '', chars_to_drop)).lower().split()
print(text)

Today we will be learning about the fundamentals of data science and statistics. Data Science and statistics are hot and growing fields with alternative names of machine learning, artificial intelligence, big data, etc. I'm really excited to talk to you about data science and statistics because data science and statistics have long been a passions of mine. I didn't used to be very good at data science and statistics but after studying data science and statistics for a long time, I got better and better at it until I became a data science and statistics expert. I'm really excited to talk to you about data science and statistics, thanks for listening to me talk about data science and statistics.
['today', 'we', 'will', 'be', 'learning', 'about', 'the', 'fundamentals', 'of', 'data', 'science', 'and', 'statistics', 'data', 'science', 'and', 'statistics', 'are', 'hot', 'and', 'growing', 'fields', 'with', 'alternative', 'names', 'of', 'machine', 'learning', 'artificial', 'intelligence', 'big

## Prepare our Training Data


In [32]:
WINDOW_SIZE = 3
NUM_NEGATIVE_SAMPLES = 3

# A context window spans 2*WINDOW_SIZE - 1 tokens: the center word plus
# WINDOW_SIZE - 1 neighbours on each side.
window_radius = WINDOW_SIZE - 1

# Only positions with a full window on both sides are used as centers.
# The upper bound is computed with len() rather than a negative slice so the
# last valid position is included and a radius of 0 does not yield an empty slice.
center_positions = range(window_radius, len(text) - window_radius)
center_vocab = [text[pos] for pos in center_positions]

# Each row is [center_word, other_word, label]: label 1 for a real context
# pair, label 0 for a randomly drawn negative pair.
# NOTE: np.random is not seeded in this cell, so the negatives drawn differ
# between runs unless a seed was set earlier in the session.
data = []
for pos in center_positions:
    center_word = text[pos]

    # neighbours inside the window, dropping every occurrence of the center word
    context_words = [
        context_word
        for context_word in text[pos - window_radius:pos + window_radius + 1]
        if context_word != center_word
    ]

    # words NOT in the current context are the candidates for negative examples;
    # the pool depends only on the center position, so it is built once here
    negative_pool = [
        w for w in center_vocab
        if w != center_word and w not in context_words
    ]

    for context_word in context_words:
        # positive training row
        data.append([center_word, context_word, 1])

        # np.random.choice raises on an empty pool (possible for tiny corpora)
        if not negative_pool:
            continue

        negative_samples = np.random.choice(negative_pool, NUM_NEGATIVE_SAMPLES)
        for negative_samp in negative_samples:
            # negative training row
            data.append([center_word, negative_samp, 0])

In [33]:
# Assemble the training rows into a frame with one column per field.
df = pd.DataFrame(data, columns=['center_word', 'context_word', 'label'])

# Vocabulary: words that occur both as a center word and as a context word.
words = np.intersect1d(df['context_word'], df['center_word'])

# Keep only the rows whose two words both belong to that vocabulary,
# then renumber the index from zero.
df = df.loc[
    lambda frame: frame['center_word'].isin(words) & frame['context_word'].isin(words)
].reset_index(drop=True)

In [None]:
# Show the filtered training pairs (columns: center_word, context_word, label).
df

In [36]:
def sigmoid(v, scale=1):
    """Logistic function 1 / (1 + exp(-scale * v)), evaluated without overflow.

    Parameters
    ----------
    v : scalar or array-like
        Input value(s); the function is applied element-wise.
    scale : scalar, optional
        Multiplier applied to ``v`` before the logistic is taken (default 1).

    Returns
    -------
    Same shape as ``v``
        Values in [0, 1].
    """
    # sigmoid(x) = exp(-log(1 + exp(-x))) = exp(-logaddexp(0, -x)).
    # logaddexp never forms exp(-x) directly, so large negative x does not
    # overflow to inf (and emit a RuntimeWarning) the way 1/(1+np.exp(-x)) does.
    return np.exp(-np.logaddexp(0, -scale * v))

In [None]:
def updateEmbedding(df, main_embeddings, context_embeddings, learning_rate, debug=False):
    # get differences between main_embeddings and context_embeddings
    main_embeddings_center = main_embeddings.loc(df.center_word).values
    context_embeddings_context = context_embeddings.loc(df.context_word).values
    diffs = context_embeddings_context - main_embeddings_center

    # get the similarity, scores and the errors between main_embeddings_center and context_embeddings_center
    dot_prods = np.sum(main_embeddings_center * context_embeddings_context,axis = 1)
    scores = sigmoid(dot_prods )
    errors = (df.label - scores).values.reshape(-1,1)
    
    # Calculate the updates
    updates = diffs*errors*learning_rate
    updates_df = pd.DataFrame(data = updates)
    