In [None]:
import re
from nltk.corpus import stopwords
import numpy as np

def get_file_data(stop_word_removal='no',
                  file_path='Dataset to compute Word Embeddings (Tiny).txt'):
    """Read the dataset file and return its words as one space-separated string.

    Punctuation is dropped: only runs of word characters (regex ``\\b\\w+\\b``)
    are kept, and the original casing of every kept word is preserved.

    Args:
        stop_word_removal: pass ``'yes'`` to drop English stop words; any other
            value keeps every word.
        file_path: location of the text file to read. Defaults to the dataset
            file name this notebook was written for.

    Returns:
        str: the extracted words joined by single spaces.
    """
    # Read explicitly as UTF-8: the corpus contains non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(file_path, encoding='utf-8') as f:
        file_contents = f.read()

    # Word extraction
    words = re.findall(r'\b\w+\b', file_contents)

    # Optionally remove stop words
    if stop_word_removal == 'yes':
        stop_words = set(stopwords.words('english'))  # Assuming English text
        # The NLTK list is lowercase, so compare case-insensitively; otherwise
        # capitalised stop words such as "I" or "The" would never be removed.
        words = [word for word in words if word.lower() not in stop_words]

    # Joining words into a string again
    return ' '.join(words)


'RAte the Black Coffee chronicle three out of 6 stars She me movie times at Mann Theatres Please book a brasserie restaurant for eight in Ireland Find the novel WWE Legends of WrestleMania I rate American History A Survey a 5 Play Circus Farm by Deana Carter Play Pandora on Last Fm play Party Ben on Slacker Play some music from the last album of 1988 on Lastfm What is the forecast for Dec 1st 2036 in Keeneland What is the weather going to be like on st patrick s day Tell me when it will be warmer in Woods Hole Oklahoma add this tune to my fresh finds hiptronix playlist What are the Mann Theatres showtimes for Secret Sunshine Add the avispa track to my Bass Gaming playlist book a spot for krista yolanda and I in New Mexico Is Babar King of the Elephants playing Is it possible to see Tube at the closest movie theatre Will there be rainfall at one PM in Catahoula Add david cole to an instrumental sunday I feel like this essay deserves four stars Where is Nichiren to Mōko Daishūrai playing

In [None]:
def generate_dictionary_data(text):
    """Tokenise ``text`` on whitespace and build the vocabulary lookup tables.

    Args:
        text: whitespace-separated words.

    Returns:
        tuple: ``(word_to_index, index_to_word, corpus, vocab_size, corpus_length)``
        where ``corpus`` is the list of tokens in reading order, the two dicts
        map each distinct word to an integer id and back, ``vocab_size`` is the
        number of distinct words and ``corpus_length`` is ``len(corpus)``.
    """
    corpus = text.split()

    # Number words in order of first appearance. Iterating a set of strings
    # would give a different numbering on every interpreter run (hash
    # randomisation), making the resulting embedding rows irreproducible.
    vocabulary = dict.fromkeys(corpus)
    word_to_index = {word: index for index, word in enumerate(vocabulary)}
    index_to_word = {index: word for word, index in word_to_index.items()}

    return word_to_index, index_to_word, corpus, len(word_to_index), len(corpus)

 

In [None]:
def get_one_hot_vectors(target_word, context_words, vocab_size, word_to_index):
    """Encode a target word and its context words as indicator vectors.

    Args:
        target_word: word whose position is marked in the first vector.
        context_words: iterable of words marked in the second vector.
        vocab_size: length of both returned vectors.
        word_to_index: mapping from word to its position in the vectors.

    Returns:
        tuple: ``(target_word_vector, context_word_vector)``, two float arrays
        of length ``vocab_size``. The first has a single 1; the second has a 1
        at the position of every context word (repeated words still yield 1).

    Raises:
        KeyError: if any word is missing from ``word_to_index``.
    """
    # One-hot vector for the target word
    target_word_vector = np.zeros(vocab_size)
    target_position = word_to_index[target_word]
    target_word_vector[target_position] = 1

    # Multi-hot vector for the context words
    context_word_vector = np.zeros(vocab_size)
    context_positions = [word_to_index[neighbour] for neighbour in context_words]
    if context_positions:
        context_word_vector[context_positions] = 1

    return target_word_vector, context_word_vector

# Example arguments for get_one_hot_vectors: a centre word and four
# surrounding words.
context_words = "Black Coffee three out".split()
target_word = "chronicle"




In [None]:
def generate_training_data(corpus, window_size, vocab_size, word_to_index, length_of_corpus, sample=None):
    """Build (target, context) training pairs by sliding a window over ``corpus``.

    For every position ``i`` the context is made of up to ``window_size`` words
    before the target (nearest first) followed by up to ``window_size`` words
    after it (nearest first). The window is clipped at both ends of the corpus,
    so short corpora and large windows neither raise ``IndexError`` nor wrap
    around through negative indices.

    Args:
        corpus: list of tokens in reading order.
        window_size: maximum number of neighbours taken on each side.
        vocab_size: length of the encoded vectors.
        word_to_index: mapping from word to vector position.
        length_of_corpus: kept for backward compatibility; the bounds are
            derived from ``len(corpus)`` so the two can never disagree.
        sample: when not ``None``, the raw ``[target_word, context_words]``
            pairs are also collected.

    Returns:
        tuple: ``(training_data, training_sample_words)``. ``training_data`` is
        a list of ``[target_vector, context_vector]`` pairs;
        ``training_sample_words`` is a list of ``[target_word, context_words]``
        pairs and stays empty when ``sample`` is ``None``.
    """
    training_data = []
    training_sample_words = []

    for i, target_word in enumerate(corpus):
        # Neighbours before the target, reversed so the nearest comes first
        words_before = list(corpus[max(i - window_size, 0):i])[::-1]
        # Neighbours after the target; slicing clips at the end of the corpus
        words_after = list(corpus[i + 1:i + 1 + window_size])
        context_words = words_before + words_after

        trgt_word_vector, ctxt_word_vector = get_one_hot_vectors(
            target_word, context_words, vocab_size, word_to_index
        )

        # Skip pairs that carry no signal (e.g. a one-word corpus has no context)
        if np.sum(trgt_word_vector) > 0 and np.sum(ctxt_word_vector) > 0:
            training_data.append([trgt_word_vector, ctxt_word_vector])

        if sample is not None:
            training_sample_words.append([target_word, context_words])

    return training_data, training_sample_words

    

In [None]:
def forward_prop(weight_inp_hidden, weight_hidden_output, target_word_vector):
    """Run one forward pass of the two-layer network.

    Args:
        weight_inp_hidden: input-to-hidden weight matrix; its transpose is
            multiplied with ``target_word_vector``.
        weight_hidden_output: hidden-to-output weight matrix; its transpose is
            multiplied with the hidden activations.
        target_word_vector: encoded input word.

    Returns:
        tuple: ``(y_pred, hidden_layer, output_layer)`` - the softmax of the
        output scores, the hidden activations and the raw output scores.
    """
    # Project the input onto the hidden layer, then onto the output scores
    hidden_activations = weight_inp_hidden.T.dot(target_word_vector)
    output_scores = weight_hidden_output.T.dot(hidden_activations)

    probabilities = softmax(output_scores)
    return probabilities, hidden_activations, output_scores


def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so large
    scores do not overflow.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser


def calculate_error(y_pred, context_words):
    """Return the gradient of the training loss with respect to the output scores.

    The loss for one target word with ``C`` marked context words is
    ``-sum(u[context]) + C * log(sum(exp(u)))``; its derivative with respect to
    the scores ``u`` is ``C * softmax(u) - context``. Using the plain difference
    ``y_pred - context`` is only correct when exactly one context word is
    marked, so the prediction is scaled by the number of marked words.

    Args:
        y_pred: softmax probabilities over the vocabulary.
        context_words: indicator vector with a 1 at every context word.

    Returns:
        numpy.ndarray: the error vector, same length as ``y_pred``.
    """
    context = np.asarray(context_words)
    # Count the marked entries the same way the loss does (entries equal to 1)
    num_context_words = np.count_nonzero(context == 1)
    return num_context_words * np.asarray(y_pred) - context

def backward_prop(weight_inp_hidden, weight_hidden_output, error, hidden_layer, target_word_vector, learning_rate):
    """Apply one gradient-descent step to both weight matrices.

    Args:
        weight_inp_hidden: input-to-hidden weights, one row per input entry.
        weight_hidden_output: hidden-to-output weights, one column per output
            entry.
        error: 1-D error vector over the output entries.
        hidden_layer: 1-D hidden activations from the forward pass.
        target_word_vector: 1-D encoded input word used in the forward pass.
        learning_rate: step size of the update.

    Returns:
        tuple: ``(weight_inp_hidden, weight_hidden_output)`` after the update.
        The input arrays are not modified in place.
    """
    # Both gradients are computed from the weights as they were before the update.
    dl_weight_inp_hidden = np.outer(target_word_vector, np.dot(weight_hidden_output, error))
    # The gradient must have the same shape as weight_hidden_output, i.e. the
    # outer product of the hidden activations and the error. A dot product of
    # the two vectors fails when their lengths differ and otherwise collapses
    # to a scalar.
    dl_weight_hidden_output = np.outer(hidden_layer, error)

    # Update weights
    weight_inp_hidden = weight_inp_hidden - (learning_rate * dl_weight_inp_hidden)
    weight_hidden_output = weight_hidden_output - (learning_rate * dl_weight_hidden_output)

    return weight_inp_hidden, weight_hidden_output


In [None]:


def calculate_loss(u, ctx):
    """Negative log likelihood of the marked context words given scores ``u``.

    Computes ``-sum(u[ctx == 1]) + C * log(sum(exp(u)))`` where ``C`` is the
    number of entries of ``ctx`` equal to 1. The log-sum-exp term is evaluated
    after subtracting ``max(u)`` so large scores do not overflow to ``inf``.

    Args:
        u: 1-D array of raw output scores.
        ctx: indicator vector with a 1 at every context word.

    Returns:
        float: the loss for this training pair.
    """
    u = np.asarray(u, dtype=float)
    is_context = np.asarray(ctx) == 1

    # log(sum(exp(u))) == m + log(sum(exp(u - m))) for any m; m = max(u) keeps
    # every exponent <= 0.
    max_score = np.max(u)
    log_sum_exp = max_score + np.log(np.sum(np.exp(u - max_score)))

    sum_1 = -1 * np.sum(u[is_context])
    sum_2 = np.count_nonzero(is_context) * log_sum_exp
    return sum_1 + sum_2

def initialize_weights(input_layer_size, hidden_layer_size):
    """Return a random weight matrix of shape ``(input_layer_size, hidden_layer_size)``.

    Values are drawn with ``np.random.rand``.
    """
    return np.random.rand(input_layer_size, hidden_layer_size)


def main(window_size=2, hidden_layer_size=50, epochs=50, learning_rate=0.01):
    """Load the dataset, train the embedding network and return the result.

    Args:
        window_size: number of neighbours taken on each side of a target word.
        hidden_layer_size: width of the hidden layer (the embedding size).
        epochs: number of passes over the training pairs.
        learning_rate: gradient-descent step size.

    Returns:
        tuple: ``(word_to_index, index_to_word, weight_inp_hidden,
        weight_hidden_output)`` - the vocabulary lookup tables and the trained
        weight matrices. ``weight_inp_hidden`` has one row per vocabulary word.
    """
    text = get_file_data()
    word_to_index, index_to_word, corpus, vocab_size, length_of_corpus = generate_dictionary_data(text)

    # sample=None: the human-readable (target, context) word pairs are not collected
    training_data, _ = generate_training_data(
        corpus, window_size, vocab_size, word_to_index, length_of_corpus, sample=None
    )

    # Initialize the weights
    weight_inp_hidden = initialize_weights(vocab_size, hidden_layer_size)
    weight_hidden_output = initialize_weights(hidden_layer_size, vocab_size)

    for epoch in range(epochs):
        # Track the loss rather than the sum of the signed error entries: the
        # latter does not measure how well the network fits the data.
        total_loss = 0.0
        for target, context in training_data:
            # Forward propagation
            y_pred, h, u = forward_prop(weight_inp_hidden, weight_hidden_output, target)

            # Calculate error
            EI = calculate_error(y_pred, context)
            total_loss += calculate_loss(u, context)

            # Backward propagation - arguments follow backward_prop's signature:
            # (weights, weights, error, hidden activations, input, step size)
            weight_inp_hidden, weight_hidden_output = backward_prop(
                weight_inp_hidden, weight_hidden_output, EI, h, target, learning_rate
            )

        print(f"Epoch: {epoch+1}, Loss: {total_loss}")

    return word_to_index, index_to_word, weight_inp_hidden, weight_hidden_output


if __name__ == "__main__":
    # Bind the results at module level so that later notebook cells (e.g. the
    # embedding visualisation) can use the vocabulary and the trained weights.
    word_to_index, index_to_word, weight_inp_hidden, weight_hidden_output = main()



ValueError: ignored

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def visualize_embeddings(word_to_index, embeddings):
    """Project the word vectors to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        word_to_index: mapping from word to its row in ``embeddings``.
        embeddings: indexable collection of word vectors, one per word id.

    Raises:
        ValueError: if fewer than two words are given, since t-SNE needs at
            least two samples.
    """
    words = list(word_to_index.keys())
    if len(words) < 2:
        raise ValueError("At least two words are required to compute a t-SNE projection")

    # Stack the rows into a single 2-D array for the estimator
    vectors = np.asarray([embeddings[word_to_index[word]] for word in words])

    # t-SNE requires perplexity < number of samples; the library default of 30
    # therefore fails on small vocabularies, so clamp it.
    perplexity = min(30.0, len(words) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    vectors_2d = tsne.fit_transform(vectors)

    fig, ax = plt.subplots(figsize=(10, 10))
    # One scatter call for all points instead of one call per word
    ax.scatter(vectors_2d[:, 0], vectors_2d[:, 1])
    for (x_coord, y_coord), word in zip(vectors_2d, words):
        ax.annotate(word, (x_coord, y_coord))
    ax.set_title("t-SNE projection of word embeddings")
    ax.set_xlabel("t-SNE dimension 1")
    ax.set_ylabel("t-SNE dimension 2")
    plt.show()


# Plot the input-to-hidden weights as word embeddings.
# NOTE: `word_to_index` and `weight_inp_hidden` must already be defined at
# notebook (module) scope when this cell runs; otherwise it raises NameError.
visualize_embeddings(word_to_index, weight_inp_hidden)


NameError: ignored