In [None]:
import numpy as np

# Use only NumPy’s RNG to avoid any name collisions
# Seeding the legacy global RNG makes the np.random.randint draws used for
# sampling repeatable when the cells are run in order on a fresh kernel.
np.random.seed(42)

# Define your word categories
# Each category is an ordered list of 8 words; the order matters because
# context windows are sliced from neighbouring list positions.
# Note: 'orange' is listed under both 'fruits' and 'colors', so it maps to a
# single vocabulary entry once the words are deduplicated.
word_categories = {
    'animals':    ['cat', 'dog', 'lion', 'tiger', 'elephant', 'giraffe', 'zebra', 'monkey'],
    'fruits':     ['apple', 'banana', 'orange', 'mango', 'grape', 'strawberry', 'kiwi', 'pineapple'],
    'colors':     ['red', 'blue', 'green', 'yellow', 'purple', 'orange', 'pink', 'black'],
    'emotions':   ['happy', 'sad', 'angry', 'excited', 'calm', 'nervous', 'joyful', 'anxious'],
    'weather':    ['sunny', 'rainy', 'cloudy', 'windy', 'stormy', 'foggy', 'snowy', 'clear']
}

def generate_training_data(window_size=2, num_samples=1000, categories=None):
    """
    Generates (context_words, center_word) pairs.

    For every sample a category is drawn at random, then a center position
    that has ``window_size`` neighbours on both sides. The neighbours (left
    ones first, then right ones) form the context.

    Sampling uses NumPy's global RNG (``np.random``); seed it beforehand for
    reproducible output.

    Args:
        window_size: Number of context words taken on each side of the center.
        num_samples: Number of pairs to generate.
        categories: Optional mapping of category name -> ordered list of words.
            Defaults to the module-level ``word_categories``.

    Returns:
        A list of ``(context, center_word)`` tuples where ``context`` is a
        list of ``2 * window_size`` words.

    Raises:
        ValueError: If ``window_size`` is negative, ``categories`` is empty,
            or a category has fewer than ``2 * window_size + 1`` words.
    """
    if categories is None:
        categories = word_categories

    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    category_names = [*categories]
    n_cats = len(category_names)
    if n_cats == 0:
        raise ValueError("categories must contain at least one category")

    # Fail fast with a readable message instead of letting np.random.randint
    # raise "low >= high" only when a too-short category happens to be drawn.
    min_words = 2 * window_size + 1
    for name in category_names:
        if len(categories[name]) < min_words:
            raise ValueError(
                f"window_size={window_size} needs at least {min_words} words per "
                f"category, but '{name}' has {len(categories[name])}"
            )

    data = []
    for _ in range(num_samples):
        # pick a random category index
        cat_idx = np.random.randint(0, n_cats)
        words = categories[category_names[cat_idx]]

        # restrict the center so a full window fits on both sides
        min_i = window_size
        max_i = len(words) - window_size - 1
        center_i = np.random.randint(min_i, max_i + 1)  # upper bound is exclusive

        # slice out the context words
        left = words[center_i - window_size:center_i]
        right = words[center_i + 1:center_i + window_size + 1]

        data.append((left + right, words[center_i]))

    return data

# Sample the (context, center) pairs used by the rest of the notebook
training_data = generate_training_data()

# Preview the first five pairs
print("Some training examples (context → center):")
for ctx, ctr in training_data[:5]:
    print(f"  {ctx} → {ctr}")

# Flatten every pair into one token stream, then deduplicate and sort it
all_words = [
    token
    for context, center in training_data
    for token in [*context, center]
]
vocab = sorted(set(all_words))

# Two-way lookup between words and their position in the sorted vocabulary
index_to_word = dict(enumerate(vocab))
word_to_index = {word: position for position, word in index_to_word.items()}

print("\nVocabulary size:", len(vocab))
print("\nSample mappings:")
# word_to_index was filled in vocab order, so the first five vocab words are
# also its first five items
for w in vocab[:5]:
    idx = word_to_index[w]
    print(f"  {w}: {idx}")


In [None]:
# Inspect both vocabulary lookups
print(word_to_index)
print(index_to_word)

# training_data holds one entry per generated sample, so printing it whole
# floods the cell output; show its size and a short preview instead
print(f"{len(training_data)} training examples; first 5:")
print(training_data[:5])

In [None]:

# Map every vocabulary word to its one-hot vector
word_appearances = {}

# The word's position in `vocab` is the only non-zero entry of its vector.
# The vector is named `one_hot` rather than `list`: rebinding `list` hides the
# builtin for the rest of the kernel session and breaks later `list(...)` calls.
for index, word in enumerate(vocab):
    one_hot = np.zeros(len(vocab))
    one_hot[index] = 1
    word_appearances[word] = one_hot

print(word_appearances)




In [None]:
# A bare `len(training_data)` in the middle of a cell displays nothing, so
# print the size explicitly, followed by a short preview rather than the
# whole list
print(f"Number of training examples: {len(training_data)}")
print(training_data[:5])

def convert_to_onehot(training_data, vocab, word_appearances):
    """
    Encode (context, center) pairs as two column-per-sample matrices.

    Args:
        training_data: Sequence of samples; item 0 of each sample is the
            sequence of context words and item 1 is the center word.
        vocab: Vocabulary; only its length is used, to size the matrices.
        word_appearances: Mapping from word to its vector of length
            ``len(vocab)``.

    Returns:
        Tuple ``(X, y)`` of float arrays shaped ``(len(vocab), len(training_data))``.
        Column ``k`` of ``X`` is the sum of the vectors of sample ``k``'s
        context words (a repeated word is counted each time); column ``k`` of
        ``y`` is the vector of its center word.
    """
    vocab_size = len(vocab)
    sample_count = len(training_data)

    context_matrix = np.zeros((vocab_size, sample_count))
    center_matrix = np.zeros((vocab_size, sample_count))

    for column, sample in enumerate(training_data):
        # accumulate the context words into a single bag-of-words vector
        context_vector = np.zeros(vocab_size)
        for context_word in sample[0]:
            context_vector += word_appearances[context_word]

        center_vector = np.zeros(vocab_size)
        center_vector += word_appearances[sample[1]]

        center_matrix[:, column] = center_vector
        context_matrix[:, column] = context_vector

    return context_matrix, center_matrix







In [7]:
class Word2Vec:
    """
    Two-layer linear network with a softmax output, trained with full-batch
    gradient descent to predict a center word from its context vector.

    All batches use one column per sample:
        X, y        -> (vocab_size, n_samples)
        W  (input)  -> (embedding_dim, vocab_size); column i is word i's embedding
        W_ (output) -> (vocab_size, embedding_dim)
        b, b_       -> (embedding_dim, 1), (vocab_size, 1)
    """

    def __init__(self, vocab_size, embedding_dim):
        """Initialise weights from a standard normal and biases to zero."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W = np.random.randn(embedding_dim, vocab_size)
        self.W_ = np.random.randn(vocab_size, embedding_dim)
        self.b = np.zeros((embedding_dim, 1))
        self.b_ = np.zeros((vocab_size, 1))

    @staticmethod
    def _as_columns(array):
        """Return `array` as a float array, turning a 1-D vector into one column."""
        array = np.asarray(array, dtype=float)
        # A 1-D vector would otherwise broadcast against the (dim, 1) biases
        # and silently produce a square matrix.
        return array.reshape(-1, 1) if array.ndim == 1 else array

    def forward(self, X):
        """
        Run the forward pass and cache the intermediates for `backward`.

        Stores `self.X` (input), `self.Z` (hidden/embedding activations),
        `self.Y` (output logits) and `self.prediction` (softmax of the
        logits, one probability distribution per column).
        """
        self.X = self._as_columns(X)
        self.Z = np.dot(self.W, self.X) + self.b
        self.Y = np.dot(self.W_, self.Z) + self.b_
        self.prediction = self.softmax(self.Y)

    def softmax(self, x):
        """
        Numerically stable softmax along axis 0.

        Every column (sample) is normalised independently so that it sums to
        1; a 1-D input is normalised as a single distribution.
        """
        x = np.asarray(x, dtype=float)
        # Subtracting the per-column max leaves the result unchanged but
        # keeps np.exp from overflowing.
        x_shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(x_shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def cross_entropy_cost(self, y, y_hat):
        """
        Mean cross-entropy between targets `y` and predictions `y_hat`.

        Both are column-per-sample arrays; the sum over all entries is
        divided by the number of columns of `y`.
        """
        y = self._as_columns(y)
        y_hat = self._as_columns(y_hat)
        epsilon = 1e-15
        # Clip so that log(0) can never occur.
        y_hat = np.clip(y_hat, epsilon, 1 - epsilon)
        cost = -np.sum(y * np.log(y_hat)) / y.shape[1]
        return cost

    def backward(self, y, learning_rate):
        """
        Back-propagate from the cached forward pass and take one
        gradient-descent step on W, W_, b and b_.

        Gradients are averaged over the batch and kept on the instance:
        `self.dZ` (gradient w.r.t. the output logits), `self.dW_`,
        `self.db_`, `self.dW`, `self.db`.
        """
        y = self._as_columns(y)

        # Softmax combined with cross-entropy: d(cost)/d(logits) = p - y
        self.dZ = self.prediction - y
        n_samples = self.dZ.shape[1]

        # Output layer
        self.dW_ = np.dot(self.dZ, self.Z.T) / n_samples
        self.db_ = np.sum(self.dZ, axis=1, keepdims=True) / n_samples

        # Hidden (embedding) layer: push the error back through W_
        d_hidden = np.dot(self.W_.T, self.dZ)
        self.dW = np.dot(d_hidden, self.X.T) / n_samples
        self.db = np.sum(d_hidden, axis=1, keepdims=True) / n_samples

        self.W = self.W - self.dW * learning_rate
        self.W_ = self.W_ - self.dW_ * learning_rate
        self.b = self.b - self.db * learning_rate
        self.b_ = self.b_ - self.db_ * learning_rate

    def train(self, X, y, learning_rate, epochs):
        """Run `epochs` full-batch gradient-descent steps, printing the cost before each step."""
        for i in range(epochs):
            self.forward(X)
            cost = self.cross_entropy_cost(y, self.prediction)
            print(f"Cost: {cost}")
            self.backward(y, learning_rate)
            print(f"Epoch {i+1} completed")

    def predict(self, X):
        """Return the softmax predictions for `X`, one column per sample."""
        self.forward(X)
        return self.prediction

    def get_embedding(self, word, vocab):
        """
        Print and return the embedding of `word`.

        The embedding is the column of `W` at the word's position in `vocab`;
        `vocab.index` raises ValueError when the word is not present.
        """
        index = vocab.index(word)
        print(f"Embedding for {word}: {self.W[:,index]}")
        return self.W[:, index]
    







        

In [None]:
from sklearn.model_selection import train_test_split

# Encode every sample as a column: X holds the summed context vectors,
# y the one-hot center words
X, y = convert_to_onehot(training_data, vocab, word_appearances)

# train_test_split splits along the first axis, so it is given the
# transposed (one row per sample) arrays; 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X.T, y.T, test_size=0.2, random_state=42
)

# Back to one column per sample for the model.
# Note: X_test / y_test are left with one row per sample.
X_train, y_train = X_train.T, y_train.T

model = Word2Vec(vocab_size=len(vocab), embedding_dim=3)
model.train(X_train, y_train, learning_rate=0.0001, epochs=1000)

model.get_embedding("angry", vocab)






