# **Model**

Import Packages

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset

import tensorflow as tf

import gensim.downloader as api

import pandas as pd

import numpy as np
from word_embeddings import *

Load in the Data

In [13]:
data = pd.read_csv("trainData2.csv")

# ***Model Creation***

In [14]:
# Step 2: Group the data by Game ID
grouped_data = data.groupby('Game ID')

# Step 3: Extract the 16 words for each game (4 groups, each containing 4 words)
words_per_game = []
for game_id, group in grouped_data:
    # Extract the 16 words in each game, assuming there are exactly 16 words per game
    words = group[['word1', 'word2', 'word3', 'word4']].values.flatten()  # Flatten the 4 columns
    if len(words) == 16:  # Ensure the game has exactly 16 words
        # Split into 4 groups, each containing 4 words
        grouped_words = [words[i:i + 4] for i in range(0, len(words), 4)]
        words_per_game.append(grouped_words)
    else:
        pass
print(words_per_game)

[[array(['PUMP', 'LOAFER', 'BOOT', 'SNEAKER'], dtype=object), array(['SEA', 'WHY', 'ARE', 'QUEUE'], dtype=object), array(['TIME', 'US', 'PEOPLE', 'ESSENCE'], dtype=object), array(['FOOT', 'LEAGUE', 'YARD', 'MILE'], dtype=object)], [array(['LAB', 'POM', 'PIT', 'PEKE'], dtype=object), array(['MOUTH', 'NOSE', 'EYE', 'CHEEK'], dtype=object), array(['AMIGO', 'STOOGE', 'KING', 'TENOR'], dtype=object), array(['WOLF', 'CHOW', 'SCARF', 'GOBBLE'], dtype=object)], [array(['DUST', 'MOP', 'SWEEP', 'VACUUM'], dtype=object), array(['CATS', 'CAROUSEL', 'CHICAGO', 'CABARET'], dtype=object), array(['PUMA', 'NIKE', 'REEBOK', 'ADIDAS'], dtype=object), array(['SPIDER', 'IRON', 'SUPER', 'BAT'], dtype=object)], [array(['MUSTARD', 'PLUM', 'GREEN', 'SCARLET'], dtype=object), array(['TARTAR', 'RELISH', 'KETCHUP', 'MAYO'], dtype=object), array(['PRIME', 'PEACOCK', 'HULU', 'NETFLIX'], dtype=object), array(['BLUE', 'GLUM', 'DOWN', 'LOW'], dtype=object)], [array(['CUB', 'JOEY', 'CALF', 'KID'], dtype=object), array(

Convert each word in each group in each game to a vector and flatten to make into 4800 dimensional numpy array and make one of the 4800 length vectors for each game.

In [15]:
model = api.load("glove-wiki-gigaword-300")

In [16]:
import re

# Check if word is valid (alphabetical characters only)
def is_valid_word(word):
    if isinstance(word, bytes):
        word = word.decode('utf-8', errors='ignore')  # Decode bytes to string, ignore errors
    if isinstance(word, str):
        return bool(re.match(r'^[A-Za-z]+$', word))  # Only letters
    return False  # If the word isn't a string, it's invalid

def get_word_vector(word, model):
    # Dictionary to hold the word vector
    word_vector = np.zeros(300)  # Default to zero vector
    
    # Check if the word is NaN or invalid
    if isinstance(word, float) and np.isnan(word):
        return word_vector  # Return zero vector for NaN
    
    # Validate the word (alphabetical characters only)
    if not is_valid_word(word):
        print(f"Invalid word '{word}' skipped.")
        return word_vector  # Return zero vector for invalid word
    
    # Try to get the word vector from the model
    try:
        # Assuming the model is a word-to-vector model (e.g., GloVe)
        word_vector = model[word.lower()]  # Convert to lowercase to match model's case
    except KeyError:
        print(f"Word '{word}' not found in the model.")
    
    return word_vector


In [17]:
finalVectors = []
trainingVectors = []
targetVectors = []
words = []  # List to store words for each word vector

for game in words_per_game:
    gameVec = []
    flattened_game = []
    target_game = []  # Initialize a list to hold the target vectors for this game
    game_words = []  # Initialize a list to hold the words for this game

    for i, group in enumerate(game):
        groupVec = np.zeros((4, 300))  # Initialize a group with 4 empty vectors (shape 4x300)
        group_target = i  # Assign the group number directly (0 to 3 based on the index)

        # For each word in the group, use the same target
        for word in group:
            if is_valid_word(word):
                word_vector = get_word_vector(word, model)
                word_vector_flat = word_vector.flatten()  # Flatten the word vector

                # Assign the word vector to the selected index (same for all words in the group)
                groupVec[i] = word_vector_flat

                # Add the word to the word list for this group
                game_words.append(word)  # Append word to the list of words

        gameVec.append(groupVec)
        flattened_game.extend(groupVec.flatten())  # Flatten groupVec and append to flattened_game

        # Add the integer target for all words in the group
        target_game.extend([group_target] * len(group))  # Same target for all words in the group

    finalVectors.append(gameVec)
    trainingVectors.append(np.array(flattened_game))  # Flattened game should now be a 1D array
    targetVectors.append(np.array(target_game))  # Store the target vectors for this game
    words.append(game_words)  # Store the words for this game

# Print the target, training vectors, and words for verification
print(f"targetVectors: {targetVectors[0]}")
print(f"trainingVectors: {trainingVectors[0]}")
print(f"words: {words[0]}")

Word 'SELFIE' not found in the model.
Word 'ROFL' not found in the model.
Word 'FIDDLESTICKS' not found in the model.
Word 'SWOLE' not found in the model.
Word 'DRIB' not found in the model.
Word 'KISSCAM' not found in the model.
Word 'SHEEPSHANK' not found in the model.
Word 'LEACHY' not found in the model.
Word 'ENGROSS' not found in the model.
Word 'IDEATE' not found in the model.
Word 'FLOATIE' not found in the model.
Word 'GOODFELLA' not found in the model.
targetVectors: [0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3]
trainingVectors: [ 0.44655001  0.35051    -0.084793   ...  0.57551003 -0.032951
  0.084666  ]
words: ['PUMP', 'LOAFER', 'BOOT', 'SNEAKER', 'SEA', 'WHY', 'ARE', 'QUEUE', 'TIME', 'US', 'PEOPLE', 'ESSENCE', 'FOOT', 'LEAGUE', 'YARD', 'MILE']


In [30]:
import numpy as np
import tensorflow as tf

# Updated WordClusterDataset to reshape target
class WordClusterDataset(tf.data.Dataset):
    def __init__(self, trainingVectors, targetVectors, words):
        """
        Args:
            trainingVectors (list or numpy array): The word vectors for each word in the dataset.
            targetVectors (list or numpy array): The corresponding target cluster labels (integers).
            words (list or array): The original words associated with each word vector.
        """
        self.trainingVectors = np.array(trainingVectors)
        self.targetVectors = np.array(targetVectors)
        self.words = words
    
    def __getitem__(self, idx):
        word_vector = self.trainingVectors[idx]  # Fetch the word vector
        target_cluster = self.targetVectors[idx]  # Fetch the target cluster (label)
        
        # Flatten the target label into a single vector
        target_cluster = np.array(target_cluster).flatten()  # Ensure the target is flattened for each word
        
        return word_vector, target_cluster

    def __len__(self):
        return len(self.trainingVectors)


In [36]:
from tensorflow.keras import layers, models
from tensorflow.keras import backend as K

K.clear_session()

# Define TensorFlow Model with 16 output neurons (one for each word)
def create_model(input_dim, num_clusters, num_words=16):
    model = models.Sequential([
        layers.Dense(2048, activation='relu', input_dim=input_dim),  # Input layer
        layers.Dense(1024, activation='relu'),  # Hidden layer
        layers.Dense(512, activation='relu'),  # Hidden layer
        layers.Dense(num_words * num_clusters, activation='softmax')  # Output layer (16 words * 4 clusters)
    ])
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the model
input_dim = 4800  # As per your input size (16 * 300 = 4800)
num_clusters = 4  # As per your output size (4 clusters)
model = create_model(input_dim, num_clusters)

# Print model summary to verify the output shape
model.summary()


In [45]:
# Train the model
model.fit(train_tf_dataset, epochs=10)

Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node Reshape defined at (most recent call last):
<stack traces unavailable>
Input to reshape is a tensor with 8 values, but the requested shape requires a multiple of 64
	 [[{{node Reshape}}]]
	 [[IteratorGetNext]] [Op:__inference_one_step_on_iterator_4304]