# ***Simple Model Approach***

Import Packages

In [1]:
import gensim.downloader as api
from sklearn.cluster import AgglomerativeClustering, KMeans
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np

Dataset

In [2]:
trainDF = pd.read_csv("trainData2.csv")

Top-Down

In [3]:
def load_embedding_model(embedding_model='word2vec'):
    """Load the appropriate embedding model."""
    return api.load('word2vec-google-news-300')  # This is a large model, can be replaced

In [4]:
model = load_embedding_model(embedding_model='word2vec')

In [5]:
# Sample CSV reading (assuming the CSV is properly formatted)
df = pd.read_csv("trainData2.csv")

In [6]:
class UnsupervisedConnections:
    def __init__(self, num_clusters=5):
        """
        Initializes the UnsupervisedConnections class with clustering configurations.
        """
        self.num_clusters = num_clusters
    
    def agglomerative_clustering(self, embeddings):
        """Perform Agglomerative clustering on the word embeddings."""
        agglomerative = AgglomerativeClustering(n_clusters=self.num_clusters)
        clusters = agglomerative.fit_predict(embeddings)  # Perform clustering
        return clusters

    def kmeans_clustering(self, embeddings):
        """Perform K-means clustering on the word embeddings."""
        kmeans = KMeans(n_clusters=self.num_clusters, random_state=42)
        clusters = kmeans.fit_predict(embeddings)  # Perform clustering
        return clusters

    def dbscan_clustering(self, embeddings, eps=0.5, min_samples=5):
        """Perform DBSCAN clustering on the word embeddings."""
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        clusters = dbscan.fit_predict(embeddings)  # Perform clustering
        return clusters

    def gmm_clustering(self, embeddings):
        """Perform GMM clustering on the word embeddings."""
        gmm = GaussianMixture(n_components=self.num_clusters, random_state=42)
        clusters = gmm.fit_predict(embeddings)  # Perform clustering
        return clusters

    def majority_voting(self, labels_list):
        """
        Aggregate the labels from different clustering algorithms using majority voting.
        """
        labels_arr = np.array(labels_list).T  # Convert to a matrix of labels
        ensemble_labels = []
        for row in labels_arr:
            # Exclude -1 from DBSCAN (consider it as noise)
            row = row[row != -1]
            # Get the most frequent label (mode) in the row (if there's a tie, pick the first)
            if len(row) > 0:
                most_common = np.bincount(row).argmax()  # Most common cluster
            else:
                most_common = -1  # In case all labels are -1 (noise points)
            ensemble_labels.append(most_common)
        return np.array(ensemble_labels)

    def get_word_embeddings(self, model, words):
        """Get word embeddings using the chosen model."""
        embeddings = []
        for word in words:
            if word in model.key_to_index:  # Check if word is in the vocabulary
                embeddings.append(model[word])
            else:
                embeddings.append(np.zeros(model.vector_size))  # Return a zero vector for unknown words
        return np.array(embeddings)

    def cluster_words(self, model, sample_words_data):
        """Cluster the words using multiple clustering algorithms and ensemble their results."""
        final_results = {}

        # Process each problem (list of 5 clusters) separately
        for problem_index, problem in enumerate(sample_words_data):
            print(f"Clustering Problem {problem_index + 1}")

            # Flatten all words from the current problem into a single list
            all_words = [word for cluster in problem for word in cluster]

            # Extract word embeddings for the flattened list of words
            embeddings = self.get_word_embeddings(model, all_words)

            # Apply all clustering models
            agglomerative_clusters = self.agglomerative_clustering(embeddings)
            kmeans_clusters = self.kmeans_clustering(embeddings)
            dbscan_clusters = self.dbscan_clustering(embeddings)
            gmm_clusters = self.gmm_clustering(embeddings)

            # Aggregate the results using majority voting
            combined_clusters = self.majority_voting([agglomerative_clusters, kmeans_clusters, dbscan_clusters, gmm_clusters])

            # Assign clusters to words
            word_cluster_mapping = {}
            idx = 0
            for cluster in problem:
                for word in cluster:
                    word_cluster_mapping[word] = combined_clusters[idx]
                    idx += 1

            # Group words by cluster
            clustered_words = {i: [] for i in range(self.num_clusters)}
            for word, cluster in word_cluster_mapping.items():
                clustered_words[cluster].append(word)

            # Ensure each cluster contains exactly 4 words, redistributing if necessary
            adjusted_clusters = {i: [] for i in range(self.num_clusters)}
            excess_words = []  # To store words that exceed the limit of 4 per cluster

            # Process clusters with more than 4 words
            for cluster, words in clustered_words.items():
                if len(words) > 4:
                    # If the cluster exceeds 4 words, redistribute excess words
                    excess_words.extend(words[4:])
                    adjusted_clusters[cluster] = words[:4]
                else:
                    adjusted_clusters[cluster] = words

            # Redistribute the excess words to the clusters that have less than 4 words
            excess_idx = 0
            for cluster, words in adjusted_clusters.items():
                while len(words) < 4 and excess_idx < len(excess_words):
                    words.append(excess_words[excess_idx])
                    excess_idx += 1

            # Store the result in the final_results dictionary
            final_results[f"Problem {problem_index + 1}"] = adjusted_clusters

        return final_results

In [7]:
# Sample CSV reading (assuming the CSV is properly formatted)
df = pd.read_csv("trainData2.csv")

In [8]:
# Initialize an empty dictionary to hold lists of words for each Game ID
games_dict = {}

# Loop through each row and add words to the respective Game ID list
for _, row in df.iterrows():
    game_id = row['Game ID']
    words = [row['word1'], row['word2'], row['word3'], row['word4']]
    
    if game_id not in games_dict:
        games_dict[game_id] = []
    
    games_dict[game_id].extend(words)

# Convert the dictionary to a list of lists
sample_words_data = list(games_dict.values())

# Initialize the UnsupervisedConnections model and train it
unsupervised_model = UnsupervisedConnections(num_clusters=4)
unsupervised_model.train_word2vec(sample_words_data)  # Train Word2Vec on the provided words

# Cluster the words and get the embeddings
all_clustered_words, all_embeddings = unsupervised_model.cluster_words(sample_words_data)

# Print the clustered words for each game
for i, game_clusters in enumerate(all_clustered_words):
    print(f"Game {i+1}:")
    for cluster, words in game_clusters.items():
        print(f"  Cluster {cluster}: {words}")

AttributeError: 'UnsupervisedConnections' object has no attribute 'train_word2vec'

In [None]:
# Run the clustering
clustered_words = cluster_words(sample_words_data, num_clusters=5)

# Print the results
for problem, clusters in clustered_words.items():
    print(f"\n{problem}:")
    for cluster, words in clusters.items():
        print(f"  Cluster {cluster}: {words}")

Clustering Problem 1
Clustering Problem 2
Clustering Problem 3
Clustering Problem 4

Problem 1:
  Cluster 0: ['computer', 'laptop', 'tablet', 'smartphone']
  Cluster 1: ['dog', 'cat', 'rabbit', 'elephant']
  Cluster 2: ['table', 'chair', 'sofa', 'couch']
  Cluster 3: ['apple', 'banana', 'fruit', 'grape']
  Cluster 4: ['car', 'truck', 'bus', 'motorcycle']

Problem 2:
  Cluster 0: ['actor', 'producer', 'screenwriter', 'developer']
  Cluster 1: ['professor', 'director', 'manager', 'designer']
  Cluster 2: ['chef', 'waiter', 'bartender', 'cook']
  Cluster 3: ['doctor', 'nurse', 'surgeon', 'therapist']
  Cluster 4: ['teacher', 'tutor', 'instructor', 'tester']

Problem 3:
  Cluster 0: ['apple', 'banana', 'orange', 'grape']
  Cluster 1: ['chocolate', 'vanilla', 'mint', 'carrot']
  Cluster 2: ['pizza', 'burger', 'pasta', 'fries']
  Cluster 3: ['spinach', 'broccoli', 'lettuce', 'strawberry']
  Cluster 4: ['beef', 'chicken', 'pork', 'lamb']

Problem 4:
  Cluster 0: ['lion', 'tiger', 'cheetah', '