In [157]:
import pandas as pd
from itertools import combinations
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from tensorflow import keras
import random

In [180]:
import os

# Hide every CUDA device from this process so TensorFlow runs on the CPU.
# NOTE(review): this cell executes after the tensorflow imports above; it is
# presumably still effective because no device has been touched yet — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [181]:
# Load the raw dataset; later cells read its 'species' and 'message' columns.
df = pd.read_csv('data.csv')

In [182]:
def extract_unique_words(messages):
    """Return the set of distinct whitespace-separated words in ``messages``.

    Args:
        messages: Iterable of message strings (e.g. a pandas Series).
            Entries that are not strings — such as the NaN pandas produces
            for an empty CSV cell, or ``None`` — are ignored rather than
            raising ``TypeError``.

    Returns:
        set[str]: Every distinct word found across all string messages;
        an empty set when there are no messages.
    """
    unique_words = set()
    for message in messages:
        # Splitting per message yields the same words as joining everything
        # first, without building one giant intermediate string.
        if isinstance(message, str):
            unique_words.update(message.split())
    return unique_words

In [183]:
# Build one set of distinct words per species from the 'message' column
species_groups = df.groupby('species')['message']
species_unique_words = species_groups.apply(extract_unique_words)

# Turn the species-indexed Series into a two-column table
# ('species', 'unique_words') with a fresh 0..n-1 index
unique_words_df = species_unique_words.reset_index(name='unique_words')

# Persist the per-species vocabularies
unique_words_df.to_csv('unique_words.csv', index=False)


In [184]:
def find_common_words(species_list):
    """Return the words shared by every species named in ``species_list``.

    Looks each name up in the module-level ``species_unique_words`` mapping
    and intersects the resulting word sets.

    Args:
        species_list: Non-empty sequence of species names present in
            ``species_unique_words``.

    Returns:
        The set of words common to all listed species. With a single name
        the stored set itself is returned (no copy is made).
    """
    word_sets = [species_unique_words[name] for name in species_list]
    shared = word_sets[0]
    if len(word_sets) > 1:
        # One multi-argument intersection replaces the pairwise chain
        shared = shared.intersection(*word_sets[1:])
    return shared

In [185]:
def find_different_words(species1, species2):
    """Return the words ``species1`` uses that ``species2`` does not.

    Both names are looked up in the module-level ``species_unique_words``
    mapping. The result is directional: swapping the arguments gives the
    words exclusive to the other species.

    Args:
        species1: Name of the species whose exclusive words are wanted.
        species2: Name of the species whose words are removed.

    Returns:
        A new set with the words of ``species1`` that are absent from
        ``species2``.
    """
    own_words = species_unique_words[species1]
    other_words = species_unique_words[species2]
    return own_words.difference(other_words)

In [186]:
# Species pairs whose vocabularies are contrasted when computing the
# "different words" table. The loop that builds that table reads `pairs`,
# so it must be defined for the notebook to run on a fresh kernel.
# NOTE(review): names restored from the previously commented-out definition —
# confirm they match the species present in data.csv.
pairs = [('Aquari', 'Florian'), ('Zorblax', 'Quixnar'), ('Faerix', 'Mythron'), ('Nexoon', 'Cybex'), ('Emotivor', 'Sentire')]

In [187]:
# Every unordered pair of species found in the data
species = df['species'].unique()
species_combinations = list(combinations(species, 2))

# species name -> all words it shares with at least one other species.
# Each species takes part in several pairs, so the shared words are
# accumulated (set union) instead of being overwritten by whichever pair
# happens to be processed last.
common_words = {}

for species_combination in species_combinations:
    com_words = find_common_words(species_combination)
    if com_words:
        for member in species_combination:
            common_words.setdefault(member, set()).update(com_words)

# NOTE: the 'species_combination' column holds a single species name per row;
# the header is kept unchanged for compatibility with existing CSV consumers.
common_words_df = pd.DataFrame({
    'species_combination': list(common_words.keys()),
    'common_words': list(common_words.values())
})

common_words_df.to_csv('common_words.csv', index=False)

In [188]:
# For each pair, record the words exclusive to each member, keyed by that
# member's species name
exclusive_by_species = {}

for species_combination in pairs:
    first, second = species_combination[0], species_combination[1]
    exclusive_by_species[first] = find_different_words(first, second)
    exclusive_by_species[second] = find_different_words(second, first)

# Tabulate and export: one row per species
different_words = pd.DataFrame({
    'species_combination': list(exclusive_by_species.keys()),
    'different_words': list(exclusive_by_species.values())
})

different_words.to_csv('different_words.csv', index=False)

In [189]:
# Global vocabulary: union of every species' word set. Sorting makes the
# word order (and therefore the row order of anything indexed by
# `all_words`) independent of Python's per-process string hashing.
all_words = sorted(set().union(*species_unique_words.values))

# Fit the encoder ONCE on the full vocabulary so each word gets a single,
# global integer id in [0, vocab_size)
tokenizer = LabelEncoder()
word_indices = tokenizer.fit_transform(all_words)
vocab_size = len(all_words)

# Map each species to the global ids of its words. `transform` (not
# `fit_transform`) is essential here: refitting per species would discard the
# global vocabulary and hand every species its own overlapping 0..n-1 ids.
species_word_indices = {
    species_name: tokenizer.transform(list(spec_unique_words))
    for species_name, spec_unique_words in species_unique_words.items()
}
common_words_indices = {
    species_name: tokenizer.transform(list(spec_common_words))
    for species_name, spec_common_words in common_words.items()
}

In [190]:
def triplet_loss(y_true, y_pred, alpha=0.2):
    """Margin-based triplet loss on squared Euclidean distances.

    Args:
        y_true: Unused; present only to satisfy the Keras loss signature.
        y_pred: Tensor indexed as ``y_pred[:, 0]`` (anchor), ``y_pred[:, 1]``
            (positive) and ``y_pred[:, 2]`` (negative) — i.e. presumably
            shaped (batch, 3, embedding_dim); confirm against the model wiring.
        alpha: Margin by which the negative must be farther from the anchor
            than the positive before a triplet stops contributing.

    Returns:
        The mean over the batch of ``max(d(a, p) - d(a, n) + alpha, 0)``.
    """
    anchor = y_pred[:, 0]
    positive = y_pred[:, 1]
    negative = y_pred[:, 2]

    def squared_distance(left, right):
        # Sum of squared differences over the last (embedding) axis
        return K.sum(K.square(left - right), axis=-1)

    margin_violation = (
        squared_distance(anchor, positive) - squared_distance(anchor, negative) + alpha
    )
    return K.mean(K.maximum(margin_violation, 0))

In [191]:
def create_triplets(species_word_indices, common_word_indices):
    """Sample one (anchor, positive, negative) index triplet per word.

    For every word of every species:
      * anchor   - the word itself;
      * positive - a different word drawn from the same species;
      * negative - a word drawn from one randomly chosen other species,
        pooled with the anchor species' common words.

    Words for which no positive or no negative candidate exists (a species
    with a single word, or a lone species without common words) are skipped
    instead of raising.

    Args:
        species_word_indices: Mapping of species name -> sequence of word
            indices belonging to that species.
        common_word_indices: Mapping of species name -> sequence of indices
            of words it shares with other species. Species missing from this
            mapping are treated as having no common words.

    Returns:
        numpy.ndarray of shape (n_triplets, 3) with columns
        (anchor, positive, negative); shape (0, 3) when nothing could be
        sampled.
    """
    triplets = []
    all_species = list(species_word_indices)

    for species, word_indices in species_word_indices.items():
        # Loop-invariant per species: candidate negative species and the
        # species' own common-word pool.
        other_species = [s for s in all_species if s != species]
        shared_pool = list(common_word_indices.get(species, ()))

        for word_idx in word_indices:
            # Positive: a different word from the same species
            positive_pool = [idx for idx in word_indices if idx != word_idx]
            if not positive_pool:
                continue
            pos_idx = random.choice(positive_pool)

            # Negative: a word from a different species or a common word
            if other_species:
                neg_species = random.choice(other_species)
                negative_pool = list(species_word_indices[neg_species]) + shared_pool
            else:
                negative_pool = shared_pool
            if not negative_pool:
                continue
            neg_idx = random.choice(negative_pool)

            triplets.append((word_idx, pos_idx, neg_idx))

    if not triplets:
        # Keep the 2-D (n, 3) contract so column slicing still works downstream
        return np.empty((0, 3), dtype=np.int64)
    return np.array(triplets)

In [None]:
# Seed Python's `random`, NumPy and the Keras backend so that triplet
# sampling and embedding initialisation are repeatable across
# Restart & Run All.
SEED = 42
keras.utils.set_random_seed(SEED)

# Generate triplets for training, factoring in common words
triplets = create_triplets(species_word_indices, common_words_indices)

embedding_dim = 32

# Define the embedding model: one integer word id in, one flat
# `embedding_dim`-sized vector out. The input is declared as int64 to match
# the triplet inputs below.
input_layer = Input(shape=(1,), dtype='int64')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
flatten_layer = Flatten()(embedding_layer)
embedding_model = Model(inputs=input_layer, outputs=flatten_layer)

# Define inputs for the triplet loss model; all three share the same
# embedding model (and therefore the same weights)
anchor_input = Input(shape=(1,), name='anchor_input', dtype='int64')
positive_input = Input(shape=(1,), name='positive_input', dtype='int64')
negative_input = Input(shape=(1,), name='negative_input', dtype='int64')
anchor_embedding = embedding_model(anchor_input)
positive_embedding = embedding_model(positive_input)
negative_embedding = embedding_model(negative_input)

print(anchor_input.shape, positive_input.shape, negative_input.shape)

In [198]:
# Debugging aid: turn off Keras traceback filtering so that errors raised in
# later cells show their full stack traces.
keras.config.disable_traceback_filtering()

In [None]:
# `triplet_loss` reads anchor/positive/negative as y_pred[:, 0], [:, 1] and
# [:, 2], so the model must emit ONE tensor of shape (batch, 3, embedding_dim)
# rather than three separate outputs (which Keras would score independently
# and which would not match the single dummy target array below).
embedding_width = anchor_embedding.shape[-1]
stacked_embeddings = keras.layers.Concatenate(axis=-1)(
    [anchor_embedding, positive_embedding, negative_embedding]
)
stacked_embeddings = keras.layers.Reshape((3, embedding_width))(stacked_embeddings)

triplet_model = Model(
    inputs=[anchor_input, positive_input, negative_input],
    outputs=stacked_embeddings,
)
triplet_model.compile(optimizer=Adam(), loss=triplet_loss)

# Train the model. Each triplet column becomes an (n, 1) array matching the
# Input(shape=(1,)) layers; the targets are dummies because the loss ignores
# y_true.
anchor_ids, positive_ids, negative_ids = np.hsplit(triplets, 3)
dummy_targets = np.zeros(len(triplets))
triplet_model.fit([anchor_ids, positive_ids, negative_ids], dummy_targets, epochs=10)


In [None]:
# Look up the learned embedding vector for every vocabulary word, in the
# order of `all_words` (the encoder is re-fitted on the full vocabulary here)
vocabulary_ids = np.array(tokenizer.fit_transform(all_words))
embeddings = embedding_model.predict(vocabulary_ids)

In [201]:
# One row per vocabulary word (the word is the row label), one column per
# embedding dimension; written out with the word index included
word_embeddings = pd.DataFrame(data=embeddings, index=all_words)
word_embeddings.to_csv(path_or_buf='word_embeddings.csv')