In [1]:
import pandas as pd
import numpy as np
import os

# Read in Data

In [2]:
df = pd.read_pickle("C:\\Users\\caneu\\Downloads\\embeddings_and_info.pkl")

In [3]:
df1 = pd.read_pickle("C:\\Users\\caneu\\Downloads\\embeddings_and_info2.pkl")

In [4]:
df = pd.concat([df, df1])

In [5]:
meta_data = pd.read_pickle('C:\\Users\\caneu\\Downloads\\meta_data_embeddings.pkl')

### I discovered that descriptions which mention anchor are all typically the same as the podcast provider adds on a little blurb on the end that can mess with similarity scores so I remove these episodes for the data set

In [6]:
new_meta = meta_data[~meta_data['episode_description'].str.contains('anchor', case=False, na=False)]

In [7]:
# Assuming 'rw' is the column in df from which you want to strip '.json'
df['episode_id'] = df['episode_id'].str.rstrip('.json')


In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   show_id     50000 non-null  object
 1   episode_id  50000 non-null  object
 2   transcript  50000 non-null  object
 3   embeddings  50000 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


In [288]:
new_meta.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63610 entries, 0 to 105358
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   show_uri                 63610 non-null  object 
 1   show_name                63610 non-null  object 
 2   show_description         63610 non-null  object 
 3   publisher                63610 non-null  object 
 4   language                 63610 non-null  object 
 5   rss_link                 63610 non-null  object 
 6   episode_uri              63610 non-null  object 
 7   episode_name             63610 non-null  object 
 8   episode_description      63406 non-null  object 
 9   duration                 63610 non-null  float64
 10  show_filename_prefix     63610 non-null  object 
 11  episode_filename_prefix  63610 non-null  object 
 12  name_embeds              63610 non-null  object 
 13  name_embeds1             63610 non-null  object 
 14  description_embeds   

In [8]:
import pandas as pd

# Assuming 'episode_id' is the common column in df and 'episode_filename_prefix' is the common column in meta_data
merged_df = pd.merge(df, new_meta, left_on='episode_id', right_on='episode_filename_prefix')


## Merge meta data descriptions and name embeddings with transcript embeddings

In [291]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27687 entries, 0 to 27686
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   show_id                  27687 non-null  object 
 1   episode_id               27687 non-null  object 
 2   transcript               27687 non-null  object 
 3   embeddings               27687 non-null  object 
 4   show_uri                 27687 non-null  object 
 5   show_name                27687 non-null  object 
 6   show_description         27687 non-null  object 
 7   publisher                27687 non-null  object 
 8   language                 27687 non-null  object 
 9   rss_link                 27687 non-null  object 
 10  episode_uri              27687 non-null  object 
 11  episode_name             27687 non-null  object 
 12  episode_description      27600 non-null  object 
 13  duration                 27687 non-null  float64
 14  show_filename_prefix  

In [9]:
embeddings_array = np.array(merged_df['embeddings'].tolist())
name_embeds1_array = np.array(merged_df['name_embeds1'].tolist())
description_embeds_array = np.array(merged_df['description_embeds'].tolist())

# Concatenate the arrays along axis=2 (third dimension)
combined_embeddings = np.concatenate([embeddings_array, name_embeds1_array, description_embeds_array], axis=2)

# Reshape the resulting 3D tensor to a 2D array for cosine similarity calculation
combined_embeddings_2d = combined_embeddings.reshape(-1, combined_embeddings.shape[2])


In [10]:
combined_embeddings_2d.shape

(27687, 2304)

## chose to combine embeddings by concatenation. Other methods also exist like sum and average

In [12]:
embeddings_test = combined_embeddings_2d


In [13]:
embeddings_np = np.array([np.array(e).flatten() for e in embeddings_test])
from sklearn.preprocessing import normalize

normalized_embedding = normalize(embeddings_np, norm='l2')


In [15]:
normalized_embedding.shape

(27687, 2304)

# Normalized embeddings after flattening

In [237]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


sims = cosine_similarity(normalized_embedding)

# Similarity with Siamese

In [113]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [238]:
# Simulating embeddings and similarity matrix
embeddings = normalized_embedding[:2000]
#[:800]
similarity_matrix = sims[:2000]
#[:800]
#ensure each embedding has same shape
target_embedding_shape = (2304,)
embeddings_resized = [np.resize(embedding, target_embedding_shape) for embedding in embeddings]




In [239]:
# Split data into training and testing sets
train_embeddings, test_embeddings, train_similarity_matrix, test_similarity_matrix = train_test_split(
    embeddings_resized, similarity_matrix, test_size=0.2, random_state=2024
)

In [266]:

import torch.nn.functional as F
# Custom Dataset class
class SiameseDataset(Dataset):
    def __init__(self, embeddings, similarity_matrix):
        self.embeddings = embeddings
        self.similarity_matrix = similarity_matrix

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.similarity_matrix[idx]


class SiameseNetwork(nn.Module):
    def __init__(self, embedding_dim=32, hidden_dim=128, dropout_prob=0.5):
        super(SiameseNetwork, self).__init__()

        self.shared_embedding = nn.Linear(embedding_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.fc2 = nn.Linear(hidden_dim, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward_one(self, x):
        x = torch.relu(self.shared_embedding(x))
        x = self.dropout(x)  # Apply dropout
        x = torch.relu(self.fc1(x))
        return x

    def forward(self, input1, input2):
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        l1_distance = torch.abs(output1 - output2)
        output = torch.sigmoid(self.fc3(l1_distance))
        return output


# Training loop
def train_siamese_model(model, train_dataloader, num_epochs=10, learning_rate=0.001, device='cpu'):
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        # Use tqdm to create a progress bar for the outer loop
        with tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch') as t:
            for i, (data_i, target_i) in enumerate(t):
                for j, (data_j, target_j) in enumerate(train_dataloader):
                    if i != j:
                        data_i, target_i = data_i.to(device), target_i.to(device)
                        data_j, target_j = data_j.to(device), target_j.to(device)

                        optimizer.zero_grad()
                        output_i = model(data_i[:, 0], data_i[:, 1]).squeeze()
                        
                        #print(data_j[:, 0].shape, data_j[:, 1].shape)
                        output_j = model(data_j[:, 0], data_j[:, 1]).squeeze()

                        loss_i = criterion(output_i, target_i)
                        loss_j = criterion(output_j, target_j)

                        loss = (loss_i + loss_j) / 2
                        loss.backward()
                        optimizer.step()

                        # Update the progress bar description with the latest loss
                        t.set_postfix(loss=loss.item())

        print(f"Epoch {epoch + 1}/{num_epochs}, Final Loss: {loss.item()}")



# Evaluation function
def evaluate_siamese_model(model, test_dataloader, device='cpu'):
    model.eval()
    mse_loss = nn.MSELoss()

    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for data, target in test_dataloader:
            data, target = data.to(device), target.to(device)

            output = model(data[:, 0], data[:, 1]).squeeze()
            loss = mse_loss(output, target)
            
            total_loss += loss.item() * data.size(0)
            total_samples += data.size(0)

    average_loss = total_loss / total_samples
    print(f"Test MSE Loss: {average_loss}")






### Performed many iterations on full data and subsets to experimentally determine the best hyper parameters

In [267]:
# Create datasets and dataloaders
train_dataset = SiameseDataset(train_embeddings, train_similarity_matrix)
test_dataset = SiameseDataset(test_embeddings, test_similarity_matrix)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


# Create and train the Siamese model
siamese_model = SiameseNetwork()
train_siamese_model(siamese_model, train_dataloader)


  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1/10: 100%|██████████| 50/50 [00:48<00:00,  1.04batch/s, loss=0.000586]


Epoch 1/10, Final Loss: 0.0005863613914698362


Epoch 2/10: 100%|██████████| 50/50 [00:46<00:00,  1.07batch/s, loss=0.000914]


Epoch 2/10, Final Loss: 0.0009144514333456755


Epoch 3/10: 100%|██████████| 50/50 [00:47<00:00,  1.05batch/s, loss=0.000718]


Epoch 3/10, Final Loss: 0.0007176126819103956


Epoch 4/10: 100%|██████████| 50/50 [00:47<00:00,  1.06batch/s, loss=0.00058] 


Epoch 4/10, Final Loss: 0.000579961109906435


Epoch 5/10: 100%|██████████| 50/50 [00:49<00:00,  1.02batch/s, loss=0.000423]


Epoch 5/10, Final Loss: 0.00042324449168518186


Epoch 6/10: 100%|██████████| 50/50 [00:49<00:00,  1.01batch/s, loss=0.000529]


Epoch 6/10, Final Loss: 0.0005294568836688995


Epoch 7/10: 100%|██████████| 50/50 [00:49<00:00,  1.01batch/s, loss=0.000531]


Epoch 7/10, Final Loss: 0.0005312708672136068


Epoch 8/10: 100%|██████████| 50/50 [00:49<00:00,  1.01batch/s, loss=0.000398]


Epoch 8/10, Final Loss: 0.0003979144967161119


Epoch 9/10: 100%|██████████| 50/50 [00:49<00:00,  1.01batch/s, loss=0.00044] 


Epoch 9/10, Final Loss: 0.0004397771554067731


Epoch 10/10: 100%|██████████| 50/50 [00:49<00:00,  1.01batch/s, loss=0.000427]

Epoch 10/10, Final Loss: 0.0004269569180905819





In [268]:
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last = True)
evaluate_siamese_model(siamese_model, test_dataloader)

Test MSE Loss: 0.000627488077346546


In [275]:
target_embedding_shape = 32
#number below in new_embedding code is the embedding index for the similarity

all_embeddings = [np.resize(embedding, target_embedding_shape) for embedding in normalized_embedding]
all_embeddings = normalize(all_embeddings, norm='l2')
new_embedding = all_embeddings[1400]

## Had to resize embeddings in order to find most similar

In [276]:

def find_most_similar_embeddings(new_embedding, all_embeddings, siamese_model, device='cpu'):
    # Ensure the model is in evaluation mode
    siamese_model.eval()

    # Convert new_embedding to torch tensor (assuming it's a NumPy array)
    new_embedding = torch.tensor(new_embedding, dtype=torch.float32).to(device)

    # Compute similarity scores for each embedding in all_embeddings
    similarity_scores = []
    with torch.no_grad():
        for index, candidate_embedding in enumerate(all_embeddings):
            candidate_embedding = torch.tensor(candidate_embedding, dtype=torch.float32).to(device)

            # Compute similarity score using the Siamese network
            similarity_score = siamese_model(new_embedding, candidate_embedding).item()
            similarity_scores.append((index, similarity_score))

    # Rank embeddings based on similarity scores
    ranked_embeddings = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    return ranked_embeddings

In [277]:
result = find_most_similar_embeddings(new_embedding, all_embeddings, siamese_model)

# Print the top N most similar embeddings
top_n = 5
print(new_embedding)
for i, (index, similarity_score) in enumerate(result[:top_n], 1):
    print(f"Rank {i}: Similarity Score = {similarity_score:.4f}, Index = {index}, Embedding = {all_embeddings[index]}")

[0.20413543 0.157709   0.20493902 0.1179009  0.23180876 0.15412084
 0.15055615 0.26148681 0.12425503 0.12514287 0.15591911 0.14552698
 0.09738209 0.1690598  0.15691232 0.21368616 0.18639392 0.11575423
 0.1696905  0.23380276 0.28433459 0.13568801 0.15041693 0.27849924
 0.16959643 0.21558517 0.11130428 0.11497336 0.11964039 0.0865617
 0.2201813  0.14801854]
Rank 1: Similarity Score = 0.9214, Index = 21872, Embedding = [ 0.14415032  0.17037932  0.30246338  0.10554048  0.15577215  0.18374927
  0.13197063  0.28999314  0.09131534 -0.05203622  0.15121206  0.01859102
  0.06332877  0.36348518  0.16999856  0.24653199  0.11396783  0.14490973
  0.04528691  0.19597573  0.20218719  0.20663859  0.11656036  0.21562957
  0.20373682  0.14003292  0.08962645  0.15911535  0.08444207  0.21075596
  0.20120348  0.15282271]
Rank 2: Similarity Score = 0.9210, Index = 9244, Embedding = [0.18175234 0.16592793 0.30060937 0.15962347 0.11377095 0.2603173
 0.16744441 0.25871898 0.11980185 0.1167338  0.05810829 0.0654

In [280]:
merged_df['episode_description'][1400]

'For our final episode, were talking about healing.\xa0'

In [279]:
merged_df['episode_description'][21872]

'Introduction to Rastafari '