In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import ast
import networkx as nx
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px


# General Functions

# Finding Cosine Similarity Between the Name and Description of Each Trope

In [2]:
# Load the precomputed trope embeddings: one row per trope, holding a
# description embedding, a name embedding and the trope name.
df = pd.read_csv('../data_clean/embeddings.csv')

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,d_embedding,n_embedding,trope_name
0,"[0.06963171064853668, 0.0030629639513790607, -...","[0.05705825984477997, -0.043769609183073044, 0...",Aardvark Trunks
1,"[0.015573586337268353, 0.0399489663541317, 0.0...","[0.009882059879601002, 0.03227100521326065, -0...",Abandoned Area
2,"[0.03458022698760033, 0.0070857820101082325, 0...","[0.013515202328562737, -0.00612090528011322, -...",Abandoned Camp Ruins
3,"[0.09302520006895065, -0.030586645007133484, -...","[0.06870320439338684, 0.04672708734869957, 0.0...",Abandoned Catchphrase
4,"[0.04039151221513748, 0.09288910031318665, 0.0...","[0.012362676672637463, 0.058168645948171616, 0...",Abandoned Hospital


In [4]:
# Check row count, dtypes and null counts before working with the embeddings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29903 entries, 0 to 29902
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   d_embedding  29903 non-null  object
 1   n_embedding  29903 non-null  object
 2   trope_name   29903 non-null  object
dtypes: object(3)
memory usage: 701.0+ KB


In [5]:
# Look at one raw cell: the CSV stores each embedding as a string, not a list.
df.loc[0, 'd_embedding']

'[0.06963171064853668, 0.0030629639513790607, -0.008121015504002571, -0.013888058252632618, -0.01011512242257595, 0.014636923559010029, -0.01193075068295002, 0.029771361500024796, -0.01081668771803379, -0.06676766276359558, -0.010310154408216476, -0.02241273783147335, 0.05320066586136818, 0.024889009073376656, 0.03181518614292145, -0.08361653238534927, -0.0028990868013352156, 0.028244387358427048, -0.012618741020560265, -0.007032736670225859, -0.012423649430274963, 0.017461754381656647, 0.00025233018095605075, 0.047600358724594116, -0.010778825730085373, -0.03426583856344223, 0.02366022951900959, -0.035148538649082184, 0.0285659097135067, -0.020996391773223877, 0.03030324913561344, -0.030726205557584763, -0.002634217729791999, 0.02133430354297161, 2.1636437850247603e-06, -0.013711065985262394, 0.0159982331097126, 0.0012827098835259676, -0.0006692968308925629, -0.004904448986053467, 0.06002402305603027, -0.028040504083037376, 0.04168557748198509, -0.05156862735748291, 0.0366278998553752

In [None]:
# The CSV stores every embedding as the string form of a list, so parse each
# cell back into a Python list of floats. The isinstance guard keeps this cell
# safe to re-run: ast.literal_eval raises ValueError when handed a list.
df['d_embedding'] = df['d_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
df['n_embedding'] = df['n_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Cosine similarity between each trope's description embedding and its own
# name embedding, computed for all rows in one vectorized pass instead of one
# sklearn call per row.
d_vectors = np.vstack(df['d_embedding'].tolist())
n_vectors = np.vstack(df['n_embedding'].tolist())

norm_product = np.linalg.norm(d_vectors, axis=1) * np.linalg.norm(n_vectors, axis=1)
# sklearn's cosine_similarity yields 0 for an all-zero vector; mirror that
# rather than dividing by zero and producing NaN.
norm_product[norm_product == 0] = 1.0

# einsum('ij,ij->i') is the row-wise dot product of the two matrices.
df['cosine_similarity'] = np.einsum('ij,ij->i', d_vectors, n_vectors) / norm_product

## Evaluating

In [None]:
# Inspect the frame now that the cosine_similarity column has been added.
df.head()

In [None]:
# Mean name-vs-description similarity over every trope in the frame.
average = df.loc[:, 'cosine_similarity'].mean()
print(f"The average cosine similarity is: {average}")

In [None]:
# Keep only the tropes whose name and description score above 0.5.
high_cosine_similarity = df[df['cosine_similarity'].gt(0.5)]
high_cosine_similarity.info()

In [None]:
# Share of all tropes that landed in the high-similarity subset, as a percentage.
percent_of_high_cosine_similarity = (len(high_cosine_similarity) / len(df)) * 100
print(f"The percentage of high cosine similarity is: {percent_of_high_cosine_similarity}%")

In [None]:
# Tropes whose name and description embeddings point in opposing directions
# (similarity below zero).
neagtive_cosine_similarity = df[df['cosine_similarity'].lt(0)]
neagtive_cosine_similarity.info()

## Graph Creation

In [None]:
# Order tropes from most to least similar so the scatter reads as a descending curve.
df_sorted = df.sort_values(by='cosine_similarity', ascending=False)

# Creating a colored scatter plot
fig = px.scatter(
    df_sorted,
    x='trope_name',
    y='cosine_similarity',
    color='cosine_similarity',
    # plotly express matches `labels` keys against column names, so the keys
    # must be the columns themselves for the hover text and colorbar title
    # to be relabelled.
    labels={'trope_name': 'Trope', 'cosine_similarity': 'Cosine Similarity'},
    title='Cosine Similarity between Trope Names and Descriptions',
    color_continuous_scale='Viridis'
)

fig.update_layout(
    xaxis_title="Trope",
    yaxis_title="Cosine Similarity",
    # Cosine similarity can be negative; extend the lower bound when needed so
    # those points are not clipped off the bottom of the plot.
    yaxis=dict(range=[min(0.0, float(df_sorted['cosine_similarity'].min())), 1])
)

fig.show()

# Finding Cosine Similarity Between Each Name and the Descriptions of All Tropes
Each trope name's embedding is compared, via cosine similarity, against every description embedding in the dataset.


In [None]:
# Reload the raw CSV so this section starts from unparsed embeddings and
# without the columns added earlier. Uses the same location as the first load
# cell so the notebook runs top to bottom from a single data file.
df = pd.read_csv('../data_clean/embeddings.csv')

In [None]:
# Parse the stringified embeddings back into Python lists, writing each result
# to its own column rather than to 'cosine_similarity'. The isinstance guard
# keeps the cell re-runnable once the columns already hold lists.
df['n_embedding'] = df['n_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
df['d_embedding'] = df['d_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Convert any embedding still stored as text into a list; cells that are
# already parsed pass through untouched.
df['n_embedding'] = df['n_embedding'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)
df['d_embedding'] = df['d_embedding'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)


In [None]:
# Stack every description embedding into one 2-D array (one row per trope) so
# a name embedding can be compared against all descriptions in a single call.
d_embedding_matrix = np.vstack(df['d_embedding'].tolist())
print(d_embedding_matrix.shape, d_embedding_matrix, sep='\n')

In [None]:
def compute_cosine_similarity(n_embedding, matrix=None):
    """Return the cosine similarity of one embedding against every row of a matrix.

    Parameters
    ----------
    n_embedding : array-like
        A single embedding vector; it is reshaped into one row before the
        comparison.
    matrix : array-like of shape (n_rows, n_features), optional
        Embeddings to compare against, one per row. When omitted, the
        notebook-level ``d_embedding_matrix`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    numpy.ndarray
        1-D array holding one similarity score per row of ``matrix``.
    """
    if matrix is None:
        # Resolved at call time, so the matrix cell must have been run first.
        matrix = d_embedding_matrix

    # cosine_similarity expects 2-D inputs: turn the vector into a 1 x n_features row.
    query_row = np.asarray(n_embedding).reshape(1, -1)
    # The result has shape (1, n_rows); flatten it to a plain vector.
    return cosine_similarity(query_row, matrix).flatten()

In [None]:
# For every trope name, store the vector of similarities against all
# description embeddings. Each cell holds one score per row of
# d_embedding_matrix, so the column grows quadratically with the number of
# tropes (NOTE(review): likely several GB at ~30k rows; confirm it fits in memory).
df['cosine_similarity'] = df['n_embedding'].map(compute_cosine_similarity)

## Evaluating Results

In [None]:
# Average each row's similarity vector. np.mean runs in compiled code, whereas
# the builtin sum() walks the numpy array one Python-level element at a time.
df['average_cosine_similarity'] = df['cosine_similarity'].apply(
    lambda similarities: np.mean(similarities)
)

In [None]:
# Inspect the frame, including the new average_cosine_similarity column.
df.head()

## Graph Visualization

## Finding Cosine Similarity Between Each Name and All Other Names

In [None]:
# Reload the raw CSV so this section starts from unparsed embeddings and
# without the columns added earlier. Uses the same location as the first load
# cell so the notebook runs top to bottom from a single data file.
df = pd.read_csv('../data_clean/embeddings.csv')

In [None]:
# Parse the stringified embeddings back into Python lists, writing each result
# to its own column rather than to 'cosine_similarity'. The isinstance guard
# keeps the cell re-runnable once the columns already hold lists.
df['n_embedding'] = df['n_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)
df['d_embedding'] = df['d_embedding'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Convert any embedding still stored as text into a list; cells that are
# already parsed pass through untouched.
df['n_embedding'] = df['n_embedding'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)
df['d_embedding'] = df['d_embedding'].apply(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)


In [None]:
# Stack every name embedding into one 2-D array (one row per trope), then
# report the matrix that was just built.
n_embedding_matrix = np.vstack(df['n_embedding'].values)
print(n_embedding_matrix.shape)
print(n_embedding_matrix)

In [None]:
def compute_cosine_similarity(n_embedding, matrix=None):
    """Return the cosine similarity of one embedding against every row of a matrix.

    Parameters
    ----------
    n_embedding : array-like
        A single embedding vector; it is reshaped into one row before the
        comparison.
    matrix : array-like of shape (n_rows, n_features), optional
        Embeddings to compare against, one per row. When omitted, the
        notebook-level ``d_embedding_matrix`` is used, so one-argument calls
        from the earlier section keep working after this cell is run.

    Returns
    -------
    numpy.ndarray
        1-D array holding one similarity score per row of ``matrix``.
    """
    if matrix is None:
        # Resolved at call time, so the matrix cell must have been run first.
        matrix = d_embedding_matrix

    # cosine_similarity expects 2-D inputs: turn the vector into a 1 x n_features row.
    query_row = np.asarray(n_embedding).reshape(1, -1)
    # The result has shape (1, n_rows); flatten it to a plain vector.
    return cosine_similarity(query_row, matrix).flatten()

In [None]:
# For every trope name, store the vector of similarities against all name
# embeddings. Each cell holds one score per row of n_embedding_matrix, so the
# column grows quadratically with the number of tropes
# (NOTE(review): likely several GB at ~30k rows; confirm it fits in memory).
df['cosine_similarity'] = df['n_embedding'].map(
    lambda name_vector: compute_cosine_similarity(name_vector, n_embedding_matrix)
)

In [None]:
# Inspect the per-row similarity vectors now stored in cosine_similarity.
df.head()

In [None]:
# Average each row's similarity vector. np.mean runs in compiled code, whereas
# the builtin sum() walks the numpy array one Python-level element at a time.
df['average_cosine_similarity'] = df['cosine_similarity'].apply(
    lambda similarities: np.mean(similarities)
)

In [None]:
# Inspect the frame, including the new average_cosine_similarity column.
df.head()