<a href="https://www.kaggle.com/code/danielansted/biotech-clustering?scriptVersionId=183357119" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import warnings
from sklearn.metrics.pairwise import cosine_distances
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import gc
from datetime import datetime
from sklearn.preprocessing import MaxAbsScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD




warnings.filterwarnings('ignore')

# Import the Data
##### I know this dataset is in Kaggle, but I was responsible for the initial dataset collection as well. See https://github.com/Noob1701/Clin_Trials_Clustering for data collection notebook

In [2]:
import nltk
import subprocess

# Download and unzip wordnet
try:
    nltk.data.find('wordnet.zip')
except:
    nltk.download('wordnet', download_dir='/kaggle/working/')
    command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
    subprocess.run(command.split())
    nltk.data.path.append('/kaggle/working/')

from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...
Archive:  /kaggle/working/corpora/wordnet.zip
   creating: /kaggle/working/corpora/wordnet/
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/corpora/wordnet/noun.exc  
  inflating: /kaggle/working/corpora/wordnet/verb.exc  
  inflating: /kaggle/working/corpora/wordnet/README  
  inflating: /kaggle/working/corpora/wordnet/index.sense  
  inflating: /kaggle/working/corpora/wordnet/data.

In [3]:

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/clinicaltrials-gov-clinical-trials-dataset/clin_trials.csv


In [4]:
'''n = 496615 #number of records in file
s = 100000 #desired sample size

n = sum(1 for line in open('/kaggle/input/clinicaltrials-gov-clinical-trials-dataset/clin_trials.csv')) - 1 #number of records in file (excludes header)
s = 10000 #desired sample size
skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list'''
clin_trials = pd.read_csv('/kaggle/input/clinicaltrials-gov-clinical-trials-dataset/clin_trials.csv')
clin_trials = clin_trials.drop('Unnamed: 0', axis = 1)
clin_trials_ro = clin_trials
clin_trials.head(1)

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Medical Subject Headings
0,Montefiore Medical Center,OTHER,SPONSOR,Kinesiotape for Edema After Bilateral Total Kn...,"Effect of Kinesiotaping on Edema Management, P...",COMPLETED,2021-10-18,ADULT OLDER_ADULT,"Arthroplasty Complications, Arthroplasty, Repl...",TREATMENT,Kinesio(R)Tape for edema control,"Kinesio(R)Tape is an elastic, cotton tape with...",INTERVENTIONAL,,Change from baseline and during 1-2-day time i...,Edema


# Limiting Data to Sponsor and Industry

In [5]:
pd.set_option('display.max_columns', None)
print(len(clin_trials))
clin_trials = clin_trials[clin_trials['Responsible Party'] == 'SPONSOR']
clin_trials = clin_trials[clin_trials['Organization Class'] == 'INDUSTRY']
clin_trials = clin_trials.reset_index(drop = True)
print(len(clin_trials))
clin_trials.columns

496615
100864


Index(['Organization Full Name', 'Organization Class', 'Responsible Party',
       'Brief Title', 'Full Title', 'Overall Status', 'Start Date',
       'Standard Age', 'Conditions', 'Primary Purpose', 'Interventions',
       'Intervention Description', 'Study Type', 'Phases', 'Outcome Measure',
       'Medical Subject Headings'],
      dtype='object')

# Data Preprocessing

In [6]:
formats = [
    "%Y-%m-%d",             # 2023-06-01
    "%d/%m/%Y %H:%M:%S",    # 01/06/2023 14:45:00
    "%B %d, %Y",            # June 1, 2023
    "%Y.%m.%d AD at %H:%M:%S", # 2023.06.01 AD at 14:45:00
    "%d-%b-%Y",             # 01-Jun-2023
    "%Y/%m/%d %H:%M",       # 2023/06/01 14:45
    "%d/%m/%Y"              # 15/08/2023
]

In [7]:
def parse_mixed_date(date_str):
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return pd.NaT

In [8]:
clin_trials.describe()

Unnamed: 0,Organization Full Name,Organization Class,Responsible Party,Brief Title,Full Title,Overall Status,Start Date,Standard Age,Conditions,Primary Purpose,Interventions,Intervention Description,Study Type,Phases,Outcome Measure,Medical Subject Headings
count,100864,100864,100864,100864,100864,100864,100864,100864,100864,100864,100864,100864,100864,84897,100864,100864
unique,11732,1,1,100346,98334,13,6635,6,32146,10,78488,79843,3,8,87496,12406
top,GlaxoSmithKline,INDUSTRY,SPONSOR,Sun Protection Factor Assay,Unknown,COMPLETED,Unknown,ADULT OLDER_ADULT,Healthy,TREATMENT,Unknown,Unknown,INTERVENTIONAL,PHASE1,Unknown,Unknown
freq,3374,100864,100864,14,1548,65002,809,70110,4052,66338,4828,9029,85847,22969,1176,17831


In [9]:
clin_trials['Medical Subject Headings'].value_counts()

Medical Subject Headings
Unknown                                                                                                                                                                                                  17831
Neoplasms                                                                                                                                                                                                 2197
Diabetes Mellitus Diabetes Mellitus, Type 2                                                                                                                                                               1886
Breast Neoplasms                                                                                                                                                                                          1322
Arthritis Arthritis, Rheumatoid                                                                                                                    

In [10]:
clin_trials['Start Date'] = clin_trials['Start Date'].apply(parse_mixed_date)
clin_trials['Year'] = clin_trials['Start Date'].dt.year


In [11]:
categorical_cols = ['Overall Status', 'Primary Purpose', 'Study Type', 'Standard Age', 'Phases', 'Year']

In [12]:
categorical_cols

['Overall Status',
 'Primary Purpose',
 'Study Type',
 'Standard Age',
 'Phases',
 'Year']

In [13]:
clin_trials = clin_trials.drop(['Responsible Party', 'Start Date', 'Organization Class', 'Medical Subject Headings'], axis = 1)
clin_trials.columns

Index(['Organization Full Name', 'Brief Title', 'Full Title', 'Overall Status',
       'Standard Age', 'Conditions', 'Primary Purpose', 'Interventions',
       'Intervention Description', 'Study Type', 'Phases', 'Outcome Measure',
       'Year'],
      dtype='object')

In [14]:
clin_trials['Year'].nunique()

40

In [15]:
clin_trials['Year'].isna().sum()

42688

# Encoding Relevant Data

In [16]:

encoder = OneHotEncoder(sparse=True)

# Select only categorical columns for encoding
categorical_data = clin_trials[categorical_cols]

# Encode categorical columns
encoded_trials = encoder.fit_transform(categorical_data)
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_trials, columns=encoder.get_feature_names_out(categorical_cols))


In [17]:
clin_trials_encoded = clin_trials.drop(columns=categorical_cols)

# Combine with encoded DataFrame
clin_trials = pd.concat([clin_trials_encoded.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

In [18]:
clin_trials.columns

Index(['Organization Full Name', 'Brief Title', 'Full Title', 'Conditions',
       'Interventions', 'Intervention Description', 'Outcome Measure',
       'Overall Status_ACTIVE_NOT_RECRUITING',
       'Overall Status_APPROVED_FOR_MARKETING', 'Overall Status_AVAILABLE',
       'Overall Status_COMPLETED', 'Overall Status_ENROLLING_BY_INVITATION',
       'Overall Status_NOT_YET_RECRUITING',
       'Overall Status_NO_LONGER_AVAILABLE', 'Overall Status_RECRUITING',
       'Overall Status_SUSPENDED', 'Overall Status_TEMPORARILY_NOT_AVAILABLE',
       'Overall Status_TERMINATED', 'Overall Status_UNKNOWN',
       'Overall Status_WITHDRAWN', 'Primary Purpose_BASIC_SCIENCE',
       'Primary Purpose_DEVICE_FEASIBILITY', 'Primary Purpose_DIAGNOSTIC',
       'Primary Purpose_HEALTH_SERVICES_RESEARCH', 'Primary Purpose_OTHER',
       'Primary Purpose_PREVENTION', 'Primary Purpose_SCREENING',
       'Primary Purpose_SUPPORTIVE_CARE', 'Primary Purpose_TREATMENT',
       'Primary Purpose_Unknown', 'Stu

In [19]:
clin_trials['Organization Full Name'].value_counts()

Organization Full Name
GlaxoSmithKline               3374
Pfizer                        2772
Novartis                      2772
AstraZeneca                   2224
Boehringer Ingelheim          1914
                              ... 
Paragate Medical LTD             1
Kendal Nutricare Ltd             1
Oxford BioTherapeutics Ltd       1
Kinarus AG                       1
Crestone, Inc                    1
Name: count, Length: 11732, dtype: int64

# Calculating Studies per Company

In [20]:
entry_count_df = clin_trials['Organization Full Name'].value_counts().reset_index()
entry_count_df.columns = ['Organization Full Name', 'entry_count']

In [21]:
entry_count_df.head()

Unnamed: 0,Organization Full Name,entry_count
0,GlaxoSmithKline,3374
1,Pfizer,2772
2,Novartis,2772
3,AstraZeneca,2224
4,Boehringer Ingelheim,1914


In [22]:
clin_trials = clin_trials.merge(entry_count_df, on='Organization Full Name', how='left')

In [23]:
clin_trials.columns

Index(['Organization Full Name', 'Brief Title', 'Full Title', 'Conditions',
       'Interventions', 'Intervention Description', 'Outcome Measure',
       'Overall Status_ACTIVE_NOT_RECRUITING',
       'Overall Status_APPROVED_FOR_MARKETING', 'Overall Status_AVAILABLE',
       'Overall Status_COMPLETED', 'Overall Status_ENROLLING_BY_INVITATION',
       'Overall Status_NOT_YET_RECRUITING',
       'Overall Status_NO_LONGER_AVAILABLE', 'Overall Status_RECRUITING',
       'Overall Status_SUSPENDED', 'Overall Status_TEMPORARILY_NOT_AVAILABLE',
       'Overall Status_TERMINATED', 'Overall Status_UNKNOWN',
       'Overall Status_WITHDRAWN', 'Primary Purpose_BASIC_SCIENCE',
       'Primary Purpose_DEVICE_FEASIBILITY', 'Primary Purpose_DIAGNOSTIC',
       'Primary Purpose_HEALTH_SERVICES_RESEARCH', 'Primary Purpose_OTHER',
       'Primary Purpose_PREVENTION', 'Primary Purpose_SCREENING',
       'Primary Purpose_SUPPORTIVE_CARE', 'Primary Purpose_TREATMENT',
       'Primary Purpose_Unknown', 'Stu

# Combining titles and interventions 

In [24]:
combined_titles_cols = ['Brief Title', 'Full Title', 'Conditions']
combined_inter_cols = ['Interventions', 'Intervention Description']


In [25]:
def concat(columns):
    cols = [col for col in columns]
    clin_trials['combined_text'] = clin_trials[cols].agg(' '.join, axis=1)
    clin_trials.columns

In [26]:
concat(combined_titles_cols)

In [27]:
clin_trials['combined_titles'] = clin_trials['combined_text']
clin_trials = clin_trials.drop(columns = ['combined_text'])

In [28]:
concat(combined_inter_cols)


In [29]:
clin_trials['Intervention Desc'] = clin_trials['combined_text']
clin_trials = clin_trials.drop(columns = 'combined_text')


In [30]:
clin_trials = clin_trials.drop(columns = ['Brief Title', 'Full Title', 'Conditions','Interventions', 'Intervention Description'])
clin_trials.columns

Index(['Organization Full Name', 'Outcome Measure',
       'Overall Status_ACTIVE_NOT_RECRUITING',
       'Overall Status_APPROVED_FOR_MARKETING', 'Overall Status_AVAILABLE',
       'Overall Status_COMPLETED', 'Overall Status_ENROLLING_BY_INVITATION',
       'Overall Status_NOT_YET_RECRUITING',
       'Overall Status_NO_LONGER_AVAILABLE', 'Overall Status_RECRUITING',
       'Overall Status_SUSPENDED', 'Overall Status_TEMPORARILY_NOT_AVAILABLE',
       'Overall Status_TERMINATED', 'Overall Status_UNKNOWN',
       'Overall Status_WITHDRAWN', 'Primary Purpose_BASIC_SCIENCE',
       'Primary Purpose_DEVICE_FEASIBILITY', 'Primary Purpose_DIAGNOSTIC',
       'Primary Purpose_HEALTH_SERVICES_RESEARCH', 'Primary Purpose_OTHER',
       'Primary Purpose_PREVENTION', 'Primary Purpose_SCREENING',
       'Primary Purpose_SUPPORTIVE_CARE', 'Primary Purpose_TREATMENT',
       'Primary Purpose_Unknown', 'Study Type_EXPANDED_ACCESS',
       'Study Type_INTERVENTIONAL', 'Study Type_OBSERVATIONAL',
     

# Preprocessing Text for Vectorization

In [31]:
def preprocess_text(text):
    # Remove punctuation including '|'
    text = text.translate(str.maketrans('', '', string.punctuation + '|')).lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [32]:
clin_trials['combined_titles'] = clin_trials['combined_titles'].apply(preprocess_text)


In [33]:
clin_trials['Intervention Desc'] = clin_trials['Intervention Desc'].apply(preprocess_text)

In [34]:
clin_trials['Outcome Measure'] = clin_trials['Outcome Measure'].apply(preprocess_text)

In [35]:
vectorizer = TfidfVectorizer(min_df = .01, max_df = .9)

In [36]:
vec_title = vectorizer.fit_transform(clin_trials['combined_titles'])

In [37]:
vec_title_df = pd.DataFrame.sparse.from_spmatrix(vec_title, columns=vectorizer.get_feature_names_out())

In [38]:
vec_intervention = vectorizer.fit_transform(clin_trials['Intervention Desc'])

In [39]:
vec_intervention_df = pd.DataFrame.sparse.from_spmatrix(vec_intervention, columns = vectorizer.get_feature_names_out())

In [40]:
vec_outcome = vectorizer.fit_transform(clin_trials['Outcome Measure'])

In [41]:
vec_outcome_df = pd.DataFrame.sparse.from_spmatrix(vec_outcome, columns = vectorizer.get_feature_names_out())

# Scaling Data

In [42]:
non_sparse_cols = ['entry_count']
sparse_scaler = MaxAbsScaler()
title_sparse = sparse_scaler.fit_transform(vec_title_df)
intervention_sparse = sparse_scaler.fit_transform(vec_intervention_df)
outcome_sparse = sparse_scaler.fit_transform(vec_outcome_df)
non_sparse_scaler = StandardScaler()
scaled_non_sparse_data = non_sparse_scaler.fit_transform(clin_trials[non_sparse_cols])



In [43]:
title_sparse_df = pd.DataFrame.sparse.from_spmatrix(title_sparse, columns=vec_title_df.columns)
intervention_sparse_df = pd.DataFrame.sparse.from_spmatrix(intervention_sparse, columns=vec_intervention_df.columns)
outcome_sparse_df = pd.DataFrame.sparse.from_spmatrix(outcome_sparse, columns=vec_outcome_df.columns)
scaled_non_sparse_df = pd.DataFrame(scaled_non_sparse_data, columns=non_sparse_cols)


In [44]:
concatenated_columns = (
    list(clin_trials[['Organization Full Name']].columns) +
    list(title_sparse_df.columns) +  
    list(intervention_sparse_df.columns) + 
    list(outcome_sparse_df.columns) + 
    list(encoded_df.columns) +
    list(scaled_non_sparse_df.columns) # entry_count should be here
)

In [45]:
clin_trials = pd.concat([
    clin_trials[['Organization Full Name']].reset_index(drop=True),
    title_sparse_df.reset_index(drop=True),
    intervention_sparse_df.reset_index(drop=True),
    outcome_sparse_df.reset_index(drop=True),
    scaled_non_sparse_df.reset_index(drop=True),
    encoded_df.reset_index(drop=True),
], axis=1, ignore_index=True)
clin_trials.columns = concatenated_columns

# Resolving duplicate column names as a result of vectorization. 

In [46]:
def resolve_duplicate_columns(df):
    """
    Resolve duplicate column names in a DataFrame by appending _[i] to duplicate columns.

    Parameters:
    df (pandas.DataFrame): Input DataFrame.

    Returns:
    pandas.DataFrame: DataFrame with unique column names.
    """
    seen = {}
    new_columns = []
    for column in df.columns:
        if column in seen:
            seen[column] += 1
            new_columns.append(f"{column}_{seen[column]}")
        else:
            seen[column] = 1
            new_columns.append(column)
    df.columns = new_columns
    return df

In [47]:
clin_trials = resolve_duplicate_columns(clin_trials)

In [48]:
len(clin_trials.columns)

700

In [49]:
clin_trials_grouped = clin_trials.groupby('Organization Full Name').mean().reset_index()
    

# Separating out the 'Organization Full Name' for Dimensionality Reduction

In [50]:
X = clin_trials_grouped.drop(columns=['Organization Full Name'])
y = clin_trials_grouped['Organization Full Name']

In [51]:
n_components_list = [2, 5, 10, 20, 50, 100, 200, 300, 400]

In [52]:
explained_variance_results = {}

for n in n_components_list:
    # Apply TruncatedSVD
    svd = TruncatedSVD(n_components=n, random_state=42)
    X_reduced = svd.fit_transform(X)

    # Calculate explained variance ratio
    explained_variance = svd.explained_variance_ratio_
    total_explained_variance = explained_variance.sum()

    # Store the results
    explained_variance_results[n] = total_explained_variance

# Print the results
for n, variance in explained_variance_results.items():
    print(f"n_components={n}: Total explained variance={variance:.4f}")

n_components=2: Total explained variance=0.1375
n_components=5: Total explained variance=0.3081
n_components=10: Total explained variance=0.4262
n_components=20: Total explained variance=0.5384
n_components=50: Total explained variance=0.6667
n_components=100: Total explained variance=0.7547
n_components=200: Total explained variance=0.8552
n_components=300: Total explained variance=0.9140
n_components=400: Total explained variance=0.9523


# Using 400 components to get 95% of the variation. 

In [53]:
calc_svd = TruncatedSVD(n_components = 400, random_state=42)
calc_X = calc_svd.fit_transform(X)
clin_trials_calc = pd.concat([y.reset_index(drop = True), pd.DataFrame(calc_X)], axis = 1)

# Function to Find Companies Similar To Target

In [54]:
def find_closest_organizations(organization_name, df = clin_trials_calc, n=5, prnt = False):
    """
    Find the n closest organization names to the given organization name.

    Parameters:
    organization_name (str): The organization name to search for.
    n (int): Number of closest organization names to return.

    Returns:
    list: List of the n closest organization names.
    """
    # Find the index of the organization name
    matches = df['Organization Full Name'].str.contains(organization_name, na=False)

    if matches.sum() == 0:
        print('ERROR: Organization Not Found in Dataset')
    elif matches.sum() > 1:
        print('ERROR: Query Results in Multiple Organizations')
    else:
        idx = matches.idxmax()
    # Retrieve the corresponding reduced feature vector
    target_vector = df.iloc[idx, 1:].values  # Assuming feature vectors start from the second column

    # Calculate cosine distances between the target vector and all other vectors
    distances = cosine_distances([target_vector], df.iloc[:, 1:].values)  # Exclude the organization name column

    # Get the indices of the n smallest distances
    closest_indices = np.argsort(distances[0])[:n+1]
    
    closest_indices = closest_indices[closest_indices != idx]

    # Return the closest organization names
    closest_orgs = df.iloc[closest_indices, 0].tolist()  # Assuming organization names are in the first column
    if prnt:
        for number, org in enumerate(closest_orgs, start=1):
            print(f"{number}: {org}")
    else:
        return closest_orgs

In [55]:
# In my opinion the below worked better than a google search. At least for Sarpeta (chosen for my knowledge of the company)

# Simply searching for Duchnenne Muscular Dystrophy on Google, which is a specialization of Sarepta, I could not quickly find NS Pharma, 
# which also specializes in Duchnenne Muscular Dystrophy. 

# All of the top five except Vertex focuses (or focused) solely on rare diseases, 
#and even Vertex does significant research in the area as well as genetic research. 
#All of the top 10 were either focused on rare diseases or CNS diseases. 


## Find the closest organizations to your target here

In [56]:
find_closest_organizations('TriReme', n = 10, prnt = True)

1: Terumo Europe N.V.
2: Biotronik AG
3: C. R. Bard
4: Cook Group Incorporated
5: Carl Zeiss Meditec, Inc.
6: Merit Medical Systems, Inc.
7: Medtronic - MITG
8: W.L.Gore & Associates
9: Abbott Medical Devices
10: Biotronik, Inc.


In [57]:
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(X)
tsne_results_df = pd.DataFrame(tsne_results, columns=['t-SNE Component 1', 't-SNE Component 2'])

# Add 'Organization Full Name' column to tsne_results_df
tsne_results_df['Organization Full Name'] = clin_trials_calc['Organization Full Name']

In [58]:
def plot_closest_organizations(target_org, n_closest=25):
    """
    Plot the closest organizations to a target organization based on t-SNE results.

    Parameters:
    - tsne_results_df (pandas.DataFrame): DataFrame containing t-SNE results.
    - target_org (str): The target organization.
    - n_closest (int): Number of closest organizations to visualize. Default is 20.
    """
    # Find the closest organizations to the target organization
    closest_orgs = find_closest_organizations(target_org, n=n_closest)
    # Create an empty figure
    fig = go.Figure()

    # Iterate through the closest organizations and add them to the plot
    for org in closest_orgs:
        closest_org_df = tsne_results_df[tsne_results_df['Organization Full Name'] == org]
        fig.add_trace(go.Scatter(
            x=closest_org_df['t-SNE Component 1'],
            y=closest_org_df['t-SNE Component 2'],
            mode='markers',
            marker=dict(size=8),
            name=org,
        ))

    # Add the target organization with a different marker
    target_org_df = tsne_results_df[tsne_results_df['Organization Full Name'].str.contains(target_org, case=False)]
    fig.add_trace(go.Scatter(
        x=target_org_df['t-SNE Component 1'],
        y=target_org_df['t-SNE Component 2'],
        mode='markers',
        marker=dict(color='blue', size=12, symbol='x'),
        name=target_org,
    ))

    # Update layout
    fig.update_layout(
        title=f'Closest Organizations to {target_org}',
        xaxis_title='t-SNE Component 1',
        yaxis_title='t-SNE Component 2',
    )

    # Show the plot
    fig.show()

# Graphing Based on Company
#### Note: this will not be as accurate as the function that simply returns a list in terms of closest to the target. But it might be of interest to see graphical.
#### Other Note: The Legend is kept, it is an accurate representation of the order of closeness as determined by the find_closest_organization function

In [59]:
plot_closest_organizations('TriReme')

In [60]:
tsne_results_df[tsne_results_df['Organization Full Name'].str.contains('Sur')]

Unnamed: 0,t-SNE Component 1,t-SNE Component 2,Organization Full Name
10,70.153633,-14.974838,1st SurgiConcept
16,15.907576,18.669657,270Surgical
55,20.694832,4.056235,7D Surgical Inc.
164,2.644733,-21.640793,ARKSurgical
280,23.534355,4.067831,"Acera Surgical, Inc."
...,...,...,...
10133,6.112061,29.190468,Surgimab
10362,21.935312,-14.464003,Talon Surgical
10443,67.536964,-5.913893,Teleon Surgical B.V.
10584,14.698752,7.275657,Think Surgical Inc.
