In [32]:
!ls ~/dev/ai/Novia/lib/serach-comparison/refactored/mock_transcripts

interview_0.txt  interview_13.txt interview_18.txt interview_5.txt
interview_1.txt  interview_14.txt interview_19.txt interview_6.txt
interview_10.txt interview_15.txt interview_2.txt  interview_7.txt
interview_11.txt interview_16.txt interview_3.txt  interview_8.txt
interview_12.txt interview_17.txt interview_4.txt  interview_9.txt


In [38]:
import openai  # for generating embeddings
import os  # for environment variables
import pandas as pd  # for DataFrames to store article sections and embeddings
import re  # for cutting <ref> links out of Wikipedia articles
from dotenv import load_dotenv  # for loading environment variables from a .env file
load_dotenv()  # load environment variables from .env file

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [33]:
import os
import pandas as pd

# Define the path to the directory containing the transcript files
transcript_dir = os.path.expanduser("~/dev/ai/Novia/lib/serach-comparison/refactored/mock_transcripts")

# Initialize an empty list to store the data
data = []

# Loop over the txt files in the directory
for filename in os.listdir(transcript_dir):
    if filename.endswith(".txt"):
        interview_name = filename.replace(".txt", "")
        with open(os.path.join(transcript_dir, filename), 'r') as file:
            lines = file.readlines()
            order_in_interview = 1
            for line in lines:
                if ": " in line:
                    person, text = line.split(": ", 1)
                    data.append([interview_name, person, text.strip(), order_in_interview])
                    order_in_interview += 1

# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=["interview_name", "person", "text", "order_in_interview"])

# Display the DataFrame
df.head()



Unnamed: 0,interview_name,person,text,order_in_interview
0,interview_18,Ava Silverstone,"Today, one of our key topics revolves around i...",1
1,interview_18,Benjamin Knox,"Absolutely, Ava. And, from an environmental st...",2
2,interview_18,Ava Silverstone,"Exactly, Benjamin. Incorporating adaptive meth...",3
3,interview_18,Benjamin Knox,"On that note, considering the advances in data...",4
4,interview_18,Ava Silverstone,That's a great point. By integrating machine l...,5


In [35]:
# Sort the DataFrame by 'interview_name' and 'order_in_interview'
df = df.sort_values(by=['interview_name', 'order_in_interview'])

# Display the sorted DataFrame
df


Unnamed: 0,interview_name,person,text,order_in_interview
196,interview_0,Samira Joshi,The first step in our digital twin technology ...,1
197,interview_0,Daniel Reed,"Absolutely, Samira. From a manufacturing persp...",2
198,interview_0,Anna Liu,One challenge we face in the research area is ...,3
199,interview_0,Erik Gomez,"That's an interesting point, Anna. From the en...",4
200,interview_0,Christine Bailey,And I think it's crucial we don't overlook the...,5
...,...,...,...,...
934,interview_9,Jessica Li,"Also, from a data interaction and management s...",75
935,interview_9,Mark Wright,And let’s not overlook the environmental impac...,76
936,interview_9,Elena Mirov,I think a continued focus on advancing our dig...,77
937,interview_9,Thomas Connery,It also positions us well for partnerships and...,78


In [48]:

EMBEDDING_MODEL = "text-embedding-3-small"
BATCH_SIZE = 1000  # you can submit up to 2048 embedding inputs per request

# Initialize an empty list to store the embeddings
embeddings = []

# Loop over the text data in batches
for batch_start in range(0, len(df['text']), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = df['text'][batch_start:batch_end].tolist()
    print(f"Batch {batch_start} to {batch_end-1}")
    response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response.data):
        assert i == be.index  # double check embeddings are in same order as input
    batch_embeddings = [e.embedding for e in response.data]
    embeddings.extend(batch_embeddings)

# Assign the embeddings to the DataFrame
df = df.assign(embedding=embeddings)

# Display the DataFrame with embeddings
df.head()


Batch 0 to 999
Batch 1000 to 1999


Unnamed: 0,interview_name,person,text,order_in_interview,embedding
196,interview_0,Samira Joshi,The first step in our digital twin technology ...,1,"[-0.023288175463676453, 0.01607142761349678, 0..."
197,interview_0,Daniel Reed,"Absolutely, Samira. From a manufacturing persp...",2,"[0.016718469560146332, 0.042272020131349564, 0..."
198,interview_0,Anna Liu,One challenge we face in the research area is ...,3,"[0.014805459417402744, 0.005351570434868336, 0..."
199,interview_0,Erik Gomez,"That's an interesting point, Anna. From the en...",4,"[-0.008645583875477314, 0.04594981297850609, 0..."
200,interview_0,Christine Bailey,And I think it's crucial we don't overlook the...,5,"[0.053517550230026245, 0.03046734631061554, 0...."


In [1]:
df[0]

NameError: name 'df' is not defined

In [49]:
# Save the DataFrame to a CSV file in the data folder
df.to_csv('data/interviews_with_embeddings.csv', index=False)


In [50]:
import ast

# Load data/papers_with_umap.csv into a DataFrame called papers
papers = pd.read_csv('data/papers_with_umap.csv')
papers['embedding'] = papers['embedding'].apply(ast.literal_eval)

# Display the first few rows of the DataFrame
papers.head()


Unnamed: 0,title,citations,year,authors,journal,abstract,keywords,score,doi,text,...,labels_18,labels_19,labels_20,labels_21,labels_22,labels_23,labels_24,labels_25,umap_x,umap_y
0,Maritime Digital Twin architecture: A concept ...,26.0,2021.0,"Jan-Erik Giering, Alexander Dyck",Automatisierungstechnik,Digital Twins (DTs) play an important role in ...,Maritime Digital Twin Architecture; shipping; ...,-1,,Maritime Digital Twin architecture: A concept ...,...,9,9,9,7,7,7,7,7,-1.253527,-5.120859
1,Real-time digital twin for ship operation in w...,19.0,2022.0,"Jae-Hoon Lee, Yoon-Seo Nam, Yonghwan Kim, Yumi...",Ocean Engineering,This paper introduces a real-time digital twin...,Digital twin; Ship operation in waves; Real-ti...,90,,Real-time digital twin for ship operation in w...,...,9,9,9,7,7,7,7,7,0.786554,-5.820701
2,Real-time digital twin of research vessel for ...,16.0,2021.0,"Major, Pierre Yann; Li, Guoyuan; Zhang, Houxia...",Proceedings of 35th European Council for Model...,Real-time digital twins of ships in operation ...,Virtual Prototyping; Digital Twin; Remote Moni...,90,,Real-time digital twin of research vessel for ...,...,9,9,9,7,7,7,7,7,-0.32192,-4.949813
3,Digital Twin in the Maritime Domain: A Review ...,6.0,2023.0,"Nuwan Sri Madusanka,Yijie Fan,Shaolong Yang an...",J. Mar. Sci. Eng.,This paper highlights the development of Digit...,digital twin; digitalization; smart shipping; ...,80,,Digital Twin in the Maritime Domain: A Review ...,...,9,9,9,7,7,7,7,7,-1.241211,-5.13723
4,From digital twin to maritime data space: Tran...,16.0,,"Ørnulf Jan Rødseth, Arne J. Berre",Proc. 13th Int. Symp. Integr. Ship’s Inf. Syst...,The concept of the digital twin is gaining mom...,,-1,,From digital twin to maritime data space: Tran...,...,9,9,9,7,7,7,7,7,-1.226664,-5.342246


In [55]:
from sklearn.metrics.pairwise import cosine_similarity

def render_interview_with_references(df, papers):
    # Initialize an empty list to store the interview segments with references
    interview_with_references = []

    # Loop over each interview segment
    for index, row in df.iterrows():
        segment_text = row['text']
        segment_embedding = row['embedding']

        # Calculate cosine similarity between the segment embedding and all paper embeddings
        similarities = cosine_similarity([segment_embedding], papers['embedding'].tolist())[0]

        # Find the indices of the top 3 most similar papers
        top_indices = similarities.argsort()[-3:][::-1]

        # Get the titles and similarity scores of the top 3 most similar papers
        top_papers = papers.iloc[top_indices]['title'].tolist()
        top_scores = similarities[top_indices].tolist()

        # Append each top paper as a separate row with the segment text
        for paper_title, score in zip(top_papers, top_scores):
            interview_with_references.append({
                'interview_name': row['interview_name'],
                'person': row['person'],
                'text': segment_text,
                'order_in_interview': row['order_in_interview'],
                'paper_title': paper_title,
                'distance': score
            })

    # Convert the list to a DataFrame
    return pd.DataFrame(interview_with_references)

# Example usage
interview_with_references_df = render_interview_with_references(df, papers)

# Display the DataFrame
print(interview_with_references_df.head())

  interview_name        person  \
0    interview_0  Samira Joshi   
1    interview_0  Samira Joshi   
2    interview_0  Samira Joshi   
3    interview_0   Daniel Reed   
4    interview_0   Daniel Reed   

                                                text  order_in_interview  \
0  The first step in our digital twin technology ...                   1   
1  The first step in our digital twin technology ...                   1   
2  The first step in our digital twin technology ...                   1   
3  Absolutely, Samira. From a manufacturing persp...                   2   
4  Absolutely, Samira. From a manufacturing persp...                   2   

                                         paper_title  distance  
0  Data Driven Digital Twins for the Maritime Domain  0.749131  
1      Predictive Digital Twins for Autonomous Ships  0.730036  
2      Digital Twin for Firefighting System on Ships  0.727889  
3  Digital Twin for Structural Monitoring and Pre...  0.551318  
4       Using

In [56]:
# Display the DataFrame
interview_with_references_df.head()

Unnamed: 0,interview_name,person,text,order_in_interview,paper_title,distance
0,interview_0,Samira Joshi,The first step in our digital twin technology ...,1,Data Driven Digital Twins for the Maritime Domain,0.749131
1,interview_0,Samira Joshi,The first step in our digital twin technology ...,1,Predictive Digital Twins for Autonomous Ships,0.730036
2,interview_0,Samira Joshi,The first step in our digital twin technology ...,1,Digital Twin for Firefighting System on Ships,0.727889
3,interview_0,Daniel Reed,"Absolutely, Samira. From a manufacturing persp...",2,Digital Twin for Structural Monitoring and Pre...,0.551318
4,interview_0,Daniel Reed,"Absolutely, Samira. From a manufacturing persp...",2,Using Digital Twin in a Shipbuilding Project,0.541175


In [70]:
from IPython.display import display, HTML
import pandas as pd

def list_interviews_as_html(df, papers, interview_name, threshold=0.7):
    html_output = f"""
    <style>
        .container {{
            display: flex;
            flex-direction: row;
        }}
        .interview-column {{
            width: 50%;
            padding: 10px;
            border-right: 1px solid #ccc;
        }}
        .preview-column {{
            width: 50%;
            padding: 10px;
            position: -webkit-sticky; /* For Safari */
            position: sticky;
            top: 0;
            height: 100vh;
            overflow-y: auto;
        }}
        .tooltip {{
            color: blue;
            text-decoration: underline;
            cursor: pointer;
        }}
        .highlight {{
            background-color: #f0f0f0;
        }}
        .active-link {{
            background-color: blue;
            color: white;
        }}
    </style>
    <h1>{interview_name}</h1>
    <div class="container">
        <div class="interview-column">
    """
    
    # Filter the DataFrame by the specified interview and threshold
    filtered_df = df[(df['interview_name'] == interview_name) & (df['distance'] > threshold)]
    
    # Sort by 'order_in_interview' and 'paper_title'
    sorted_df = filtered_df.sort_values(by=['order_in_interview', 'paper_title'])
    
    # Group by the unique segments to consolidate paper titles
    grouped = sorted_df.groupby(['person', 'text', 'order_in_interview'])

    for _, group in grouped:
        person = group['person'].iloc[0]
        text = group['text'].iloc[0]
        
        # Collect the paper titles, abstracts, and DOIs
        relevant_papers = group['paper_title'].tolist()
        relevant_abstracts = [papers.loc[papers['title'] == title, 'abstract'].values[0] if isinstance(papers.loc[papers['title'] == title, 'abstract'].values[0], str) else "" for title in relevant_papers]
        relevant_dois = [papers.loc[papers['title'] == title, 'doi'].values[0] for title in relevant_papers]
        
        # Format the paper titles with clickable links
        papers_str = ""
        for title, abstract, doi in zip(relevant_papers, relevant_abstracts, relevant_dois):
            safe_abstract = abstract.replace("'", "\\'")
            papers_str += f"""<a class="tooltip" onclick="showPreview('{title}', '{safe_abstract}', '{doi}')">{title}</a>, """
        papers_str = f" <small>[{papers_str[:-2]}]</small>" if relevant_papers else ""
        
        # Add the formatted string to the HTML output
        html_output += f"<p><b>{person}:</b> {text}{papers_str}</p>\n"
    
    html_output += """
        </div>
        <div class="preview-column" id="preview">
            <h1>Paper Title</h1>
            <p>Abstract text will appear here when you click a paper title.</p>
        </div>
    </div>
    <script>
        function showPreview(title, abstract, doi) {
            // Update the preview content
            document.getElementById('preview').innerHTML = '<h1>' + title + '</h1><p>' + abstract + '</p><a href="https://doi.org/' + doi + '" target="_blank">Read full paper</a>';

            // Highlight the clicked link
            var links = document.querySelectorAll('.tooltip');
            links.forEach(link => link.classList.remove('active-link'));
            event.target.classList.add('active-link');

            // Highlight the sentence
            var paragraphs = document.querySelectorAll('.interview-column p');
            paragraphs.forEach(paragraph => paragraph.classList.remove('highlight'));
            event.target.closest('p').classList.add('highlight');
        }
    </script>
    """
    
    return html_output

# Example usage
interview_name = "interview_0"  # Specify the interview you want to render
html_result = list_interviews_as_html(interview_with_references_df, papers, interview_name)

# Display the HTML in the Jupyter notebook
display(HTML(html_result))

In [71]:
import os

# Ensure the output directory exists
output_dir = "docs/interviews"
os.makedirs(output_dir, exist_ok=True)

# Generate HTML for each interview and save to file
interview_names = interview_with_references_df['interview_name'].unique()
for interview_name in interview_names:
    html_result = list_interviews_as_html(interview_with_references_df, papers, interview_name)
    output_path = os.path.join(output_dir, f"{interview_name}.html")
    with open(output_path, "w") as file:
        file.write(html_result)
