In [None]:
import os
import PyPDF2
import re
import tqdm

# Function to extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The text of all pages joined in page order with no separator.
        Pages for which ``extract_text()`` yields nothing (``None`` or ``""``)
        contribute nothing; a PDF with no pages yields ``""``.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Call extract_text() exactly once per page and join at the end,
        # rather than growing a string inside the loop.
        page_texts = [page.extract_text() for page in reader.pages]
    return "".join(page_text for page_text in page_texts if page_text)

# Function to extract the "Methods" section from the extracted text
def extract_methods_section(text):
    """Return the "Methods" section found in a paper's extracted text.

    The section starts at the leftmost occurrence of "Methods" or
    "Materials and Methods" and runs up to (not including) the next line that
    begins with at least two capital letters. When no such line follows, the
    section runs to the end of the text.

    Args:
        text: Full extracted text of a paper.

    Returns:
        str: The matched section including its heading, or the sentinel
        string "Methods section not found." when nothing matches or ``text``
        is empty/None.
    """
    if not text:
        return "Methods section not found."
    # The \Z alternative lets the match end at the end of the document, so a
    # Methods section that is the last section is still returned.
    pattern = r"(Methods|Materials and Methods)(.*?)(?=\n[A-Z]{2,}|\Z)"
    match = re.search(pattern, text, re.DOTALL)
    if match is None:
        return "Methods section not found."
    return match.group(0)  # heading + body

# Function to process all PDFs in a folder and extract the full text of each one
def process_pdf_files(folder_path):
    """Extract the full text of every PDF located directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each PDF's filename (not its full path) to the text
        returned by ``extract_text_from_pdf``. Entries are inserted in sorted
        filename order.
    """
    # Select the PDFs up front so the progress bar counts only real work:
    # match the extension case-insensitively (".PDF" too), skip directories
    # that merely end in ".pdf", and sort because os.listdir order is arbitrary.
    pdf_names = sorted(
        name
        for name in os.listdir(folder_path)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(folder_path, name))
    )

    pdf_text_dict = {}
    for filename in tqdm.tqdm(pdf_names):
        pdf_path = os.path.join(folder_path, filename)
        pdf_text_dict[filename] = extract_text_from_pdf(pdf_path)
    return pdf_text_dict

# Extract the text of every PDF in the papers folder.
folder_path = '/home/ubuntu/bascvi/data/scrna_papers/pdf'  # Replace with your actual folder path
all_text = process_pdf_files(folder_path)  # dict: PDF filename -> extracted text

import pickle

# Save the extracted text to disk. Note this is the full text of each PDF,
# not only its Methods section; a later cell reloads this file.
with open("/home/ubuntu/bascvi/data/scrna_papers/text_dict.pickle", "wb") as f:
    pickle.dump(all_text, f)

# Next step (cells below): compute embeddings for the extracted text.

In [None]:
import pickle 
# Reload the {filename: text} dict that the extraction cell pickled.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles you produced yourself.
with open("/home/ubuntu/bascvi/data/scrna_papers/text_dict.pickle", "rb") as f:
    all_text = pickle.load(f)

In [None]:
import os

from openai import OpenAI

# Read the API key from the environment so it is never stored in the notebook.
# A missing OPENAI_API_KEY fails immediately with a KeyError instead of
# surfacing later as a failed request.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])



In [None]:
import pandas as pd

# Tabulate the extracted text: one row per PDF, with the dict key in
# `filename` and the dict value in `text`.
all_text_df = pd.DataFrame(
    data=[(pdf_name, pdf_text) for pdf_name, pdf_text in all_text.items()],
    columns=['filename', 'text'],
)
all_text_df

In [None]:
def get_embedding(text, model="text-embedding-3-small", max_chars=8000 * 3):
    """Request an embedding for ``text`` from the module-level OpenAI ``client``.

    Newlines are replaced with spaces and the result is cut to ``max_chars``
    characters before the request is sent.

    Args:
        text: Document text to embed.
        model: Name of the embedding model to request.
        max_chars: Character budget applied before the request. The default
            (24,000) is a character-count heuristic for an "8k tokens" target
            at roughly 3 characters per token; it is not an exact token count.

    Returns:
        The embedding returned for the text, or ``None`` when ``text`` is not
        a non-empty string or when the request raises.
    """
    # Missing cells (None/NaN) and empty strings cannot be embedded; return
    # None without spending an API call, matching the failure value below.
    if not isinstance(text, str) or not text:
        return None

    text = text.replace("\n", " ")[:max_chars]

    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        # Best effort: report the failure and let the caller carry on with
        # the remaining documents.
        print(e)
        return None

# Embed every document with the large model, one request per row; rows whose
# request raised get None (see get_embedding).
all_text_df['large_embedding'] = all_text_df['text'].apply(lambda x: get_embedding(x, model='text-embedding-3-large'))
all_text_df  # NOTE(review): not the last expression in this cell, so presumably nothing is displayed here

# Save the DataFrame, including the new embedding column, to disk.
with open("/home/ubuntu/bascvi/data/scrna_papers/text_df_embeddings.pickle", "wb") as f:
    pickle.dump(all_text_df, f)

In [None]:
# read in csv
import pandas as pd
import numpy as np

# Study-level metadata; its `unique_id` column is the join key used below.
metadata_df = pd.read_csv('/home/ubuntu/bascvi/data/phenomic_scrna_studies.csv')

# Normalise filenames before joining on `unique_id`: strip only a trailing
# ".pdf" extension (any case), leaving ".pdf" elsewhere in the name intact,
# then turn spaces into underscores.
all_text_df['filename'] = (
    all_text_df['filename']
    .str.replace(r'\.pdf$', '', case=False, regex=True)
    .str.replace(' ', '_', regex=False)
)

# Inner join: papers without a matching `unique_id` are dropped.
all_text_df = all_text_df.merge(metadata_df, left_on='filename', right_on='unique_id', how='inner')
all_text_df

In [None]:
import umap

import matplotlib.pyplot as plt

# Keep only the rows that actually have an embedding (failed requests are None).
all_text_emb = all_text_df.loc[all_text_df['large_embedding'].notnull(), 'large_embedding'].tolist()

# Project the embeddings to 2-D. UMAP is stochastic, so a fixed random_state
# is set to make the layout reproducible across kernel restarts.
umap_embeddings = umap.UMAP(n_neighbors=5, min_dist=0.3, random_state=42).fit_transform(all_text_emb)

# Static scatter plot of the 2-D projection.
plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1])
plt.title('UMAP Embeddings')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.show()

In [None]:
import plotly.express as px

# Create a DataFrame with the 2-D UMAP coordinates, one row per embedded paper.
umap_df = pd.DataFrame(umap_embeddings, columns=['UMAP Dimension 1', 'UMAP Dimension 2'])

# UMAP was fit only on rows with a non-null embedding, so pick the labels from
# those same rows and assign them positionally (as a list). Assigning the
# Series directly would align on index and attach labels to the wrong points
# whenever a row without an embedding was skipped.
has_embedding = all_text_df['large_embedding'].notnull()
umap_df['color'] = all_text_df.loc[has_embedding, 'scrna_seq_technology'].tolist()

# Plot the UMAP embeddings, coloured by scRNA-seq technology.
fig = px.scatter(umap_df, x='UMAP Dimension 1', y='UMAP Dimension 2', color='color')
fig.update_layout(title='UMAP Embeddings', xaxis_title='UMAP Dimension 1', yaxis_title='UMAP Dimension 2')
fig.show()