# 0- `Config`

In [1]:
# Path Management
import pickle
import os
# Data Handling 
import pandas as pd
import numpy as np
# Transformers 
from transformers import GPT2Model, GPT2Tokenizer
# Tokenization
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
try:
    if manager == 1:
        print("Jupyter has been running from Manager") 
except:
    temp  = "temp"
    %run s0_config.ipynb 

# 1- `Load`

In [3]:
import_name = "subject_data"
with open(os.path.join(access_data_path(f"{process_barcode}/data/{folder_setup}", f"{import_name}" + ".pkl")) , 'rb') as file:
    subject_data = pickle.load(file)   

In [4]:
subject_data

Unnamed: 0,Cell_Type,Gene_Marker,Expression,P Value,P Value(Adj),Score,Expression_Gene
0,hematopoietic stem cell,PRSS57,1.8250,3.733220e-99,3.934440e-95,21.135736,"[1.825013518333435, 'PRSS57']"
1,hematopoietic stem cell,ENSG00000175061,2.7580,3.352268e-93,1.766478e-89,20.478439,"[2.757690668106079, 'ENSG00000175061']"
2,hematopoietic stem cell,SPINK2,1.7420,5.660589e-89,2.386278e-85,19.998631,"[1.7422393560409546, 'SPINK2']"
3,hematopoietic stem cell,SOX4,1.7340,3.750766e-80,7.187150e-77,18.958588,"[1.7343593835830688, 'SOX4']"
4,hematopoietic stem cell,SMIM24,1.4780,1.085542e-78,1.906754e-75,18.780741,"[1.4784719944000244, 'SMIM24']"
...,...,...,...,...,...,...,...
51,double negative T regulatory cell,"CD3D,CD3E,CST7,GZMA,NKG7",2.3152,1.470268e-231,5.165050e-228,35.086993,
52,double negative T regulatory cell,"CD3D,CD3E,CST7,CTSW,NKG7",2.3286,1.470268e-231,5.165050e-228,34.929794,
53,double negative T regulatory cell,"CD3D,CD3E,GZMA,CTSW,NKG7",2.3194,1.470268e-231,5.165050e-228,34.775430,
54,double negative T regulatory cell,"CD3D,CST7,GZMA,CTSW,NKG7",2.2566,1.470268e-231,5.165050e-228,34.478230,


In [5]:
#subject_data = subject_data.iloc[:100]

In [6]:
all_in_one = pd.DataFrame() 
all_in_one["Cell_Type"]        = subject_data["Cell_Type"]
all_in_one["Gene_Marker"]      = subject_data["Gene_Marker"]
all_in_one["Expression"]       = subject_data["Expression"]
all_in_one["Expression_Gene"]  = subject_data["Expression_Gene"]  

# 2.A- `Operation` | BERT

In [7]:
from transformers import BertModel, BertTokenizer

### 1-) `Context Construction`

In [8]:
texts = subject_data['Cell_Type'] + " " + subject_data['Gene_Marker']
texts = texts.astype(str)
texts 

0                        hematopoietic stem cell PRSS57
1               hematopoietic stem cell ENSG00000175061
2                        hematopoietic stem cell SPINK2
3                          hematopoietic stem cell SOX4
4                        hematopoietic stem cell SMIM24
                            ...                        
51    double negative T regulatory cell CD3D,CD3E,CS...
52    double negative T regulatory cell CD3D,CD3E,CS...
53    double negative T regulatory cell CD3D,CD3E,GZ...
54    double negative T regulatory cell CD3D,CST7,GZ...
55    double negative T regulatory cell CD3E,CST7,GZ...
Length: 2652, dtype: object

### 2-) `Transformer Selection`

In [9]:
# Step 1: Load pre-trained model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Step 2: Tokenization 
tokenized = texts.apply(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=512, return_tensors="pt")) 
# Step 3: Model Selection  
model = BertModel.from_pretrained('bert-base-uncased')

### 3-) `Tokenization`

In [10]:

batch_size = 32
Combined_Text_Embeddings = []

# Assuming model is already loaded and set to evaluation mode with model.eval()

for i in tqdm(range(0, len(tokenized), batch_size)):
    # Here, make sure you're extracting batches correctly
    batch = tokenized[i:i+batch_size]
    
    # Now, prepare your inputs. This assumes each entry in `batch` is a dictionary
    # returned by the tokenizer corresponding to a single example.
    batch_input_ids = torch.stack([item['input_ids'] for item in batch])
    batch_attention_mask = torch.stack([item['attention_mask'] for item in batch])
    
    # Ensure dimensions are correct: [batch_size, sequence_length]
    # If you encounter an error here, check the structure and content of `batch`
    inputs = {
        'input_ids': batch_input_ids.squeeze(),  # Remove any extra dimensions
        'attention_mask': batch_attention_mask.squeeze()  # Remove any extra dimensions
    }
    
    with torch.no_grad():
        outputs = model(**inputs)
        batch_embeddings = outputs.pooler_output
        Combined_Text_Embeddings.append(batch_embeddings)

# Combine embeddings from all batches
Embeddings_BERT = torch.cat(Combined_Text_Embeddings, dim=0)

#Embeddings_BERT = 1 

100%|██████████| 83/83 [04:42<00:00,  3.41s/it]


In [11]:
Embeddings_BERT.shape

torch.Size([2652, 768])

### 4-) `Main Data Creation`

In [12]:
#data_full["Cell_Type_Embeddings"] = Embeddings_BERT  
all_in_one["Gene_Marker_BERT"] = [embedding.tolist() for embedding in Embeddings_BERT] 

In [13]:
#embeddings_list = data_full["Cell_Type_Embeddings"].tolist()  # Get the column as a list of lists
#embeddings_tensor = torch.tensor(embeddings_list)             # Convert to a tensor 

# 2.B- `Operation` | GPT2 

In [14]:
from transformers import GPT2Model, GPT2Tokenizer

### 1-) `Context Construction`

In [15]:
texts = subject_data['Cell_Type'] + " " + subject_data['Gene_Marker']
texts = texts.astype(str)
texts 

0                        hematopoietic stem cell PRSS57
1               hematopoietic stem cell ENSG00000175061
2                        hematopoietic stem cell SPINK2
3                          hematopoietic stem cell SOX4
4                        hematopoietic stem cell SMIM24
                            ...                        
51    double negative T regulatory cell CD3D,CD3E,CS...
52    double negative T regulatory cell CD3D,CD3E,CS...
53    double negative T regulatory cell CD3D,CD3E,GZ...
54    double negative T regulatory cell CD3D,CST7,GZ...
55    double negative T regulatory cell CD3E,CST7,GZ...
Length: 2652, dtype: object

### 2-) `Transformer Selection`

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token 

model = GPT2Model.from_pretrained('gpt2') 

### 3-) `Tokenization`

In [17]:
batch_size = 32
Combined_Text_Embeddings = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size].tolist()
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)
        # Use the mean of the last hidden state as embedding representation
        embeddings = outputs.last_hidden_state.mean(dim=1)
        Combined_Text_Embeddings.append(embeddings)

# Combine all batch embeddings into a single tensor
Embeddings_GPT = torch.cat(Combined_Text_Embeddings, dim=0)

100%|██████████| 83/83 [00:17<00:00,  4.68it/s]


In [18]:
Embeddings_GPT.shape

torch.Size([2652, 768])

### 4-) `Main Data Creation`

In [19]:
# Step 1 
#data_full = subject_data.copy() 
# Step 2: 
#data_full["Cell_Type_Embeddings"] = Embeddings_BERT  
all_in_one["Gene_Marker_GPT2"] = [embedding.tolist() for embedding in Embeddings_GPT]

# 2.C- `Operation` | GPT3 

In [20]:
import openai 
from openai.embeddings_utils import get_embeddings  
openai.api_key = OPENAI_API_KEY 

### 1-) `Context Construction`

In [21]:
texts = subject_data['Cell_Type'] + " " + subject_data['Gene_Marker']
texts = texts.astype(str)
texts 

0                        hematopoietic stem cell PRSS57
1               hematopoietic stem cell ENSG00000175061
2                        hematopoietic stem cell SPINK2
3                          hematopoietic stem cell SOX4
4                        hematopoietic stem cell SMIM24
                            ...                        
51    double negative T regulatory cell CD3D,CD3E,CS...
52    double negative T regulatory cell CD3D,CD3E,CS...
53    double negative T regulatory cell CD3D,CD3E,GZ...
54    double negative T regulatory cell CD3D,CST7,GZ...
55    double negative T regulatory cell CD3E,CST7,GZ...
Length: 2652, dtype: object

### 2-) `Transformer Selection`

In [22]:
"""
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token 

model = GPT2Model.from_pretrained('gpt2') 
"""

"\ntokenizer = GPT2Tokenizer.from_pretrained('gpt2')\ntokenizer.pad_token = tokenizer.eos_token \n\nmodel = GPT2Model.from_pretrained('gpt2') \n"

### 3-) `Tokenization`

In [23]:
# Placeholder for embeddings
embeddings_list = []

for text in tqdm(texts):
    response = openai.Embedding.create(
      input=text,
      engine="text-embedding-ada-002"  # or another suitable GPT-3 model
    )
    embedding = response['data'][0]['embedding']
    embeddings_list.append(embedding)

# Convert list of embeddings to a suitable format (e.g., numpy array)
Embeddings_GPT = np.array(embeddings_list)

100%|██████████| 2652/2652 [17:13<00:00,  2.57it/s]


In [24]:
Embeddings_GPT.shape

(2652, 1536)

### 4-) `Main Data Creation`

In [25]:
# Step 1 
#data_full = subject_data.copy() 
# Step 2: 
#data_full["Cell_Type_Embeddings"] = Embeddings_BERT  
all_in_one["Gene_Marker_GPT3"] = [embedding.tolist() for embedding in Embeddings_GPT]

---
# Expression

In [26]:
from sklearn.preprocessing   import StandardScaler
scaler = StandardScaler() 

In [27]:
all_in_one["Expression_Normalized"] = scaler.fit_transform(all_in_one["Expression"].values.reshape(-1, 1))  

In [28]:
embeddings_BERT = np.stack(all_in_one["Gene_Marker_BERT"].values)  
scaler = StandardScaler() 
all_in_one["Gene_Marker_BERT_Normalized"] = list(scaler.fit_transform(embeddings_BERT)) 

In [29]:
embeddings_GPT2 = np.stack(all_in_one["Gene_Marker_GPT2"].values)  
scaler = StandardScaler() 
all_in_one["Gene_Marker_GPT2_Normalized"] = list(scaler.fit_transform(embeddings_GPT2)) 

In [30]:
embeddings_GPT3 = np.stack(all_in_one["Gene_Marker_GPT3"].values)  
scaler = StandardScaler() 
all_in_one["Gene_Marker_GPT3_Normalized"] = list(scaler.fit_transform(embeddings_GPT3)) 

# 3- `End`

In [31]:
process_barcode = "process_3"
export_name = "all_in_one" 
with open(os.path.join(access_data_path(f"{process_barcode}/data/preprocessed", f"{export_name}" + ".pkl"))  , 'wb') as file: 
    pickle.dump(all_in_one, file)   