In [3]:
# Use the %pip magic so packages are installed into the running kernel's environment;
# pandas and numpy are listed because the notebook imports them directly.
%pip install -q transformers torch scikit-learn pandas numpy

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW

In [2]:
# Model definition
class BertRegressor(torch.nn.Module):
    """BERT encoder with a small bias-free feed-forward head predicting one scalar.

    The head feeds BERT's pooled output through ``dense1`` (hidden -> 4*hidden),
    an optional activation, ``dense2`` (4*hidden -> hidden) and ``score``
    (hidden -> 1).

    Args:
        pretrained_name: Model name or path handed to ``BertModel.from_pretrained``.
        activation: Optional module applied between ``dense1`` and ``dense2``.
            ``None`` (default) keeps the head purely linear, which is what
            existing checkpoints were trained with. Note that without an
            activation the stacked linear layers are mathematically a single
            linear map; pass e.g. ``torch.nn.GELU()`` to give the head
            non-linear capacity. Parameter-free activations leave the
            state-dict keys unchanged.
    """

    def __init__(self, pretrained_name='bert-base-uncased', activation=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)

        hidden_size = self.bert.config.hidden_size
        self.dense1 = torch.nn.Linear(hidden_size, 4 * hidden_size, bias=False)
        # Identity is a no-op, so the default forward pass matches the original head.
        self.activation = activation if activation is not None else torch.nn.Identity()
        self.dense2 = torch.nn.Linear(4 * hidden_size, hidden_size, bias=False)
        self.score = torch.nn.Linear(hidden_size, 1, bias=False)

    def forward(self, input_ids, attention_mask):
        """Encode the batch with BERT and return the head's prediction.

        Args:
            input_ids: Token ids, passed to BERT as ``input_ids``.
            attention_mask: Mask passed to BERT as ``attention_mask``.

        Returns:
            Output of the ``score`` layer (last dimension of size 1).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        hidden_out = self.dense1(pooled_output)
        hidden_out = self.activation(hidden_out)
        hidden_out = self.dense2(hidden_out)

        return self.score(hidden_out)
        

In [None]:
# Model definition (alternative LLaMA-based regressor, currently disabled)
# from transformers import LlamaModel
# from transformers import LlamaTokenizer
# class LALMARegressor(torch.nn.Module):
#     def __init__(self):
#         super(LALMARegressor, self).__init__()
#         self.model = LlamaModel.from_pretrained('enoch/llama-65b-hf')
#         self.regressor = torch.nn.Linear(self.model.config.hidden_size, 1)  # Predicting a single value
    
#     def forward(self, input_ids, attention_mask):
#         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output
#         return self.regressor(pooled_output)

In [3]:


# Example dataset
def prepare_dataframe(file_name):
    """Load an issue CSV and return a two-column frame ready for training.

    Args:
        file_name: Path to a CSV file with at least the columns ``title`` and
            ``storypoint``.

    Returns:
        ``pd.DataFrame`` with columns ``text`` (the title, a single blank when
        it was missing) and ``label`` (the story point).

    Raises:
        KeyError: If ``title`` or ``storypoint`` is not a column of the CSV.
    """
    data = pd.read_csv(file_name)

    # A row without a story point has no regression target. Filling it with a
    # blank string (as a frame-wide fillna would) turns the label column into
    # mixed str/number values that cannot be converted to float tensors later.
    data = data.dropna(subset=['storypoint'])

    # Missing titles are replaced by a blank so the tokenizer never sees NaN.
    titles = data['title'].fillna(' ').tolist()
    labels = data['storypoint'].tolist()

    # Guarded so an empty CSV yields an empty frame instead of an IndexError.
    if titles:
        print("Input data feed ::: ", titles[0])
    return pd.DataFrame(data={'text': titles, 'label': labels})

df = prepare_dataframe("./sp_dataset/marked_data/appceleratorstudio.csv")
texts = df["text"].tolist()     # issue titles fed to the model
targets = df["label"].tolist()  # numeric values the model should predict

# Split the dataset into training and validation sets.
# random_state is fixed so a kernel restart reproduces the same split and the
# reported train/validation losses stay comparable between runs.
train_texts, val_texts, train_targets, val_targets = train_test_split(
    texts, targets, test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = LlamaTokenizer.from_pretrained("enoch/llama-65b-hf")
# Custom dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for regression.

    Args:
        texts: Sequence of input strings.
        targets: Sequence of numeric targets, indexed in parallel with ``texts``.
        tokenizer: Callable used to encode a text. When ``None`` (default) the
            module-level ``tokenizer`` is looked up at item-access time, which
            is the original behaviour.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, targets, tokenizer=None, max_length=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``targets`` for ``idx``."""
        # Prefer the tokenizer given to the constructor; fall back to the
        # notebook-level global so existing two-argument construction still works.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        inputs = encode(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" adds a leading batch axis; flatten removes it
            # so the DataLoader can stack items itself.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[idx], dtype=torch.float),
        }

# Load data
BATCH_SIZE = 15
LEARNING_RATE = 5e-5

train_dataset = TextDataset(train_texts, train_targets)
val_dataset = TextDataset(val_texts, val_targets)

# Only the training loader shuffles; validation order does not affect the mean loss.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

model = BertRegressor()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
# transformers' own AdamW is deprecated in favour of torch.optim.AdamW.
# eps and weight_decay are pinned to the values transformers.AdamW used by
# default so the optimisation hyper-parameters stay what they were.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0
)
mse_loss = torch.nn.MSELoss()

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is a mapping with the keys ``input_ids``, ``attention_mask``
    and ``targets``. The model is switched to training mode, and for every
    batch the gradients are reset, the loss is back-propagated and the
    optimizer takes one step.

    Returns:
        The mean of the per-batch loss values (``np.mean`` of Python floats).
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        optimizer.zero_grad()

        predictions = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        expected = batch['targets'].to(device)

        # The model's trailing axis is squeezed so predictions line up with the targets.
        batch_loss = loss_fn(predictions.squeeze(-1), expected)
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        optimizer.step()

    return np.mean(batch_losses)

def evaluate(model, data_loader, loss_fn, device):
    """Compute the mean per-batch loss over ``data_loader`` without training.

    The model is put into evaluation mode and every forward pass runs under
    ``torch.no_grad()``. Batches are mappings with the keys ``input_ids``,
    ``attention_mask`` and ``targets``.

    Returns:
        The mean of the per-batch loss values (``np.mean`` of Python floats).
    """
    model = model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            predictions = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            expected = batch['targets'].to(device)
            batch_loss = loss_fn(predictions.squeeze(-1), expected)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

model_state_path = "./bert_classify/"
NUM_EPOCHS = 20

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first epoch finishes.
os.makedirs(model_state_path, exist_ok=True)

# Simplified training loop: train, validate, checkpoint, report.
for epoch in range(NUM_EPOCHS):
    train_loss = train_epoch(model, train_loader, mse_loss, optimizer, device)
    val_loss = evaluate(model, val_loader, mse_loss, device)

    # Checkpoint file names use the 0-based epoch index, while the printed
    # epoch number below is 1-based.
    torch.save(model.state_dict(), f"{model_state_path}bert_clasify_epo_{epoch}.pth")
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')


Input data feed :::  Add CA against object literals in function invocations




Epoch 1, Train Loss: 12.9638, Val Loss: 8.2530
Epoch 2, Train Loss: 11.5827, Val Loss: 7.0798
Epoch 3, Train Loss: 9.8386, Val Loss: 7.2741
Epoch 4, Train Loss: 7.4384, Val Loss: 7.4619
Epoch 5, Train Loss: 5.7182, Val Loss: 8.9034
Epoch 6, Train Loss: 3.8868, Val Loss: 8.7257
Epoch 7, Train Loss: 3.0316, Val Loss: 8.4452
Epoch 8, Train Loss: 2.4456, Val Loss: 9.1925
Epoch 9, Train Loss: 2.2657, Val Loss: 8.1639


KeyboardInterrupt: 

In [6]:
from transformers import PreTrainedTokenizer
# from custom_transformers_interpret import  SequenceClassificationExplainer


def get_bertsp_pipeline(text: str, checkpoint_path: str = "./bert_classify/bert_clasify_epo_8.pth"):
    """Predict a value for ``text`` with a ``BertRegressor`` restored from disk.

    The model and tokenizer are built on every call and inference runs on CPU.

    Args:
        text: Input string to score.
        checkpoint_path: State-dict file written by the training loop.
            Defaults to the checkpoint the notebook originally loaded.

    Returns:
        The raw model output tensor (one row, one column for a single text).

    Raises:
        RuntimeError: If the checkpoint contains no weights for some of the
            regression-head layers, since predictions would then come from
            randomly initialised parameters.
    """
    model = BertRegressor()
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(checkpoint_path, map_location='cpu')

    # strict=False is kept so key differences inside the BERT encoder are
    # tolerated, but missing head weights must not pass silently.
    load_result = model.load_state_dict(state_dict=state_dict, strict=False)
    missing_head_keys = [
        key for key in load_result.missing_keys if not key.startswith('bert.')
    ]
    if missing_head_keys:
        raise RuntimeError(
            f"Checkpoint {checkpoint_path!r} lacks regression-head weights: {missing_head_keys}"
        )
    model.eval()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Truncate to the same 512-token limit used during training so over-long
    # inputs do not exceed BERT's position embeddings.
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outs = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )

    return outs

# Score one example issue title and show the raw output tensor.
outsss = get_bertsp_pipeline(
    text='Parameter content assist showing HTML tags',
)
print("out : ", outsss)

out :  tensor([[7.4228]])
