In [55]:
import os
import pandas as pd

def merge_csv_files(folder_path):
    # Get a list of all CSV files in the specified folder
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    # Initialize an empty DataFrame to store merged data
    merged_df = pd.DataFrame(columns=["question", "context", "score"])

    # Iterate through each CSV file
    for file in csv_files:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(os.path.join(folder_path, file))

        # Extract required columns if they exist in the DataFrame
        if "question" in df.columns and "context" in df.columns and "score" in df.columns:
            # Append only the required columns to the merged DataFrame
            merged_df = pd.concat([merged_df, df[["question", "context", "score"]]], ignore_index=True)
        else:
            print(f"Skipping {file} as it does not contain all required columns.")

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv("merged_training_data.csv", index=False)
    print("Merged CSV file has been saved.")

# Specify the folder path where CSV files are located
training_folder_path = "training"

# Call the function to merge CSV files
merge_csv_files(training_folder_path)


Merged CSV file has been saved.


In [56]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/qnli-distilroberta-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/qnli-distilroberta-base')

features = tokenizer([('How many people live in Berlin?', 'What is the size of New York?'),('Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.', 'New York City is famous for the Metropolitan Museum of Art.')],  padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = torch.nn.functional.sigmoid(model(**features).logits)
    print(scores)




tensor([[0.0009],
        [0.0231]])


In [57]:
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer

class CustomRobertaClassifier(nn.Module):
    def __init__(self, num_labels):
        super(CustomRobertaClassifier, self).__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained("cross-encoder/qnli-distilroberta-base")
        # self.tokenizer = RobertaTokenizer.from_pretrained("cross-encoder/qnli-distilroberta-base")
        
        # Freeze the RoBERTa model weights
        for param in self.roberta.parameters():
            param.requires_grad = False
        
        # Make the classifier part trainable
        for param in self.roberta.classifier.parameters():
            param.requires_grad = True
        
        # Modify the output layer to have num_labels output neurons
        self.roberta.classifier.out_proj = nn.Linear(self.roberta.config.hidden_size, num_labels)
        
    def forward(self, input_ids,attention_mask):
        # Forward pass through RoBERTa model
        
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask).logits
        
        return outputs

Custom_model=CustomRobertaClassifier(1)

In [58]:
Custom_model

CustomRobertaClassifier(
  (roberta): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
               

In [59]:
from prettytable import PrettyTable
def count_parameters(model):
    table = PrettyTable(["Modules","Parameters"])
    total_params = 0
    total_trainganle_params = 0
    for name, parameter in model.named_parameters():
        params = parameter.numel()
        total_params+=params
        table.add_row([name, params])
        if not parameter.requires_grad: 
            continue
        else:
            total_trainganle_params+=params
        
    print(table)
    print(f"Total Trainable Params: {total_trainganle_params}")
    print(f"Total Params: {total_params}")
    print(f"Ratio : {total_trainganle_params/total_params}")
    return total_params

In [60]:
count_parameters(model)

+-----------------------------------------------------------+------------+
|                          Modules                          | Parameters |
+-----------------------------------------------------------+------------+
|         roberta.embeddings.word_embeddings.weight         |  38603520  |
|       roberta.embeddings.position_embeddings.weight       |   394752   |
|      roberta.embeddings.token_type_embeddings.weight      |    768     |
|            roberta.embeddings.LayerNorm.weight            |    768     |
|             roberta.embeddings.LayerNorm.bias             |    768     |
|    roberta.encoder.layer.0.attention.self.query.weight    |   589824   |
|     roberta.encoder.layer.0.attention.self.query.bias     |    768     |
|     roberta.encoder.layer.0.attention.self.key.weight     |   589824   |
|      roberta.encoder.layer.0.attention.self.key.bias      |    768     |
|    roberta.encoder.layer.0.attention.self.value.weight    |   589824   |
|     roberta.encoder.lay

82119169

In [61]:
count_parameters(model=Custom_model)

+-------------------------------------------------------------------+------------+
|                              Modules                              | Parameters |
+-------------------------------------------------------------------+------------+
|         roberta.roberta.embeddings.word_embeddings.weight         |  38603520  |
|       roberta.roberta.embeddings.position_embeddings.weight       |   394752   |
|      roberta.roberta.embeddings.token_type_embeddings.weight      |    768     |
|            roberta.roberta.embeddings.LayerNorm.weight            |    768     |
|             roberta.roberta.embeddings.LayerNorm.bias             |    768     |
|    roberta.roberta.encoder.layer.0.attention.self.query.weight    |   589824   |
|     roberta.roberta.encoder.layer.0.attention.self.query.bias     |    768     |
|     roberta.roberta.encoder.layer.0.attention.self.key.weight     |   589824   |
|      roberta.roberta.encoder.layer.0.attention.self.key.bias      |    768     |
|   

82119169

In [62]:
591361/82119169 *100

0.7201254070167222

In [63]:
#dataloader
from torch.utils.data import DataLoader
train_dataset=pd.read_csv("merged_training_data.csv")

In [64]:
train_dataset.head()

Unnamed: 0,question,context,score
0,what is Model Registry,models. We will now move on to the other criti...,0
1,what is Model Registry,relevant elements of the context of your syste...,0
2,what is Model Registry,model format abstraction and Model Registry c...,0
3,what is Model Registry,"models: sklearn, XGBoost, TensorFlow, H20, fas...",0
4,what is Model Registry,Introducing Model Registry 95\r\nIn the ML...,0


In [85]:
from torch.utils.data import DataLoader
import pandas as pd
from torch.utils.data import Dataset
from transformers import RobertaTokenizer
import torch

class CustomImageDataset(Dataset):
    def __init__(self, tokenizer, max_length=512):
        self.img_labels = pd.read_csv("merged_training_data.csv")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        text = list(self.img_labels.iloc[idx, 0:2])
        label = torch.tensor(self.img_labels.iloc[idx, 2], dtype=torch.int)

        # Tokenize and pad the sequences
        token = self.tokenizer(text, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")
        
        # Flatten the tokens
        inputs_ids=token['input_ids']
        attention_mask = token['attention_mask']
        # print(inputs_ids.shape)
        inputs_ids = inputs_ids.flatten()
        
        attention_mask = attention_mask.flatten()
        return inputs_ids,attention_mask, label

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("cross-encoder/qnli-distilroberta-base")

# Create the dataset and dataloader
train_dataset = CustomImageDataset(tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Display image and label.
input_ids,attention_mask, train_labels = next(iter(train_dataloader))
print(f"input batch shape: {input_ids.shape}")
print(f"attention batch shape: {attention_mask.shape}")
print(f"Label: {train_labels.shape}")




input batch shape: torch.Size([39, 1024])
attention batch shape: torch.Size([39, 1024])
Label: torch.Size([39])


In [86]:
Custom_model(input_ids=input_ids,attention_masks=attention_mask)

RuntimeError: The expanded size of the tensor (1024) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [39, 1024].  Tensor sizes: [1, 514]

In [None]:
from sentence_transformers import CrossEncoder

In [None]:

model3 = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")