In [1]:
import os
import pandas as pd

def merge_csv_files(folder_path, output_path="merged_training_data.csv"):
    """Merge the question/context/score columns of every CSV file in a folder.

    Every file in ``folder_path`` whose name ends with ``.csv`` is read with
    pandas. Files that contain all three required columns contribute those
    columns (and only those) to the result; any other file is reported and
    skipped. The combined rows are written to ``output_path`` without an
    index column.

    Args:
        folder_path: Directory that is scanned (non-recursively) for CSV files.
        output_path: Destination of the merged CSV. Defaults to
            ``"merged_training_data.csv"`` in the current working directory.

    Returns:
        None. The result is written to disk and a confirmation is printed.
    """
    required_columns = ["question", "context", "score"]
    output_abspath = os.path.abspath(output_path)

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the merged file reproducible across runs and machines.
    csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))

    # Collect the per-file frames and concatenate once at the end instead of
    # re-copying an ever-growing DataFrame on every iteration.
    frames = []
    for file in csv_files:
        csv_path = os.path.join(folder_path, file)

        # If the output lives inside the scanned folder, never feed a previous
        # merge result back in (a re-run would otherwise duplicate every row).
        if os.path.abspath(csv_path) == output_abspath:
            continue

        df = pd.read_csv(csv_path)

        if all(column in df.columns for column in required_columns):
            frames.append(df[required_columns])
        else:
            print(f"Skipping {file} as it does not contain all required columns.")

    if frames:
        merged_df = pd.concat(frames, ignore_index=True)
    else:
        # Nothing usable found: still emit a header-only CSV.
        merged_df = pd.DataFrame(columns=required_columns)

    # Write the merged DataFrame to a new CSV file
    merged_df.to_csv(output_path, index=False)
    print("Merged CSV file has been saved.")

# Folder (relative to the notebook's working directory) that holds the source CSV files
training_folder_path = "training"

# Merge them; the function writes merged_training_data.csv and prints a confirmation
merge_csv_files(training_folder_path)


Merged CSV file has been saved.


In [2]:
%pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
from sentence_transformers import CrossEncoder
# Load the "cross-encoder/ms-marco-MiniLM-L-6-v2" checkpoint by name; this is
# the model object that the later cells train (model.fit) and save (model.save).
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the merged question/context/score data written by merge_csv_files()
df=pd.read_csv("merged_training_data.csv")

In [5]:
# Preview the first rows to sanity-check the loaded columns
df.head()

Unnamed: 0,question,context,score
0,what is Model Registry,models. We will now move on to the other criti...,0
1,what is Model Registry,relevant elements of the context of your syste...,0
2,what is Model Registry,model format abstraction and Model Registry c...,0
3,what is Model Registry,"models: sklearn, XGBoost, TensorFlow, H20, fas...",0
4,what is Model Registry,Introducing Model Registry 95\r\nIn the ML...,0


In [7]:
from sentence_transformers import InputExample

# Build the training set. Each DataFrame row contributes two examples that
# share the row's score as label: (question, context) followed by the swapped
# pair (context, question), in row order.
train_samples = [
    example
    for q_text, ctx_text, relevance in zip(df['question'], df['context'], df['score'])
    for example in (
        InputExample(texts=[q_text, ctx_text], label=relevance),
        InputExample(texts=[ctx_text, q_text], label=relevance),
    )
]

In [8]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
# Evaluator handed to model.fit() to report performance during training.
# NOTE(review): it is built from train_samples, i.e. the same examples the
# model is trained on, so despite the "sts-dev" name it measures fit to the
# training data rather than held-out performance. Consider a separate dev split.
evaluator = CECorrelationEvaluator.from_input_examples(train_samples, name="sts-dev")

In [16]:
# Training hyperparameters consumed by the DataLoader and model.fit() cells
train_batch_size = 32  # examples per batch
num_epochs = 25  # full passes over the training samples

In [17]:
from torch.utils.data import DataLoader
# Wrap the InputExample list in a DataLoader that yields shuffled batches of
# train_batch_size examples for model.fit()
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

In [18]:
import math
# Total training steps = batches per epoch (len(train_dataloader)) * num_epochs;
# the warm-up covers the first 10% of them, rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of total training steps (not of the data)
# logger.info("Warmup-steps: {}".format(warmup_steps))
# Train the model on the shuffled batches for num_epochs epochs.
# NOTE(review): the recorded progress output shows 3 batches per epoch, so the
# whole run is about 75 steps, far below evaluation_steps=10000. The step-based
# evaluation therefore presumably never triggers mid-epoch; confirm when the
# evaluator actually runs, or lower evaluation_steps.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=10000,
    warmup_steps=warmup_steps,
    # output_path=model_save_path,
    # use_amp=True,
)

Iteration: 100%|██████████| 3/3 [00:07<00:00,  2.48s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.23s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.23s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.07s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.05s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.22s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.21s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.05s/it]
Iteration: 100%|██████████| 3/3 [00:07<00:00,  2.55s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.21s/it]
Iteration: 100%|██████████| 3/3 [00:07<00:00,  2.42s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.32s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.24s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.27s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.03s/it]
Iteration: 100%|██████████| 3/3 [00:05<00:00,  1.98s/it]
Iteration: 100%|██████████| 3/3 [00:06<00:00,  2.30s/it]
Iteration: 100%|██████████| 3/3

In [19]:
# Persist the trained cross-encoder under models/cross-encoder-trained
model.save("models/cross-encoder-trained")

In [30]:
# Keep at most the first four rows of df (all of them when df is shorter).
m = min(4, len(df))
df = df[:m]

In [31]:
# Join the 'context' column of df into one string, with a blank line
# ("\n\n") appended after every passage.
context = "".join(passage + "\n\n" for passage in df['context'])

In [32]:
# Size of the assembled context string, in characters
len(context)

1069