<a href="https://colab.research.google.com/github/OmkarK-7/Atten./blob/main/finetune_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import random
import json
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset

reference_table = pd.read_csv('reference-table.csv')
reference_table_dict = reference_table.to_dict(orient='records')
reference_table_dict

[{'Stored in Table': 'NEW',
  'Value Name': 'New',
  'Value Description': 'Order has been placed but not yet processed',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'PND',
  'Value Name': 'Pending',
  'Value Description': 'Order is awaiting further action',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'PRC',
  'Value Name': 'Processing',
  'Value Description': 'Order is being processed',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'PCK',
  'Value Name': 'Picked',
  'Value Description': 'Order items have been picked for shipping',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'PKD',
  'Value Name': 'Packed',
  'Value Description': 'Order has been packed',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'SHP',
  'Value Name': 'Shipped',
  'Value Description': 'Order has been shipped out from the warehouse',
  'Column Name ': 'Order Status'},
 {'Stored in Table': 'OOD',
  'Value Name': 'Out for Delivery',
  'Value Description': 'Order i

In [4]:
import os
import json

data_model = {}

# Path to the data model directory
data_model_dir = 'https://github.com/OmkarK-7/Atten./tree/8722a35c6f03b603da3a9282ea6ba47623df0ac8/DataModel/ecom_tables_and_columns_new_with_dimensions'

# Traverse through the subfolders and read JSON files
for subdir, _, files in os.walk(data_model_dir):
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(subdir, file)
            with open(file_path, 'r') as f:
                table_data = json.load(f)
                table_name = table_data['table_name']
                data_model[table_name] = table_data

print(json.dumps(data_model, indent=4))  # Print to verify the loaded data

{}


In [None]:
# Loading pre-trained model( using gpt-2 for this case) and tokenizer.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create training data this will take a lot of time and not fully complete.
train_data = [
    {"input": "What is the average duration from order placement to shipment across different product categories, segmented by customer regions and order volume?",
     "output": {
         "tables": ["order", "product", "customer"],
         "columns": ["order.order_date", "order.shipped_date", "product.category", "customer.region", "order.order_volume"],
         "relationships": [{"from_table": "order", "from_column": "product_id", "to_table": "product", "to_column": "product_id"},
                           {"from_table": "order", "from_column": "customer_id", "to_table": "customer", "to_column": "customer_id"}]
     },
     "input": "How can I determine the peak shopping hours by analyzing the times at which customers complete their purchases on our e-commerce platform?",
     "output": {
         "tables": ["order", "product", "customer"],
         "columns": ["order.order_date", "order.shipped_date", "product.category", "customer.region", "order.order_volume"],
         "relationships": [{"from_table": "order", "from_column": "product_id", "to_table": "product", "to_column": "product_id"},
                           {"from_table": "order", "from_column": "customer_id", "to_table": "customer", "to_column": "customer_id"}]
     }

     },
    # Have to add more question-answer pairs
]

# Tokenize and prepare data for training
def preprocess_data(data):
    inputs = tokenizer([item['input'] for item in data], return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = tokenizer([str(item['output']) for item in data], return_tensors='pt', truncation=True, padding=True, max_length=512)
    return inputs, outputs

inputs, outputs = preprocess_data(train_data)

# Training arguments
training_args = TrainingArguments(output_dir='./results',
                                  num_train_epochs=3,
                                  per_device_train_batch_size=4,
                                  save_steps=10_000,
                                  save_total_limit=2,
                                  )

trainer = Trainer(model=model, args=training_args, train_dataset=inputs, eval_dataset=outputs,)
trainer.train()

**Creating the Validation Dataset**

You can create a validation dataset from the provided questions or use some other questions if available. The process involves selecting a subset of the questions, manually mapping them to the relevant columns and tables, and then using this subset for validation.

In [None]:
# Load the Questions
questions_df = pd.read_csv('questions.csv')

# Select a Subset for Validation
# Assuming the 'questions.csv' has a column named 'Question'
validation_subset = questions_df.sample(n=10)  # Select 10 random questions for validation

# Step 3: Manually Map Selected Questions
# For demonstration purposes, we are creating dummy mappings. In practice, this should be done manually.
manual_mappings = {
    "What is the average duration from order placement to shipment across different product categories, segmented by customer regions and order volume?": {
        "tables": ["order", "product", "customer"],
        "columns": ["order.order_date", "order.shipped_date", "product.category", "customer.region", "order.order_volume"],
        "relationships": [
            {"from_table": "order", "from_column": "product_id", "to_table": "product", "to_column": "product_id"},
            {"from_table": "order", "from_column": "customer_id", "to_table": "customer", "to_column": "customer_id"}
        ]
    },
    "How can I determine the peak shopping hours by analyzing the times at which customers complete their purchases on our e-commerce platform?": {
        "tables": ["order", "customer"],
        "columns": ["order.order_time", "customer.customer_id"],
        "relationships": [
            {"from_table": "order", "from_column": "customer_id", "to_table": "customer", "to_column": "customer_id"}
        ]
    },
    # Have to add more mappings for the selected questions
}

# Prepare the Validation Dataset
validation_data = []
for question in validation_subset['Question']:
    if question in manual_mappings:
        validation_data.append({
            "question": question,
            "mapping": manual_mappings[question]
        })

# Save the Validation Dataset
with open('validation_dataset.json', 'w') as f:
    json.dump(validation_data, f, indent=4)

print("Validation dataset created and saved as 'validation_dataset.json'")

In [None]:
# Load the validation dataset
with open('validation_dataset.json', 'r') as f:
    validation_data = json.load(f)

In [None]:
# Load pre-trained model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare validation data
def preprocess_data(data):
    inputs = [item['question'] for item in data]
    outputs = [json.dumps(item['mapping']) for item in data]

    input_encodings = tokenizer(inputs, truncation=True, padding=True, max_length=512, return_tensors='pt')
    output_encodings = tokenizer(outputs, truncation=True, padding=True, max_length=512, return_tensors='pt')

    return input_encodings, output_encodings

validation_inputs, validation_outputs = preprocess_data(validation_data)


In [None]:
class ValidationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

# Create validation dataset
val_dataset = ValidationDataset(validation_inputs, validation_outputs)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=val_dataset,
)

# Evaluate the model
eval_results = trainer.evaluate()

print("Evaluation results:", eval_results)

# Using the Finetuned Model

After fine-tuning the model and validated its performance, we should set up a pipeline to interact with the model and ask questions.

In [None]:
# Load the fine-tuned model and tokenizer
model_path = './results/checkpoint-last'  # Model's save path
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

In [None]:
def ask_question(question):
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Generate the model's output
    with torch.no_grad():
        outputs = model.generate(inputs['input_ids'], max_length=512, num_return_sequences=1)

    # Decode the output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return output_text

In [None]:
def parse_output(output_text):
    try:
        output_data = json.loads(output_text)
        return output_data
    except json.JSONDecodeError:
        return {"error": "Failed to parse model output."}

# Example usage
question = "Give me top 5 customers with most number of orders."
raw_output = ask_question(question)
parsed_output = parse_output(raw_output)

print(json.dumps(parsed_output, indent=4))