# Finetuning Using Google Gemma's Model

In [None]:
!pip install nltk
!pip install datasets
!pip install transformers[torch]
!pip install tokenizers
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece
!pip install huggingface_hub

In [None]:
import nltk
import evaluate
import numpy as np
import os
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
os.environ["HF_TOKEN"] = "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"

In [None]:
my_question = "Identify the product in the sentence.Sentence:I brought iphone 15 pro 256gb in india for 145520inr."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)



<pad> iPhone 15 pro 256gb</s>


In [None]:
my_question = "Find category of the given product. Category: iphone 15 pro 256gb"
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> Electronics</s>


In [None]:
my_question="Find the cheapest country among France=₹8822, Australia=₹20367, Japan=₹23625, China=₹49062, United Kingdom=₹54448, Indonesia=₹55765, Germany=₹69748, United States=₹71286, Thailand=₹71765, Canada=₹97881"
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> France</s>


In [None]:
my_question= "Recommend the cheapest country based on the given input to buy the product.Product=iphone 15 pro,price:'$85',Inr: 9879 INR, Country=usa."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")

outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> It is cheaper to purchase iPhone 15 pro in United States at a cost of 85 than in United States.</s>


In [None]:
my_question="Recommend the vendor based on given data. Wayfair=(52222.0,210,1.9), Lazada=(15.5,2303,4.7), Etsy=(7.41,3937,4.0), Shopify=(4.25,3782,2.2), Etsy=(2.1,7112,2.1)'"
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad>We suggest you to buy from vendor Wayfair as they have: 1. Best rating of 1.9


In [None]:
my_question = 'Provide me the delivery details based on the input. Country=Australia, base price= 556.88 AUD, Delivery cost= 55.69 AUD, tax= 94.67 AUD,total_price_local=707.2399999999999 AUD,total_price_inr=38898.2'
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad>The expected on-hand product price from Australia, including delivery and tax, is 556.88 AUD (base price) + 55.69 AUD (delivery cost) + 94.67 AUD (Custom Duty tax of 17


In [None]:
my_question= " What is the price of the product in Germany? The price of United States is ₹5356, Canada is ₹17848, United Kingdom is ₹18081, Indonesia is ₹47288, France is ₹53398, Australia is ₹68053, Thailand is ₹76484, China is ₹87357, Germany is ₹87593, Japan is ₹89796."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> The price of the product in Germany is <unk> 87593</s>


In [None]:
my_question= "Extract the product name ans the price in the form of python dictionary from the given invoice. Invoice: INVOICE #US-2060 INVOICE DATE 10/4/2024 BILL TO Customer 5495 129 church stSuite 11, Chicago 46810 QTY DESCRIPTION UNIT PRICE AMOUNT  1 dresser wireless 102.42 102.421 toaster heavy-duty 434.11 434.111 drill compact 30.46 30.46 Subtotal 566.99 Sales Tax 35.44 TOTAL 602.43 Terms and conditions Confidentiality: All confidential information will be kept confidential by both parties."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> <unk> 'dresser wireless': 102.42, 'toaster heavy-duty': 434.11, 'drill compact': 30.46, 'Total': 602.43125<unk></s>


In [None]:
my_question= "Recommend the best country to buy the products in the bill based on the data. Product Data: Huawei Nova 10 Pro, country=Australia, India price=₹31183, country price=₹17527, percentage difference=43.79%."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> It is cheaper to purchase Huawei Nova 10 Pro in Australia at a cost of 17527 compared to India at 31183. You save 43.79%.</s>


In [None]:
my_question= "Which country has the minimum cost for Apple iPad mini (2021), United Kingdom or France or United States? The Indian value of Australia= ₹14627, Canada= ₹11327, China= ₹36005, France= ₹43624, Germany= ₹40156, Indonesia= ₹31793, Japan= ₹22321, Thailand= ₹47069, United Kingdom= ₹13776, United States= ₹45334."
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt")
outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0])
print(answer)

<pad> United Kingdom has the minimum cost for Apple iPad mini (2021).</s>


In [None]:
model.to('cpu')

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print_trainable_parameters(model)
count_trainable_params(model)

trainable params: 247577856 || all params: 247577856 || trainable%: 100.0


247577856

In [None]:
import json
from datasets import Dataset

# Load the dataset from the JSON file
with open("datasets.json", "r",encoding='utf-8') as f:
    data = json.load(f)
  # Assuming each dictionary in the list has the same keys
column_names = list(data[0].keys())
print(column_names)
data_dict = {col: [d[col] for d in data] for col in column_names}

data = Dataset.from_dict(data_dict)
len(data)

['instruction', 'output']


10000

In [None]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["instruction"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)

   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["output"],
                      max_length=512,
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [None]:
tokenized_dataset = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

In [None]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 8
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   predict_with_generate=True,
   push_to_hub=False
)



In [None]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_dataset,
   eval_dataset=tokenized_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.1196,0.050842,0.800537,0.632894,0.800477,0.800628
2,0.0764,0.033686,0.805407,0.638455,0.805373,0.805468
3,0.0601,0.02815,0.806378,0.639557,0.806317,0.806406




TrainOutput(global_step=3750, training_loss=0.09973246841430664, metrics={'train_runtime': 5088.2006, 'train_samples_per_second': 5.896, 'train_steps_per_second': 0.737, 'total_flos': 4620111353634816.0, 'train_loss': 0.09973246841430664, 'epoch': 3.0})

In [None]:
last_checkpoint = "/content/results/2500-2"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model.to('cuda')

In [None]:
my_question = "Extract the product name ans the price in the form of python dictionary from the given invoice.Invoice:invoice LOGO FROM US-001 East Repair Inc_ 11/02/2019 1912 Harvest Lane 2312/2019 New NY 12210 26/02/2019 BILL TO SHIP TO John Smith John Smith 2 Court Square 3787 Pineview Drive New York NY 12210 Cambridge, MA 12210 QTY DESCRIPTION UNIT PRICE AMOUNT Front rear brake cables 100.00 100.00 New set of pedal arms 15.00 30.00 Labor 3hrs 5.00 15.00 Subtotal 145.00 Sales Tax 6.25% 9.06 TOTAL S154.06 TERMS & CONDITIONS Payment is due within 15 days Please make checks payable to: East Repair Inc_ York, and "
inputs = "Please answer to this question: " + my_question
inputs = tokenizer(inputs, return_tensors="pt").to('cuda')
outputs = model.generate(**inputs,max_new_tokens=50)
answer = tokenizer.decode(outputs[0]).replace('<unk>','').replace('</s>','').replace('<pad>','').replace('<s>','')
print(answer)

Method: Answer: Australia:217.9728, Canada:22.53555, China:2173.63528, France:2174.09728, Germany:217.98482, Great Britain:217.97


In [None]:
 datasets=[]
 recommendations = [
        "We highly recommend you buying {product_name} in {country1_name} because,\n 1.Significant Cost Savings: The price of {country1_name} is {country1_price} ({country1_price_inr} INR) which is very less than other price.\n 2.Budget Management: Choosing lower-priced option i.e, on average 16% less helps in healthier budget. \n 3.Other countries with less cost :{others} "
]
products = ["Smartphone", "Laptop", "Headphones", "Smartwatch", "Camera", "Television", "Tablet", "Gaming console", "Fitness tracker", "Wireless earbuds", "Portable speaker", "Drone", "External hard drive", "E-reader", "Action camera", "Bluetooth keyboard", "VR headset", "Robot vacuum cleaner", "Coffee maker", "Electric toothbrush", "Bluetooth speaker", "Home security camera", "Air purifier", "Desk lamp", "Projector", "Electric kettle", "Mini fridge", "Power bank", "Printers", "Monitor"]
import random
import json
countries = ["Australia", "Canada", "China", "France", "Indonesia", "Japan","Singapore","Thailand", "United Kingdom", "United States"]

# Conversion rates to INR
conversion_rates = {
    'Australia': 53.04,  # Conversion rate to INR for AUD
    'Canada': 60.7,  # Conversion rate to INR for CAD
    'China': 11.30,  # Conversion rate to INR for CNY
    'France': 89.93,  # Conversion rate to INR for EUR
    'Indonesia': 0.0059,  # Conversion rate to INR for IDR
    'Japan': 0.54,
     'Singapore': 61.89,# Conversion rate to INR for JPY
    'Thailand': 2.27,  # Conversion rate to INR for THB
    'United Kingdom': 104.16,  # Conversion rate to INR for GBP
    'United States': 84.15  # Conversion rate to INR for USD
}

# Currency symbols for each country
currency_symbols = {
    'Australia': '$',
    'Canada': '$',
    'China': '¥',
    'France': '€',
    'Indonesia': 'Rp',
    'Japan': '¥',
    'Singapore': '$',
    'Thailand': '฿',
    'United Kingdom': '£',
    'United States': '$'
}

for i in range(1000):


  p=random.choices(products)[0]
  c=random.choices(countries)[0]
  orignal_price=(random.randint(1000, 100000))
  original_price_inr = conversion_rates[c] * orignal_price
  orignal_price=currency_symbols[c] +str(random.randint(1, 100000))
  other=tuple([random.choices(countries)[0] for x in range(2)])
  input_data=f"Product={p}"+f",Country Rates:{orignal_price,original_price_inr}, Country={c},others={other}"
  template=random.choices(recommendations)[0]
  output = template.format(product_name=p,country1_name=c,country1_price=f"{orignal_price}",country1_price_inr=original_price_inr,others=str(other[0])+", "+other[1])

  datasets.append({"instruction": "Recommend the country based on the given input. "+str( input_data), "output": output})
with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])

1000
{'instruction': "Recommend the country based on the given input. Product=Headphones,Country Rates:('$16300', 115149.84), Country=Australia,others=('Canada', 'Thailand')", 'output': 'We highly recommend you buying Headphones in Australia because,\n 1.Significant Cost Savings: The price of Australia is $16300 (115149.84 INR) which is very less than other price.\n 2.Budget Management: Choosing lower-priced option i.e, on average 16% less helps in healthier budget. \n 3.Other countries with less cost :Canada, Thailand '}


In [None]:
import json
import random

# List of countries
countries = ["Australia", "Canada", "China", "France", "Germany", "Great Britain", "Indonesia", "Japan", "Singapore", "Thailand", "United Kingdom", "United States"]

# List of products
products = [
    # Apple iPhones
    "iPhone 14 Pro Max", "iPhone 14 Pro", "iPhone 14 Plus", "iPhone 14",
    "iPhone 13 Pro Max", "iPhone 13 Pro", "iPhone 13 mini", "iPhone 13",
    "iPhone 12 Pro Max", "iPhone 12 Pro", "iPhone 12 mini", "iPhone 12",
    "iPhone SE (3rd generation)",

    # Samsung Galaxy
    "Samsung Galaxy S23 Ultra", "Samsung Galaxy S23+", "Samsung Galaxy S23",
    "Samsung Galaxy Z Fold 4", "Samsung Galaxy Z Flip 4", "Samsung Galaxy A54",
    "Samsung Galaxy A34", "Samsung Galaxy M54", "Samsung Galaxy M34",
    "Samsung Galaxy Note 20 Ultra", "Samsung Galaxy S22 Ultra",
    "Samsung Galaxy S22+", "Samsung Galaxy S22",

    # Google Pixel
    "Google Pixel 7 Pro", "Google Pixel 7", "Google Pixel 6 Pro", "Google Pixel 6",

    # OnePlus
    "OnePlus 11R 5G", "OnePlus 11 Pro 5G", "OnePlus 10T 5G", "OnePlus 10 Pro 5G",

    # Xiaomi Mi
    "Xiaomi 12S Ultra", "Xiaomi 12S Pro", "Xiaomi 12S", "Xiaomi 12T Pro",
    "Xiaomi Redmi Note 12 Pro Max", "Xiaomi Redmi Note 12 Pro", "Xiaomi Poco M5s",

    # Huawei Mate
    "Huawei Mate 50 Pro", "Huawei Mate Xs 2", "Huawei Nova 10 Pro",

    # Sony Xperia
    "Sony Xperia 1 IV", "Sony Xperia 5 IV", "Sony Xperia Pro-I",

    # LG G
    "LG G8X ThinQ",

    # Nokia Lumia
    "Nokia Lumia 635",

    # Motorola Moto
    "Motorola Edge 30 Ultra", "Motorola Moto G82",

    # Additional Popular Brands (a few examples)
    "ASUS ROG Phone 6", "Lenovo Legion Phone Duel 3", "Apple iPad mini (2021)",
    "Samsung Galaxy Tab S8 Ultra"
]
# Input text templates
input_templates = [
    "What is the price of {product} in {countries}?",
    "How much does {product} cost in {countries}?",
    "What is the price of {product} in {countries}?",
    "How much does {product} cost in {countries}?",
    'i brought {product} in {countries}'
]

# Generate dataset

for _ in range(1000):
    # Choose a random product
    product = random.choice(products)

    # Choose random number of countries (between 1 and 3)
    num_countries = random.randint(1, 3)
    selected_countries = random.sample(countries, num_countries)

    # Format selected countries
    countries_str = ", ".join(selected_countries)

    # Choose a random input text template
    input_text = random.choice(input_templates).format(product=product, countries=countries_str)

    # Construct output

    datasets.append({"instruction": "Find the product name in the sentence. Sentence:"+str(input_text), "output": product})

with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])

2000
{'instruction': 'Find the product name in the sentence. Sentence:What is the price of Samsung Galaxy S22+ in Australia, Singapore?', 'output': 'Samsung Galaxy S22+'}


In [None]:
import json
import random

# Define categories and corresponding products with more items and brand names
categories = {
    "Electronics": [
        "iPhone", "Samsung Galaxy", "MacBook", "Dell Laptop", "Sony TV", "LG TV",
        "HP PC", "iPad", "Microsoft Surface", "Apple Watch", "Nikon Camera",
        "Canon Camera", "Bose Headphones", "JBL Speaker", "DJI Drone", "GoPro",
        "Kindle", "Xbox", "PlayStation", "Fitbit"
    ],
    "Clothing": [
        "Nike T-shirt", "Adidas Shorts", "Levi's Jeans", "Gucci Dress", "North Face Jacket",
        "H&M Sweater", "Zara Skirt", "Ralph Lauren Blouse", "Armani Suit", "Puma Shoes",
        "Under Armour Hoodie", "Columbia Vest", "Calvin Klein Underwear", "Tommy Hilfiger Polo",
        "Patagonia Fleece", "Burberry Coat", "Versace Tracksuit", "Reebok Sweatpants",
        "Gap Socks", "Uniqlo Shirt"
    ],
    "Furniture": [
        "IKEA Table", "Herman Miller Chair", "Ashley Wardrobe", "Tempur-Pedic Bed",
        "West Elm Sofa", "Pottery Barn Desk", "Sauder Bookshelf", "Wayfair Cabinet",
        "Pier 1 Stool", "Crate & Barrel Dresser", "La-Z-Boy Recliner", "Restoration Hardware Bench",
        "CB2 Ottoman", "Anthropologie Mirror", "Havertys Coffee Table", "Raymour & Flanigan Bed Frame",
        "Flexsteel Loveseat", "Room & Board Console", "Article Media Unit", "Blu Dot Shelf"
    ],
    "Home Appliances": [
        "Whirlpool Refrigerator", "GE Air Conditioner", "KitchenAid Mixer", "Dyson Vacuum",
        "Bosch Dishwasher", "Samsung Washing Machine", "LG Dryer", "Panasonic Microwave",
        "Instant Pot", "Breville Toaster", "Cuisinart Food Processor", "Philips Air Fryer",
        "Ninja Blender", "Honeywell Air Purifier", "Frigidaire Freezer", "Tefal Iron",
        "Rowenta Steam Iron", "Hamilton Beach Coffee Maker", "Vitamix Blender", "Miele Oven"
    ]
}

# Generate dataset

for _ in range(1000):
    # Randomly choose a category
    category = random.choice(list(categories.keys()))

    # Randomly choose a product from the chosen category
    product = random.choice(categories[category])

    # Create input and output
    input_text = product
    output_text = category

    # Append to dataset
    datasets.append({"instruction": "Find category of the given product. Category:"+str(input_text), "output": output_text})

with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])

3000
{'instruction': 'Find category of the given product. Category:Versace Tracksuit', 'output': 'Clothing'}


In [None]:
import json
import random

countries = ["Australia", "Canada", "China", "France", "Germany", "Indonesia", "Japan", "Thailand", "United Kingdom", "United States"]

for i in range(1000):    # Generate random prices for 10 countries
    prices = [random.randint(1000, 100000) for _ in range(len(countries))]

    # Sort countries by their prices
    sorted_countries_prices = sorted(zip(prices, countries))

    # Separate sorted prices and countries
    sorted_prices = [price for price, country in sorted_countries_prices]
    sorted_countries = [country for price, country in sorted_countries_prices]

    # Construct the input data with sorted prices and countries
    country_price_pairs = [f"{country}=₹{price}" for price, country in sorted_countries_prices]
    input_data = "Find the cheapest country among " + ", ".join(country_price_pairs)

    # The first country in the sorted list is the one with the minimum price
    cheapest_country = sorted_countries[0]
    cheapest_price = sorted_prices[0]

    # Construct the output message
    output = cheapest_country
    datasets.append({ "instruction": input_data, "output": output})

with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])

4000
{'instruction': 'Find the cheapest country among Japan=₹1059, Australia=₹43126, Thailand=₹67897, United Kingdom=₹69205, France=₹69518, China=₹83894, Indonesia=₹89053, Canada=₹93964, Germany=₹95947, United States=₹96041', 'output': 'Japan'}


In [None]:
import pandas as pd
import random
# Define the countries and product names
countries = [
    "Australia",
    "Canada",
    "China",
    "France",
    "Germany",
    "Indonesia",
    "Japan",
    "Thailand",

    "United Kingdom",
    "United States"
]

product_names = [
    'iPhone 14 Pro Max',
    'Huawei Mate 50 Pro',
    'Huawei Mate Xs 2',
    'Huawei Nova 10 Pro',
    'Sony Xperia 1 IV',
    'Sony Xperia 5 IV',
    'Sony Xperia Pro-I',
    'LG G8X ThinQ',
    'Nokia Lumia 635',
    'Motorola Edge 30 Ultra',
    'Motorola Moto G82',
    'ASUS ROG Phone 6',
    'Lenovo Legion Phone Duel 3',
    'Apple iPad mini (2021)',
    'Samsung Galaxy Tab S8 Ultra'
]

# Function to generate random prices for all countries
def generate_prices(countries):
    return {country: random.randint(10000, 50000) for country in countries}

# Create a list to store the data

# Generate 10,000 rows of data
for _ in range(1000):
    prices = generate_prices(countries)
    product_name = random.choice(product_names)
    answer_country=random.choice(countries)

    # Formulate the question string
    prices_str = ', '.join([f'{country}= ₹{price}' for country, price in prices.items()])
    question = f"What is the price of {product_name} in {answer_country}?.The price of {prices_str}"

    # Randomly select one country to focus the answer on
    answer = f"The price of {product_name} in {answer_country} is ₹{prices[answer_country]}"

    datasets.append({ "instruction": question, "output": answer})

with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])

5000
{'instruction': 'What is the price of Apple iPad mini (2021) in Germany?.The price of Australia= ₹14476, Canada= ₹30525, China= ₹19396, France= ₹28826, Germany= ₹30982, Indonesia= ₹14548, Japan= ₹49234, Thailand= ₹14557, United Kingdom= ₹11391, United States= ₹47001', 'output': 'The price of Apple iPad mini (2021) in Germany is ₹30982'}


In [None]:
import random
for __ in range(1000):
  # Step 1: Generate 5 random prices and ratings
  prices = [int(random.uniform(100, 10000)) for _ in range(5)]
  ratings = [round(random.uniform(1, 5),1) for _ in range(5)]

  # Step 2: Calculate the score for each vendor
  mi = min(prices)
  ma = max(prices)
  range_price = ma - mi if ma != mi else 1e-5  # To avoid division by zero

  scores = []
  for i in range(5):
      if prices[i] == mi:
          score = round(abs(ratings[i] / 1e-5), 2)  # Avoid division by zero
      else:
          score = round(abs(ratings[i] / ((prices[i] - mi) / range_price)), 2)
      scores.append(score)
  # Step 3: Assign random vendor names
  vendor_names = ["eBay", "Flipkart", "Amazon", "Walmart", "BestBuy", "Newegg", "Costco", "Alibaba", "Snapdeal", "Rakuten", "Shopify", "Etsy", "MercadoLibre", "JD", "Zalando", "Overstock", "Wayfair", "Lazada", "ASOS"]
  vendors = [{'vendor': random.choice(vendor_names), 'score': scores[i], 'price': prices[i], 'rating': ratings[i]} for i in range(5)]

  # Step 4: Identify the vendor with the highest score
  best_vendor = max(vendors, key=lambda x: x['score'])

  # Step 5: Construct the recommendation
  recommendation = f"We suggest you to buy from vendor {best_vendor['vendor']} as they have: \n " \
                  f"1. Best rating of {best_vendor['rating']} \n " \
                  f"2. Affordable price of {best_vendor['price']} \n " \
                  f"3. Fastest delivery speed"

  # Step 6: Construct the vendor details in the specified format and sort in reverse order
  vendors_sorted = sorted(vendors, key=lambda x: x['score'], reverse=True)
  vendor_details = ", ".join([f"{vendor['vendor']}=({vendor['score']},{vendor['price']},{vendor['rating']})" for vendor in vendors_sorted])
  datasets.append({ "instruction": 'Recommend the vendor based on given data. '+vendor_details, "output": recommendation})

with open("d.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4, ensure_ascii=False)

print(len(datasets))
print(datasets[-1])

6000
{'instruction': 'Recommend the vendor based on given data. Newegg=(490000.0,996,4.9), Wayfair=(12.65,2490,4.7), Flipkart=(5.24,3450,3.2), Amazon=(4.42,5000,4.4), Zalando=(2.5,5017,2.5)', 'output': 'We suggest you to buy from vendor Newegg as they have: \n 1. Best rating of 4.9 \n 2. Affordable price of 996 \n 3. Fastest delivery speed'}


In [None]:
import json
import random

# List of countries and their fictional exchange rates to INR
countries = {
    'USA': {'currency': 'USD', 'rate': 73},
    'UK': {'currency': 'GBP', 'rate': 102},
    'Singapore': {'currency': 'SGD', 'rate': 54},
    'Japan': {'currency': 'JPY', 'rate': 0.67},
    'Thailand': {'currency': 'THB', 'rate': 2.3},
    'Indonesia': {'currency': 'IDR', 'rate': 0.0051},
    'Germany': {'currency': 'EUR', 'rate': 85},
    'France': {'currency': 'EUR', 'rate': 85},
    'Canada': {'currency': 'CAD', 'rate': 58},
    'Australia': {'currency': 'AUD', 'rate': 55}
}

# Function to generate a random price
def generate_random_price():
    return round(random.uniform(50, 1000), 2)

# Function to calculate the final price in local currency and INR
def calculate_final_price(base_price, delivery_cost, tax, exchange_rate):
    total_price_local = base_price + delivery_cost + tax
    total_price_inr = total_price_local * exchange_rate
    return total_price_local, total_price_inr
# Generate the dataset
for _ in range(1000):
    country = random.choice(list(countries.keys()))
    currency = countries[country]['currency']
    exchange_rate = countries[country]['rate']
    base_price = generate_random_price()
    delivery_cost = round(base_price * 0.1, 2)  # 10% of base price
    tax = round(base_price * 0.17, 2)  # 17% of base price
    total_price_local, total_price_inr = calculate_final_price(base_price, delivery_cost, tax, exchange_rate)

    entry = {
        "instruction": f"Provide me the delivery details based on the input. Country={country}, base price= {base_price} {currency}, Delivery cost= {delivery_cost} {currency}, tax= {tax} {currency},total_price_local={total_price_local} {currency},total_price_inr={total_price_inr}",
        "output": f"The expected on-hand product price from {country}, including delivery and tax, is {base_price} {currency} (base price) + {delivery_cost} {currency} (delivery cost) + {tax} {currency} (Custom Duty tax of 17%), resulting in a total of {total_price_local} {currency}, which equals {total_price_inr} INR."
    }
    datasets.append(entry)

with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4,ensure_ascii=False)
print(len(datasets))
print(datasets[-1])



7000
{'instruction': 'Provide me the delivery details based on the input. Country=Thailand, base price= 894.68 THB, Delivery cost= 89.47 THB, tax= 152.1 THB,total_price_local=1136.25 THB,total_price_inr=2613.375', 'output': 'The expected on-hand product price from Thailand, including delivery and tax, is 894.68 THB (base price) + 89.47 THB (delivery cost) + 152.1 THB (Custom Duty tax of 17%), resulting in a total of 1136.25 THB, which equals 2613.375 INR.'}


In [None]:
import json

# Define the countries and product names
countries = [
    "Australia",
    "Canada",
    "China",
    "France",
    "Germany",
    "Indonesia",
    "Japan",
    "Thailand",
    "United Kingdom",
    "United States"
]

product_names = [
    'iPhone 14 Pro Max',
    'Huawei Mate 50 Pro',
    'Huawei Mate Xs 2',
    'Huawei Nova 10 Pro',
    'Sony Xperia 1 IV',
    'Sony Xperia 5 IV',
    'Sony Xperia Pro-I',
    'LG G8X ThinQ',
    'Nokia Lumia 635',
    'Motorola Edge 30 Ultra',
    'Motorola Moto G82',
    'ASUS ROG Phone 6',
    'Lenovo Legion Phone Duel 3',
    'Apple iPad mini (2021)',
    'Samsung Galaxy Tab S8 Ultra'
]

# Function to generate random prices for all countries
def generate_prices(countries):
    return {country: random.randint(10000, 50000) for country in countries}

# Create a list to store the data

# Generate 1000 rows of data
for _ in range(1000):
    prices = generate_prices(countries)
    product_name = random.choice(product_names)

    # Randomly decide whether to compare two or three countries
    num_countries = random.choice([2, 3])
    selected_countries = random.sample(countries, num_countries)

    # Formulate the question string
    prices_str = ', '.join([f'{country}= ₹{price}' for country, price in prices.items()])
    country_list = ' or '.join(selected_countries)
    question = f"Which country has the minimum cost for {product_name}, {country_list}? The Indian value of {prices_str}."

    # Determine the country with the minimum price among the selected countries
    min_country = min(selected_countries, key=lambda country: prices[country])

    # Formulate the answer string
    answer = f"{min_country} has the minimum cost for {product_name}."

    # Add the data to the dataset
    datasets.append({"instruction": question, "output": answer})

# Save the dataset to a JSON file
with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4, ensure_ascii=False)

# Print the length and last entry of the dataset
print(len(datasets))
print(datasets[-1])


8000
{'instruction': 'Which country has the minimum cost for Huawei Nova 10 Pro, United States or Canada? The Indian value of Australia= ₹20166, Canada= ₹18166, China= ₹37160, France= ₹19156, Germany= ₹44117, Indonesia= ₹19154, Japan= ₹30181, Thailand= ₹12509, United Kingdom= ₹27721, United States= ₹29568.', 'output': 'Canada has the minimum cost for Huawei Nova 10 Pro.'}


In [None]:
import json
import random

# Define the countries and product names
countries = [
    "Australia",
    "Canada",
    "China",
    "France",
    "Germany",
    "Indonesia",
    "Japan",
    "Thailand",
    "United Kingdom",
    "United States",
    "India"  # Ensure India is included in the countries list
]

product_names = [
    'Huawei Mate 50 Pro',
    'Huawei Nova 10 Pro',
    'Motorola Moto G82'
]

# Function to generate random prices for all countries
def generate_prices(countries):
    return {country: random.randint(10000, 50000) for country in countries}

# Create a list to store the data

# Generate 1000 rows of data
for _ in range(1000):
    prices = generate_prices(countries)
    product_name = random.choice(product_names)

    # Randomly select a country for comparison
    country_name = random.choice(countries)

    # Ensure 'India' is included in the countries list
    if 'India' not in prices:
        continue  # Skip this iteration if India is not in the prices dictionary

    india_product_price = prices['India']
    other_country_price = prices[country_name]

    # Calculate percentage difference
    savings_percentage = ((india_product_price - other_country_price) / india_product_price) * 100

    # Formulate the recommendation based on price comparison
    if other_country_price < india_product_price:
        recommendation = f"It is cheaper to purchase {product_name} in {country_name} at a cost of {other_country_price} compared to India at {india_product_price}. You save {savings_percentage:.2f}%."
    else:
        recommendation = f"It is cheaper to purchase {product_name} in India at a cost of {india_product_price} than in {country_name}."

    # Formulate the question string
    question = f"Suggest the best country in the bill to buy the products in the bill based on the data. Product Data: {product_name}, country={country_name}, India price=₹{india_product_price}, country price=₹{other_country_price}, percentage difference={savings_percentage:.2f}%."

    # Add the data to the dataset
    datasets.append({"instruction": question, "output": recommendation})

# Save the dataset to a JSON file
with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4, ensure_ascii=False)

# Print the length and last entry of the dataset
print(len(datasets))
print(datasets[-1])


9000
{'instruction': 'Suggest the best country in the bill to buy the products in the bill based on the data. Product Data: Huawei Nova 10 Pro, country=United Kingdom, India price=₹38896, country price=₹13673, percentage difference=64.85%.', 'output': 'It is cheaper to purchase Huawei Nova 10 Pro in United Kingdom at a cost of 13673 compared to India at 38896. You save 64.85%.'}


In [None]:
import random
import json
# Define product categories and price ranges
product_categories = ["Electronics", "Furniture", "Clothing", "Appliances", "Tools"]
price_ranges = {
    "Electronics": (10, 1000),
    "Furniture": (50, 1000),
    "Clothing": (10, 200),
    "Appliances": (100, 500),
    "Tools": (20, 200),
}
terms = [
  "Payment is due within 15 days.",
  "Please make checks payable to: East Repair Inc.",
  "Scope of Services: We will provide the following services (clearly outline services offered).",
  "Warranties and Disclaimers: We warranty our work for X days/months (specify warranty period, if any). We are not liable for damages beyond our control.",
  "Term and Termination: This agreement is valid for X period (specify duration) and can be terminated by either party for cause (specify reasons).",
  "Limitation of Liability: Our liability for any damages is limited to Y (specify maximum liability amount).",
  "Confidentiality: All confidential information will be kept confidential by both parties.",
  "Force Majeure: Acts of God or other unforeseen events beyond our control excuse us from fulfilling our obligations under this agreement.",
  "Dispute Resolution: Any disagreements will be resolved through mediation (or arbitration, if preferred).",
  "Governing Law: This agreement is subject to the laws of the State .",
]

# Function to generate a random product name
def generate_product_name(category):
  nouns = {
      "Electronics": ["laptop", "phone", "television", "speaker", "headphones"],
      "Furniture": ["chair", "table",  "sofa", "bed", "dresser"],
      "Clothing": ["shirt", "pants", "dress", "shoes", "jacket"],
      "Appliances": ["refrigerator", "washing machine", "dryer", "toaster", "blender"],
      "Tools": ["hammer", "saw", "drill", "screwdriver", "wrench"],
  }
  return random.choice(nouns[category]) + " " + random.choice(["heavy-duty", "compact", "portable", "wireless", "deluxe"])

# Function to generate a random invoice
def generate_invoice():
  # Generate random invoice details
  invoice_number = f"US-{random.randint(1000, 9999)}"
  invoice_date = f"{random.randint(1, 12)}/{random.randint(1, 28)}/2024"

  customer_name = f"Customer {random.randint(1000, 9999)}"
  customer_address = f"{random.randint(100, 999)} {random.choice(['Main St', 'Elm St', 'Park Ave','church st'])}" + f"{random.choice(['Apt', 'Suite'])} {random.randint(1, 100)}, {random.choice(['New York', 'Los Angeles', 'Chicago'])} {random.randint(10000, 99999)}"

  # Generate random number of items (1-5)
  num_items = random.randint(1, 5)
  items = {}
  total_price = 0

  for i in range(num_items):
    category = random.choice(product_categories)
    product_name = generate_product_name(category)
    price = round(random.uniform(price_ranges[category][0], price_ranges[category][1]),2)
    items[product_name] = price
    total_price += price

  # Calculate sales tax (assuming 6.25%)
  sales_tax = total_price * 0.0625

  # Generate output dictionary
  output = {
      key: value for key, value in items.items()
  }
  output["Total"] = total_price + sales_tax

  return output, f""" INVOICE #{invoice_number} INVOICE DATE {invoice_date} BILL TO {customer_name} {customer_address} QTY DESCRIPTION UNIT PRICE AMOUNT  {"".join([f"{qty} {name} {price:.2f} {qty * price:.2f}" for name, price, qty in zip(items.keys(), items.values(), [1] * num_items)])} Subtotal {total_price:.2f} Sales Tax {sales_tax:.2f} TOTAL {output['Total']:.2f} Terms and conditions {random.choices(terms)[0]}"""

# Get the number of invoices to generate from the user
invoices = []
for _ in range(1000):
  invoice_data, invoice_text = generate_invoice()
  invoices.append(invoice_text)
  datasets.append({"instruction":"Extract the product name ans the price in the form of python dictionary from the given invoice. Invoice:"+ invoice_text, "output": str(invoice_data)})
print(len(datasets)
)# Save the dataset to a JSON file
with open("datasets.json", "w", encoding="utf-8") as f:
    json.dump(datasets, f, indent=4, ensure_ascii=False)
datasets[-1]

10000


{'instruction': 'Extract the product name ans the price in the form of python dictionary from the given invoice. Invoice: INVOICE #US-9571 INVOICE DATE 12/11/2024 BILL TO Customer 2547 168 Elm StApt 26, Chicago 18705 QTY DESCRIPTION UNIT PRICE AMOUNT  1 hammer portable 173.37 173.371 dryer compact 288.58 288.581 speaker compact 980.35 980.35 Subtotal 1442.30 Sales Tax 90.14 TOTAL 1532.44 Terms and conditions Please make checks payable to: East Repair Inc.',
 'output': "{'hammer portable': 173.37, 'dryer compact': 288.58, 'speaker compact': 980.35, 'Total': 1532.44375}"}

In [None]:
datasets[-1]

{'instruction': 'Extract the product name ans the price in the form of python dictionary from the given invoice. Invoice: INVOICE #US-2060 INVOICE DATE 10/4/2024 BILL TO Customer 5495 129 church stSuite 11, Chicago 46810 QTY DESCRIPTION UNIT PRICE AMOUNT  1 dresser wireless 102.42 102.421 toaster heavy-duty 434.11 434.111 drill compact 30.46 30.46 Subtotal 566.99 Sales Tax 35.44 TOTAL 602.43 Terms and conditions Confidentiality: All confidential information will be kept confidential by both parties.',
 'output': "{'dresser wireless': 102.42, 'toaster heavy-duty': 434.11, 'drill compact': 30.46, 'Total': 602.426875}"}

In [None]:
model.save_pretrained("sap_model")
tokenizer.save_pretrained("sap_model")
model.push_to_hub("scientisthere/sap_currency_conversion_usecase", token = "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") # Online saving
tokenizer.push_to_hub("scientisthere/sap_currency_conversion_usecase", token = "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") # Online saving