# Data Extraction

Load JSON Data

In [1]:
# !pip install pandas

In [2]:
import json

# Load the JSON file.
# The file is opened in binary mode so that json.load() detects the encoding
# itself (UTF-8/16/32, with or without a BOM) rather than decoding with the
# platform's locale default, which is not UTF-8 on every system.
with open('../data/trades 1.json', 'rb') as json_file:
    trade_data = json.load(json_file)


# Report the top-level shape of the payload so later cells know how to index it
if isinstance(trade_data, dict):
    print("The JSON data is a dictionary. Here are its keys:")
    print(list(trade_data.keys()))
    # Optionally, print a part of the dictionary if you know the key names
    # Example: print(json.dumps(trade_data['a_specific_key'], indent=2))
elif isinstance(trade_data, list):
    # If trade_data is a list, safely print the first few elements
    print(json.dumps(trade_data[:2], indent=2))
else:
    print("The loaded JSON data is neither a list nor a dictionary.")

The JSON data is a dictionary. Here are its keys:
['products']


In [3]:
# Pull out the records stored under the 'products' key
product_data = trade_data['products']
# Pretty-print the first two records to reveal their structure
print(json.dumps(product_data[0:2], indent=2))

[
  {
    "productCode": "WSD1084623",
    "couponRate": 5,
    "couponFrequency": "annual",
    "couponType": "conditional",
    "upsideStyle": "par",
    "downsideStyle": "vanillaPut",
    "putStrike": 80
  },
  {
    "productCode": "WSD19837233",
    "couponRate": 6,
    "autocall": "no",
    "issuerCall": "no",
    "couponFrequency": "monthly",
    "couponType": "conditional",
    "upsideStyle": "par",
    "downsideStyle": "vanillaPut",
    "putStrike": 75
  }
]


Load Excel Data

In [4]:
import pandas as pd

# Read the category workbook into a DataFrame
category_data = pd.read_excel(io='../data/categories 1.xlsx')

# Preview the first five rows to see which columns are available
print(category_data.head(5))


           ISIN     type
0    WSD1084623  phoenix
1   WSD19837233  phoenix
2   WSD86345633  phoenix
3   WSD56734563   revcon
4  WSD348570345  phoenix


# Convert JSON Product Data into a DataFrame

In [5]:
# Tabulate the product records; keys absent from a record show up as NaN
products_df = pd.DataFrame(data=product_data)
print(products_df.head(5))

    productCode  couponRate couponFrequency   couponType upsideStyle  \
0    WSD1084623         5.0          annual  conditional         par   
1   WSD19837233         6.0         monthly  conditional         par   
2   WSD86345633         4.5       quarterly  conditional         par   
3   WSD56734563         5.4       quarterly   guaranteed         par   
4  WSD348570345         6.0          annual  conditional         par   

  downsideStyle  putStrike autocall issuerCall  digitalCoupon  kiBarrier  
0    vanillaPut         80      NaN        NaN            NaN        NaN  
1    vanillaPut         75       no         no            NaN        NaN  
2    vanillaPut        100       no         no            NaN        NaN  
3    vanillaPut         90      NaN        yes            NaN        NaN  
4    vanillaPut         80      yes        NaN            NaN        NaN  


# Merge DataFrames

In [6]:
# Attach each product's category by matching productCode against ISIN.
# NOTE(review): the recorded output of this cell pairs the same productCode with
# both 'phoenix' and 'revcon', which suggests the key repeats on both sides and
# the join multiplies rows. Confirm which category is authoritative before using
# these rows as training labels; merge's `validate=` argument can enforce the
# expected cardinality.
final_dataset = products_df.merge(
    category_data,
    how='inner',
    left_on='productCode',
    right_on='ISIN',
)

print(final_dataset.head(5))

   productCode  couponRate couponFrequency   couponType upsideStyle  \
0   WSD1084623         5.0          annual  conditional         par   
1   WSD1084623         5.0          annual  conditional         par   
2   WSD1084623         7.9          annual   guaranteed         par   
3   WSD1084623         7.9          annual   guaranteed         par   
4  WSD19837233         6.0         monthly  conditional         par   

  downsideStyle  putStrike autocall issuerCall  digitalCoupon  kiBarrier  \
0    vanillaPut         80      NaN        NaN            NaN        NaN   
1    vanillaPut         80      NaN        NaN            NaN        NaN   
2    vanillaPut         70      yes        NaN            NaN        NaN   
3    vanillaPut         70      yes        NaN            NaN        NaN   
4    vanillaPut         75       no         no            NaN        NaN   

          ISIN     type  
0   WSD1084623  phoenix  
1   WSD1084623   revcon  
2   WSD1084623  phoenix  
3   WSD10846

# Dialogue Creation Function

In [7]:
def create_dialogue(row):
    """Build one simulated user/bot exchange from a product row.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            keys 'couponRate', 'couponFrequency', 'couponType', 'upsideStyle'
            and 'type'.

    Returns:
        dict: {"user": <question describing the product>,
               "bot": <answer naming the product category>}.
    """
    rate = row['couponRate']
    frequency = row['couponFrequency']
    coupon_type = row['couponType']
    upside = row['upsideStyle']

    # The user describes the product by its features...
    question = (
        f"I'm interested in a product with these features: rate {rate}, "
        f"frequency {frequency}, type {coupon_type}, and style {upside}. "
        "What category does it belong to?"
    )
    # ...and the bot answers with the category taken from the row.
    answer = f"This product falls under the '{row['type']}' category."

    return dict(user=question, bot=answer)


# Apply the Function to Each Row in the Dataset

In [8]:
# Build one dialogue per row of the merged dataset
dialogues = list(
    map(create_dialogue, (row for _, row in final_dataset.iterrows()))
)

# Spot-check the first five exchanges
for dialogue in dialogues[:5]:
    print(f"User: {dialogue['user']}")
    print(f"Bot: {dialogue['bot']}")
    print("---")


User: I'm interested in a product with these features: rate 5.0, frequency annual, type conditional, and style par. What category does it belong to?
Bot: This product falls under the 'phoenix' category.
---
User: I'm interested in a product with these features: rate 5.0, frequency annual, type conditional, and style par. What category does it belong to?
Bot: This product falls under the 'revcon' category.
---
User: I'm interested in a product with these features: rate 7.9, frequency annual, type guaranteed, and style par. What category does it belong to?
Bot: This product falls under the 'phoenix' category.
---
User: I'm interested in a product with these features: rate 7.9, frequency annual, type guaranteed, and style par. What category does it belong to?
Bot: This product falls under the 'revcon' category.
---
User: I'm interested in a product with these features: rate 6.0, frequency monthly, type conditional, and style par. What category does it belong to?
Bot: This product falls un

In [9]:
# Save the Dialogues to a File (Optional)
import json

# Persist the generated dialogues as pretty-printed JSON
with open('../data/training_data.json', mode='w') as file:
    file.write(json.dumps(dialogues, indent=4))

print("Dialogues have been saved to training_data.json.")


Dialogues have been saved to training_data.json.


# Bonus Features

# Variability in User Requests

In [10]:
import random

# Alternative phrasings a user might use for each product feature
feature_variations = {
    "couponRate": [
        "interest rate",
        "coupon rate",
        "rate",
    ],
    "couponFrequency": [
        "payment frequency",
        "coupon frequency",
        "frequency",
    ],
    # Add more features and their variations here
}

def choose_variation(feature):
    """Return a randomly chosen phrasing for ``feature``.

    Features without an entry in ``feature_variations`` are returned
    unchanged (the feature name itself is the only candidate).
    """
    if feature in feature_variations:
        candidates = feature_variations[feature]
    else:
        candidates = [feature]
    return random.choice(candidates)


# Dynamic Product Naming

In [11]:
def dynamic_product_name(row):
    """Compose a descriptive product name from a row's call features.

    Each call feature is checked independently, so a product flagged both
    'autocall' and 'issuerCall' is named with both qualifiers. (The earlier
    ``elif`` silently dropped "Issuer Callable" whenever 'autocall' was 'yes'.)

    Args:
        row: Mapping-like object supporting ``.get`` (a dict or a DataFrame
            row). The keys 'autocall' and 'issuerCall' are optional; any value
            other than the string 'yes', including missing or NaN, is treated
            as "feature absent".

    Returns:
        str: For example "Product", "Autocallable Product",
        "Issuer Callable Product" or "Autocallable Issuer Callable Product".
    """
    parts = []
    if row.get('autocall') == 'yes':
        parts.append("Autocallable")
    if row.get('issuerCall') == 'yes':
        parts.append("Issuer Callable")
    parts.append("Product")
    return " ".join(parts)


# Follow-Up Questions

In [12]:
# Features that must be present before a product can be categorised
essential_features = [
    'couponRate',
    'couponFrequency',
    'couponType',
    'upsideStyle',
]


In [13]:
def create_dialogue_with_followups(row):
    """Build a dialogue that asks a follow-up when essential data is missing.

    Every name in ``essential_features`` is looked up in ``row``; a value counts
    as missing when it is null (per ``pd.isnull``) or an empty string.

    Args:
        row: Mapping-like object (for example a DataFrame row) containing every
            essential feature and, when none is missing, the key 'type'.

    Returns:
        dict: {"user": ..., "bot": ...}. If at least one essential feature is
        missing, the bot asks about the first missing one; otherwise the bot
        names the product category.
    """
    def _is_blank(value):
        return pd.isnull(value) or value == ''

    gaps = [name for name in essential_features if _is_blank(row[name])]

    # Nothing missing: answer with the category straight away
    if not gaps:
        rate = row['couponRate']
        frequency = row['couponFrequency']
        return {
            "user": f"I'm interested in a product with a rate of {rate} and frequency of {frequency}.",
            "bot": f"This product falls under the '{row['type']}' category."
        }

    # Otherwise ask about the first missing feature only
    return {
        "user": "I'm looking for a product but don't have all the details.",
        "bot": f"Could you provide more details on the {gaps[0]}?"
    }


In [22]:
# 'final_dataset' is expected to be the merged DataFrame ready for dialogue generation
enhanced_dialogues = list(
    map(
        create_dialogue_with_followups,
        (row for _, row in final_dataset.iterrows()),
    )
)
enhanced_dialogues

[{'user': "I'm interested in a product with a rate of 5.0 and frequency of annual.",
  'bot': "This product falls under the 'phoenix' category."},
 {'user': "I'm interested in a product with a rate of 5.0 and frequency of annual.",
  'bot': "This product falls under the 'revcon' category."},
 {'user': "I'm interested in a product with a rate of 7.9 and frequency of annual.",
  'bot': "This product falls under the 'phoenix' category."},
 {'user': "I'm interested in a product with a rate of 7.9 and frequency of annual.",
  'bot': "This product falls under the 'revcon' category."},
 {'user': "I'm interested in a product with a rate of 6.0 and frequency of monthly.",
  'bot': "This product falls under the 'phoenix' category."},
 {'user': "I'm interested in a product with a rate of 4.5 and frequency of quarterly.",
  'bot': "This product falls under the 'phoenix' category."},
 {'user': "I'm interested in a product with a rate of 5.4 and frequency of quarterly.",
  'bot': "This product falls

In [23]:
# Export Final Dialogues:
import json

# Write the enhanced dialogues out as pretty-printed JSON
with open('../data/enhanced_training_data.json', mode='w') as file:
    file.write(json.dumps(enhanced_dialogues, indent=4))

print("Enhanced dialogues have been saved to enhanced_training_data.json.")


Enhanced dialogues have been saved to enhanced_training_data.json.


# Model Train and Test

In [24]:
from sklearn.model_selection import train_test_split

# Assuming 'enhanced_dialogues' is loaded from 'enhanced_training_data.json'.
# Two-stage split with a fixed seed: first hold out about 10% as the test set,
# then hold out about 10% of the remainder as the validation set.
train_val, test = train_test_split(
    enhanced_dialogues,
    test_size=0.1,
    random_state=42,
)
train, val = train_test_split(
    train_val,
    test_size=0.1,
    random_state=42,
)


In [25]:
# This is a placeholder example for training a model with PyTorch or TensorFlow
# NOTE(review): this cell is pseudocode and is not runnable as written.
# `YourModel`, `num_epochs`, `train_loader` and `labels` are not defined in the
# visible notebook, so executing it stops at the first line with a NameError.
# `optimizer` and `loss_function` are bound to the Ellipsis placeholder and must
# be replaced with real objects before the loop body can work.
model = YourModel()  # This would be your LLaMA or similar model
optimizer = ...  # Choose an optimizer
loss_function = ...  # Define a loss function

for epoch in range(num_epochs):
    # NOTE(review): the loop variable `dialogues` rebinds the notebook-level
    # `dialogues` list built by the dialogue-generation cell; a distinct name
    # (e.g. `batch`) would avoid clobbering it.
    for dialogues in train_loader:  # You'd create a DataLoader or equivalent for your dataset
        predictions = model(dialogues)
        loss = loss_function(predictions, labels)
        # Clear gradients left over from the previous step before backpropagating
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Evaluate on validation set


NameError: name 'YourModel' is not defined

In [20]:
# # Download Notebook as PDF
# !jupyter nbconvert --to pdf WSD-Assessment.ipynb