# Task 1 - Dataset Preprocessing

- *Visualize basic details of the Dataset*
- *Detect unnecessary Entities in the Dataset*
- *Remove those Entities from the Dataset*

In [1]:
# Ensuring GPU is detected
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA GeForce RTX 4070 Ti


In [2]:
# Import all needed libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import warnings

# Ignore all future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Loading the dataset
df = pd.read_csv('data/Bitext_Sample_Customer_Support_Training_Dataset.csv')

In [4]:
# Displaying basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26872 entries, 0 to 26871
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   flags        26872 non-null  object
 1   instruction  26872 non-null  object
 2   category     26872 non-null  object
 3   intent       26872 non-null  object
 4   response     26872 non-null  object
dtypes: object(5)
memory usage: 1.0+ MB
None


In [5]:
# Seeing how the first few rows look
print(df.head())

   flags                                        instruction category  \
0      B   question about cancelling order {{Order Number}}    ORDER   
1    BQZ  i have a question about cancelling oorder {{Or...    ORDER   
2   BLQZ    i need help cancelling puchase {{Order Number}}    ORDER   
3     BL         I need to cancel purchase {{Order Number}}    ORDER   
4  BCELN  I cannot afford this order, cancel purchase {{...    ORDER   

         intent                                           response  
0  cancel_order  I've understood you have a question regarding ...  
1  cancel_order  I've been informed that you have a question ab...  
2  cancel_order  I can sense that you're seeking assistance wit...  
3  cancel_order  I understood that you need assistance with can...  
4  cancel_order  I'm sensitive to the fact that you're facing f...  


In [6]:
# Finding all the entities from the dataset and grouping them in dictionaries

nested_entities_replacements = {
    r"{{Account Type}}": "Premium",
    r"{{Account Category}}": "Gold",
}

entity_replacements = {
    r"{{Order Number}}": "ORD12345",
    r"{{Invoice Number}}": "INV56789",
    r"{{Online Order Interaction}}": "order history",
    r"{{Online Customer Support Channel}}": "support channel",
    r"{{Profile}}": "user profile",
    r"{{Profile Type}}": "standard",
    r"{{Settings}}": "account settings",
    r"{{Online Company Portal Info}}": "company portal",
    r"{{Date}}": "2023-01-01",
    r"{{Date Range}}": "2023-01-01 to 2023-01-31",
    r"{{Shipping Cut-off Time}}": "5:00 PM",
    r"{{Delivery City}}": "New York",
    r"{{Delivery Country}}": "USA",
    r"{{Salutation}}": "Dear Customer",
    r"{{Client First Name}}": "John",
    r"{{Client Last Name}}": "Doe",
    r"{{Customer Support Phone Number}}": "123-456-7890",
    r"{{Customer Support Email}}": "support@example.com",
    r"{{Live Chat Support}}": "Live Chat",
    r"{{Website URL}}": "www.example.com",
    r"{{Upgrade Account}}": "Upgrade to Premium",
    r"{{Account Change}}": "Account Update",
    r"{{Refund Amount}}": "$50",
    r"{{Money Amount}}": "$100",
    r"{{Store Location}}": "123 Main Street New York",
    r"{{Customer Support Hours}}": "9 AM - 5 PM",
    r"{{Forgot PIN code}}": "1234",
    r"{{Client Name}}": "John Doe",
    r"{{Customer Support Team Name}}": "Customer Support Team",
    r"{{Basic}}": "Basic",
    r"{{Billing Category}}": "Billing Category A",
    r"{{Customer Support Start Time}}": "9 AM",
    r"{{Change Key}}": "Key123",
    r"{{Order/Transaction/Reference Number}}": "TRX67890",
    r"{{Manage Subscription}}": "Subscription Management",
    r"{{Password Recovery Page}}": "Password Recovery Page URL",
    r"{{Account Number}}": "ACC123456",
    r"{{Key Recovery Page URL}}": "www.example.com/key-recovery",
    r"{{additional details}}": "additional details",
    r"{{Reset}}": "Reset",
    r"{{Purchase Status}}": "Completed",
    r"{{Store Pickup Time}}": "3 PM",
    r"{{Company Name}}": "Example Company",
    r"{{Month}}": "January",
    r"{{Purchase Details}}": "Purchase Details",
    r"{{Submit}}": "Submit",
    r"{{Restitution ID}}": "REST9876",
    r"{{Lost Key}}": "LostKey456",
    r"{{Social Media Handles}}": "@example_social",
    r"{{Fourth Step}}": "Step 4",
    r"{{Forgot Your Access Key}}": "AccessKey123",
    r"{{Recover Key}}": "RecoverKey789",
    r"{{Confirm}}": "Confirm",
    r"{{Change User}}": "User123",
    r"{{Password Recovery Page URL}}": "www.example.com/password-recovery",
    r"{{PIN Recovery Page URL}}": "www.example.com/pin-recovery",
    r"{{PIN Retrieval}}": "PIN1234",
    r"{{Explanation of Feature 3}}": "Explanation of Feature 3",
    r"{{Year}}": "2023",
    r"{{Forgot Account Access Key}}": "AccessKey123",
    r"{{Explanation of Feature 1}}": "Explanation of Feature 1",
    r"{{Claims Department}}": "Claims Department",
    r"{{Basic Account}}": "Basic Account",
    r"{{Rebate Tracking Number}}": "REB123456",
    r"{{Rebate Number}}": "REB123456",
    r"{{Compensation Type}}": "Compensation Type A",
    r"{{Update Profile}}": "Profile Update",
    r"{{Business Hours}}": "9 AM - 5 PM",
    r"{{My Purchases}}": "Purchase History",
    r"{{Order/Invoice Number}}": "ORD56789",
    r"{{Delivery Time}}": "3 Days",
    r"{{Online Store}}": "Online Store",
    r"{{Second Step}}": "Step 2",
    r"{{Profile Settings}}": "Profile Settings",
    r"{{Sign Up}}": "Sign Up",
    r"{{Upgrade or Downgrade Account}}": "Upgrade/Downgrade Account",
    r"{{Toll-Free Number}}": "1-800-123-456",
    r"{{Forgot Profile Key}}": "ProfileKey456",
    r"{{Confirm Cancellation}}": "Cancellation Confirmed",
    r"{{Navigation steps}}": "Navigation Steps",
    r"{{Downgrade}}": "Downgrade",
    r"{{PIN Code Restoration}}": "PIN Restore",
    r"{{Salutation}}": "Dear Customer",
    r"{{Account Name}}": "Account Name ABC",
    r"{{Dashboard}}": "Dashboard",
    r"{{User Account Recovery}}": "User Account Recovery",
    r"{{Restore Access Key}}": "RestoreKey456",
    r"{{Change Access Key}}": "AccessKey789",
    r"{{Access Key}}": "Key123",
    r"{{Username}}": "Username123",
    r"{{User Profile}}": "User Profile",
    r"{{Review Platform 1}}": "Platform 1 Review",
    r"{{Support Phone Number}}": "1-800-123-456",
    r"{{Customer Support Contact Number}}": "1-800-123-456",
    r"{{PIN Code Reset}}": "PIN Reset",
    r"{{Completion steps}}": "Completion Steps",
    r"{{Create Account}}": "Create Account",
    r"{{Membership Type}}": "Gold",
    r"{{Reimbursement Request Number}}": "REQ123456",
    r"{{Account Recovery Page URL}}": "www.example.com/account-recovery",
    r"{{Live Chat Feature}}": "Live Chat",
    r"{{Customer Assistance Email}}": "support@example.com",
    r"{{Date of the Invoice}}": "2023-01-10",
    r"{{Delivery Country}}": "USA",
    r"{{Min Delivery Time}}": "2 Days",
    r"{{Feedback Email Address}}": "feedback@example.com",
    r"{{Profile PIN Recovery}}": "PIN recovery page",
    r"{{Forgot Key}}": "Forgotten key procedure",
    r"{{Company Support Channels}}": "support channels",
    r"{{Shipping Address}}": "123 Main Street, New York",
    r"{{Reset PIN}}": "reset your PIN here",
    r"{{Standard Shipping Time}}": "5-7 business days",
    r"{{Payment Issue Email}}": "paymentissues@example.com",
    r"{{Same-Day Order Time}}": "order before 3 PM for same-day delivery",
    r"{{Security & Privacy}}": "security and privacy page",
    r"{{Expedited Shipping Days}}": "2-3 business days",
    r"{{year}}": "2024",
    r"{{Recover PIN}}": "recover your PIN here",
    r"{{Account Details}}": "account details section",
    r"{{Login page URL}}": "www.example.com/login",
    r"{{Customer Assistance Phone Number}}": "123-456-7890",
    r"{{Company Account}}": "company12345",
    r"{{Review Platform}}": "platform for reviews",
    r"{{Switch User}}": "Switch to a different user",
    r"{{Max Delivery Time}}": "maximum 7 days",
    r"{{Closing Time}}": "closing at 6 PM",
    r"{{Review Platform 2}}": "alternative review platform",
    r"{{Account Closure Timeframe}}": "closure in 30 days",
    r"{{PIN}}": "1234",
    r"{{Expedited Delivery Time}}": "2-3 business days",
    r"{{Reimbursement ID}}": "RMB1234567",
    r"{{User Profile Settings}}": "user profile settings page",
    r"{{User Key Retrieval}}": "retrieve key via email",
    r"{{Feature 1}}": "first feature details",
    r"{{Reset Pin Code}}": "reset your PIN code",
    r"{{Unauthorized Charges Refund Time}}": "refund in 7-10 days",
    r"{{Refund Policy}}": "refund policy page",
    r"{{Product Reviews Email}}": "reviews@example.com",
    r"{{Reset Access Key}}": "reset access key procedure",
    r"{{User Profile Page URL}}": "www.example.com/profile",
    r"{{Social Media Platform}}": "Facebook, Twitter",
    r"{{PIN Code Retrieval}}": "retrieve PIN code here",
    r"{{Regular Profile}}": "regular user profile",
    r"{{Express Delivery Time}}": "1-2 business days",
    r"{{Order Tracking Method}}": "order tracking via email",
    r"{{Customer Support Days}}": "Mon-Fri",
    r"{{Customer Support Live Chat URL}}": "www.example.com/livechat",
    r"{{Stolen User Key}}": "stolen key reporting procedure",
    r"{{Contact Page URL}}": "www.example.com/contact",
    r"{{Customer Service Toll-Free Number}}": "1-800-555-1234",
    r"{{Forgot Password}}": "password recovery page",
    r"{{Order Status}}": "order status page",
    r"{{Cancellation Policy}}": "cancellation policy details",
    r"{{PIN Code Recovery}}": "recover PIN code here",
    r"{{Security}}": "security settings page",
    r"{{Change Password}}": "password change page",
    r"{{Company Support Page URL}}": "www.example.com/support",
    r"{{Access Key Recovery}}": "access key recovery page",
    r"{{Feature 2}}": "second feature details",
    r"{{Password Reset}}": "password reset page",
    r"{{Access Key Retrieval}}": "retrieve access key here",
    r"{{Business Name Anonymized}}": "Business Inc.",
    r"{{Claims Website URL}}": "www.example.com/claims",
    r"{{Customer Assistance Hours}}": "9 AM - 6 PM",
    r"{{the reason for your call}}": "order issues, account help",
    r"{{Profile Recovery}}": "profile recovery page",
    r"{{E-commerce Platform Names}}": "Amazon, eBay",
    r"{{Full Name}}": "John Doe",
    r"{{Contact Channel}}": "phone, email, live chat",
    r"{{PIN Reset Page URL}}": "www.example.com/pinreset",
    r"{{Carrier Name}}": "UPS, FedEx",
    r"{{provide step-by-step instructions}}": "step-by-step guide here",
    r"{{Restore User Access Key}}": "restore access key process",
    r"{{Registration Support Phone Number}}": "1-800-555-9876",
    r"{{Reset Key}}": "reset key page",
    r"{{Forgot PIN Code}}": "forgot PIN recovery page",
    r"{{PIN Retrieval Page URL}}": "www.example.com/pinretrieval",
    r"{{Product Category}}": "electronics, clothing, groceries",
    r"{{Express Shipping Days}}": "1-2 days",
    r"{{First Step}}": "first step in the process",
    r"{{Login Page}}": "www.example.com/login",
    r"{{Privacy}}": "privacy policy details",
    r"{{Platform URL}}": "www.example.com/platform",
    r"{{Account Key Recovery}}": "recover account key here",
    r"{{Retrieve Account PIN}}": "retrieve PIN page",
    r"{{Order/Transaction ID}}": "TRX12345678",
    r"{{Customer Support Closing Time}}": "closing time at 6 PM",
    r"{{Tracking Number}}": "TRK123456789",
    r"{{Store Address}}": "456 Elm Street, Los Angeles",
    r"{{Account Type Switch}}": "switch between Basic and Premium",
    r"{{PIN Settings}}": "configure PIN settings",
    r"{{Rebate ID}}": "REB12345678",
    r"{{Customer Support Hotline}}": "1-800-555-5678",
    r"{{the specific topic you need assistance with}}": "account help, order issues",
    r"{{Fifth Step}}": "fifth step in the process",
    r"{{Restore PIN Code}}": "restore PIN here",
    r"{{month}}": "January",
    r"{{Customer Support Website}}": "www.example.com/support",
    r"{{User Key Retrieval Page URL}}": "www.example.com/keyretrieval",
    r"{{Choose Account Type}}": "choose Basic or Premium account",
    r"{{Forgot Pin Code}}": "forgot PIN recovery",
    r"{{Complaint Hotline Number}}": "1-800-555-1212",
    r"{{Forgot User Account Key}}": "forgot account key recovery",
    r"{{Consumer Complaint Email Address}}": "complaints@example.com",
    r"{{order number}}": "ORD12345678",
    r"{{Reset Account Key}}": "reset account key page",
    r"{{Retrieve Profile Key}}": "retrieve profile key here",
    r"{{Help Center}}": "help center page",
    r"{{Forgot PIN}}": "forgot PIN page",
    r"{{Return Policy}}": "return policy details",
    r"{{Cut Off Time}}": "cutoff at 5 PM",
    r"{{Explanation of Feature 2}}": "details about second feature",
    r"{{Compensation Identifier}}": "COMP123456",
    r"{{Company Phone Number}}": "1-800-555-1212",
    r"{{Shipping Status}}": "track shipping status here",
    r"{{Delivery Date}}": "expected delivery date: 2023-10-03",
    r"{{Claims Contact Number}}": "1-800-555-1313",
    r"{{Update Account}}": "update account page",
    r"{{Customer Service Email}}": "customerservice@example.com",
    r"{{Purchase History}}": "purchase history page",
    r"{{Forgot Access Key}}": "forgot access key page",
    r"{{Shipment Tracking Number}}": "TRK987654321",
    r"{{CompanyName}}": "Acme Corp",
    r"{{Change PIN}}": "change PIN here",
    r"{{Login URL}}": "www.example.com/login",
    r"{{Order/Claim/Compensation}}": "ORD12345, CLAIM98765, COMP123",
    r"{{Order/Refund/Transaction}}": "ORD98765, REF12345, TRX98765",
    r"{{Customer Service Email Address}}": "support@example.com",
    r"{{contact_method}}": "phone, email, live chat",
    r"{{Tracking Page}}": "tracking page URL",
    r"{{Pin Code Settings}}": "configure PIN code",
    r"{{Help Center URL}}": "www.example.com/help",
    r"{{Express Shipping Time}}": "1-2 business days",
    r"{{Customer Support Team}}": "support team contact details",
    r"{{Order/Claim Number}}": "ORD12345 or CLAIM6789",
    r"{{PIN Recovery}}": "recover your PIN here",
    r"{{Recover Access Key}}": "recover access key here",
    r"{{Track Order}}": "track your order here",
    r"{{Purpose of the Features}}": "feature purposes description",
    r"{{Refund Hotline Number}}": "1-800-555-9090",
    r"{{Save}}": "Save your details",
    r"{{Cancellation Refund Time}}": "refund in 7-10 days",
    r"{{Profile Recovery Page}}": "profile recovery page",
    r"{{Retrieve User Account Key}}": "retrieve account key here",
    r"{{Forgot Account Key}}": "forgot account key procedure",
    r"{{Forgot Pin}}": "recover your PIN here",
    r"{{ETA}}": "estimated time of arrival: 2023-10-03",
    r"{{Complaint Email Address}}": "complaints@example.com",
    r"{{Support Page URL}}": "www.example.com/support",
    r"{{Customer ID}}": "CUST123456",
    r"{{Customer Service Hours}}": "9 AM - 6 PM",
    r"{{Profile Recovery Page URL}}": "www.example.com/profilerecovery",
    r"{{Login steps}}": "login steps details",
    r"{{number of days}}": "3 days",
    r"{{Account}}": "Basic or Premium account",
    r"{{Refund Processing Time}}": "refund in 7-10 days",
    r"{{Third Step}}": "third step in the process",
    r"{{Online Marketplace}}": "Amazon, eBay",
    r"{{Password and Security}}": "password and security page",
    r"{{Create a Secondary Address}}": "create new address page",
    r"{{Customer Support Opening Time}}": "opens at 9 AM",
    r"{{Customer Support Ticket Number}}": "TCK12345678",
    r"{{Zip Code}}": "90210",
    r"{{mention the different contact options available}}": "phone, email, live chat",
    r"{{PIN code}}": "PIN code for account",
    r"{{Refund Helpline Number}}": "1-800-555-1919",
    r"{{Forgot Profile Access Key}}": "forgot profile key page",
    r"{{Security and Privacy}}": "security and privacy page",
    r"{{Switch Plan}}": "switch to a new plan",
    r"{{Remove}}": "remove this item",
    r"{{E-commerce Platform 1}}": "Amazon",
    r"{{Reset Password}}": "reset your password here",
    r"{{Billing History}}": "billing history page",
    r"{{City 3}}": "Miami",
    r"{{customer_support_phone_number}}": "1-800-555-3030",
    r"{{Support Hours}}": "9 AM - 5 PM",
    r"{{Expedited Shipping Time}}": "1-2 business days",
    r"{{Password}}": "password123",
    r"{{Forgot User Access Key}}": "forgot user key page",
    r"{{Tracking Information}}": "tracking details here",
    r"{{Password Reset Page}}": "password reset page",
    r"{{Estimated Delivery Time}}": "estimated delivery: 2023-10-03",
    r"{{Email Address}}": "user@example.com",
    r"{{Retail Stores}}": "physical stores information",
    r"{{Case Number}}": "CASE123456",
    r"{{Opening Time}}": "opens at 9 AM",
    r"{{Feature 3}}": "third feature description",
    r"{{Restore Password}}": "restore password page",
    r"{{Retrieve User Key}}": "retrieve user key here",
    r"{{E-commerce Platform 2}}": "eBay",
    r"{{Reset User Key}}": "reset user key here",
    r"{{Duplicate Charges Refund Time}}": "refund in 7-10 days",
    r"{{Contact Method}}": "phone, email, live chat",
    r"{{Country List}}": "USA, Canada, UK",
    r"{{Retrieve PIN Code}}": "retrieve PIN code page",
    r"{{Add a New Address}}": "add new address here",
    r"{{User Management}}": "user management settings",
    r"{{Working Hours}}": "9 AM - 5 PM",
    r"{{your issue}}": "details of your issue",
    r"{{Number of Days}}": "7 days",
    r"{{Account Page}}": "account page URL",
    r"{{Date of the Bill}}": "2023-09-01",
    r"{{Shipping Addresses}}": "list of addresses",
    r"{{Account Security}}": "account security settings",
    r"{{Account Recovery}}": "account recovery process",
    r"{{Order/Transaction/Reimbursement}}": "TRX12345, REB67890",
    r"{{Company Representative Name}}": "Jane Doe",
    r"{{Country}}": "USA",
    r"{{Free Customer Support Number}}": "1-800-555-2020",
    r"{{Order Tracker}}": "order tracker page",
    r"{{PIN Management}}": "manage your PIN",
    r"{{Retrieve Key}}": "retrieve key here",
    r"{{Retrieve PIN}}": "retrieve your PIN here",
    r"{{Free Account Name}}": "free account details",
    r"{{Access Key Reset Page URL}}": "www.example.com/accesskeyreset",
    r"{{Retrieve Account Key}}": "retrieve account key here",
    r"{{Live Chat}}": "Live Chat feature",
    r"{{Account Upgrade}}": "upgrade your account",
    r"{{Order Tracking}}": "track your order here",
    r"{{Platform Login URL}}": "www.example.com/platformlogin",
    r"{{tracking number}}": "TRK12345",
    r"{{Timeframe}}": "within 5-7 business days",
    r"{{Billing}}": "billing details page",
    r"{{Customer Assistance Email Address}}": "assistance@example.com",
    r"{{PIN Reset}}": "PIN reset page",
    r"{{Track Reimbursement}}": "track reimbursement status",
    r"{{PIN Code}}": "1234",
    r"{{Support Channel 2}}": "alternate support channel",
    r"{{User Settings}}": "user settings page",
    r"{{PIN Code Management}}": "manage your PIN code",
    r"{{Order/Refund/Case Number}}": "ORD12345, REF6789",
    r"{{Password Reset Page URL}}": "www.example.com/passwordreset",
    r"{{Compensation ID}}": "COMP98765",
    r"{{Change Account}}": "change account settings",
    r"{{Key Management}}": "manage your keys",
    r"{{Invoice Name}}": "INV123456",
    r"{{Reference Number}}": "REF12345",
    r"{{Account Closure Process}}": "account closure instructions",
    r"{{Customer Claims Email Address}}": "claims@example.com",
    r"{{Standard Delivery Time}}": "5-7 business days",
    r"{{Contact Us}}": "contact us page",
    r"{{Shipping Method}}": "standard, expedited",
    r"{{Customer Support End Time}}": "support ends at 6 PM",
    r"{{Non-receipt of Goods Refund Time}}": "refund in 7-10 days",
    r"{{Person Name}}": "John Doe",
    r"{{Cancel Purchase}}": "cancel your purchase here",
    r"{{My Account}}": "my account page",
    r"{{Login Page URL}}": "www.example.com/login",
    r"{{City 2}}": "Los Angeles",
    r"{{Customer Support Toll-Free Number}}": "1-800-555-7890",
    r"{{Product Feedback Email}}": "feedback@example.com",
    r"{{Account Access Key Reset}}": "reset your account access key",
    r"{{Account Plan}}": "Premium or Basic plan",
    r"{{Account Management}}": "account management settings",
    r"{{Security Settings}}": "security settings page",
    r"{{Rebate Identifier}}": "REB123456",
    r"{{Forgot User Key}}": "recover your user key",
    r"{{Event Cancellation Refund Time}}": "refund in 7-10 days",
    r"{{Client Full Name}}": "John Doe",
    r"{{Company}}": "Acme Corp",
    r"{{Manage PIN}}": "manage your PIN",
    r"{{Forgot User Profile Key}}": "Forgot user profile access key",
    r"{{Support Channel 1}}": "Primary support channel",
    r"{{Password Management}}": "Password management settings",
    r"{{X-day/money back guarantee period}}": "30-day money-back guarantee",
    r"{{Standard Shipping Days}}": "3-5 business days",
    r"{{City 1}}": "Los Angeles",
    r"{{Password Recovery}}": "Password recovery options",
    r"{{Account ID}}": "ACC12345",
    r"{{Product/Service Name}}": "Product/Service Name",
    r"{{Profile Security}}": "Profile security settings",
    r"{{Destination}}": "Shipping destination",
    r"{{User Account Settings}}": "User account settings",
    r"{{proceed to our website and click on the 'Contact Us' button / dial our customer support number / reach out to our live chat support}}": "Contact customer support via website, phone, or live chat",
    r"{{Restore User Key}}": "Restore user access key",
    r"{{Feedback Email}}": "feedback@example.com",
    r"{{Change Profile}}": "Change user profile settings",
    r"{{Switching option}}": "Switching account options",
    r"{{Account Recovery Page}}": "Account recovery page URL",
    r"{{Support Channel}}": "Customer support channel",
    r"{{Upgrade Account Type}}": "Upgrade account to premium",
    r"{{Payment Issue Phone Number}}": "987-654-3210",
    r"{{Edit PIN}}": "Edit PIN code",
    r"{{Product/Service Defect Refund Time}}": "7-day refund window for defective products",
    r"{{Currency Symbol}}": "$",
    r"{{Choose the Premium profile}}": "Choose the Premium profile",
    r"{{Switch to Gold}}": "Switch to Gold",
    r"{{Upgrade to Premium}}": "Upgrade to Premium",
    r"{{Switch to Premium Account}}": "Switch to Premium Account",
    r"{{Premium Membership}}": "Premium Membership",
    r"{{Gold Upgrade}}": "Gold Upgrade",
}

entity_pattern = re.compile(r"\{\{.*?\}\}")

# Function to find entities in the text
def find_entities(text):
    return entity_pattern.findall(text)

instructions_entities = df['instruction'].apply(find_entities)
responses_entities = df['response'].apply(find_entities)

unique_entities = set([entity.strip() for sublist in instructions_entities for entity in sublist] +
                      [entity.strip() for sublist in responses_entities for entity in sublist])

# Removing the 'r"' and '"' around the dictionary keys for comparison
dictionary_keys = set([key.strip('r"') for key in entity_replacements.keys()])

# Comparing the two sets
missing_in_dict = unique_entities - dictionary_keys
extra_in_dict = dictionary_keys - unique_entities

# Output results
print(f"Entities missing in the dictionary: {missing_in_dict}")
print(f"Extra entities in the dictionary: {extra_in_dict}")

# Checking if both sets match
if not missing_in_dict and not extra_in_dict:
    print("All entities match!")
else:
    print("There are unmatched entities. Please review the results above.")

Entities missing in the dictionary: {'{{Switch to {{Account Category}}', '{{{{Account Type}}', '{{Switch to {{Account Type}}', '{{Upgrade to {{Account Type}}', '{{Account Category}}', '{{Choose the {{Account Type}}', '{{Account Type}}', '{{{{Account Category}}'}
Extra entities in the dictionary: {'{{Premium Membership}}', '{{Switch to Gold}}', '{{Switch to Premium Account}}', '{{Gold Upgrade}}', '{{Choose the Premium profile}}', '{{Upgrade to Premium}}'}
There are unmatched entities. Please review the results above.


In [7]:
# During First pass: Replace inner entities
def first_pass_replace(text):
    for entity, replacement in nested_entities_replacements.items():
        text = re.sub(entity, replacement, text)
    return text

# During Second pass: Replace outer entities
def second_pass_replace(text):
    for entity, replacement in entity_replacements.items():
        text = re.sub(entity, replacement, text)
    return text

# Applying the two-pass replacement process
df['instruction'] = df['instruction'].apply(lambda x: second_pass_replace(first_pass_replace(x)))
df['response'] = df['response'].apply(lambda x: second_pass_replace(first_pass_replace(x)))

# Task 2 - Multilingual Dataset Creation 

- *Create split_text_with_formatting Function to Split Data into Chunks for Translation*
- *Create Translate function to translate English data to French and Spanish*
- *Use MarianMT models **opus-mt-es-fr** and **opus-mt-en-es** to Create the translated data*
- *Combine the translated data into one Multilingual Dataframe*
- *Format the data using lang_codes and special-tokens for M-Bart Training*
- *Save the Multilingual Dataset*

In [8]:
# Function to split long text into smaller chunks while preserving line breaks
def split_text_with_formatting(text, max_chunk_size=500):
    """Split text into smaller chunks while preserving formatting and line breaks."""
    paragraphs = text.split('\n')
    chunks = []
    current_chunk = ""

    for paragraph in paragraphs:
        sentences = re.split(r'(?<=[.!?]) +', paragraph)
        for sentence in sentences:
            if len(current_chunk) + len(sentence) <= max_chunk_size:
                current_chunk += sentence + " "
            else:
                chunks.append(current_chunk.strip())
                current_chunk = sentence + " "

        if current_chunk.strip():
            chunks.append(current_chunk.strip())
            current_chunk = ""

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

In [9]:
# Output path for the multilingual dataset
output_path = 'data/Multilingual_Customer_Support_Training_Dataset.csv'

# Check if the multilingual dataset already exists
if not os.path.exists(output_path):
    print("Translating dataset into french and spanish\n")

    # Load the MarianMT models and ensure they use the GPU
    models = {
        'en-fr': MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-fr").to('cuda'),
        'en-es': MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-es").to('cuda'),
    }

    # Load the tokenizers
    tokenizers = {
        'en-fr': MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr"),
        'en-es': MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es"),
    }

    def translate(text, src_lang, tgt_lang, max_length=512):
        """Function to translate text from src_lang to tgt_lang with dynamic padding and truncation."""
        model = models[f'{src_lang}-{tgt_lang}']
        tokenizer = tokenizers[f'{src_lang}-{tgt_lang}']

        chunks = split_text_with_formatting(text)
        translated_chunks = []

        for chunk in chunks:
            # Tokenize with dynamic padding and truncation
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to('cuda')
    
            # Generate translations
            translated = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)
    
            # Decode the translated text and clean up tokenization spaces
            translated_text = tokenizer.decode(translated[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
            translated_chunks.append(translated_text)

        return '\n'.join(translated_chunks)

    # Add progress bars to the translation process using tqdm
    # Translate instructions and responses to French and Spanish
    tqdm.pandas(desc="Translating Instructions to French")
    df['instruction_fr'] = df['instruction'].progress_apply(lambda x: translate(x, 'en', 'fr'))

    tqdm.pandas(desc="Translating Instructions to Spanish")
    df['instruction_es'] = df['instruction'].progress_apply(lambda x: translate(x, 'en', 'es'))

    tqdm.pandas(desc="Translating Responses to French")
    df['response_fr'] = df['response'].progress_apply(lambda x: translate(x, 'en', 'fr'))

    tqdm.pandas(desc="Translating Responses to Spanish")
    df['response_es'] = df['response'].progress_apply(lambda x: translate(x, 'en', 'es'))

else:
    print(f"Multilingual dataset already exists at {output_path}. Skipping Translation.")

Translating dataset into french and spanish



Translating Instructions to French: 100%|██████████████████████████████████████| 26872/26872 [17:27<00:00, 25.65it/s]
Translating Instructions to Spanish: 100%|█████████████████████████████████████| 26872/26872 [14:50<00:00, 30.16it/s]
Translating Responses to French: 100%|███████████████████████████████████████| 26872/26872 [2:34:07<00:00,  2.91it/s]
Translating Responses to Spanish: 100%|██████████████████████████████████████| 26872/26872 [2:27:13<00:00,  3.04it/s]


In [10]:
# Check if the multilingual dataset already exists
if not os.path.exists(output_path):
    print("Combining Translated data into a single file")
    
    # Create the multilingual dataframe as an empty DataFrame
    multilingual_df = pd.DataFrame()

    # Handle English rows
    temp_df_en = df[['flags', 'category', 'intent', 'instruction', 'response']].copy()
    temp_df_en['language'] = 'en'

    # Handle French rows
    temp_df_fr = df[['flags', 'category', 'intent']].copy()
    temp_df_fr['instruction'] = df['instruction_fr']
    temp_df_fr['response'] = df['response_fr']
    temp_df_fr['language'] = 'fr'

    # Handle Spanish rows
    temp_df_es = df[['flags', 'category', 'intent']].copy()
    temp_df_es['instruction'] = df['instruction_es']
    temp_df_es['response'] = df['response_es']
    temp_df_es['language'] = 'es'

    # Concatenate all three language DataFrames into a single multilingual DataFrame
    multilingual_df = pd.concat([temp_df_en, temp_df_fr, temp_df_es], ignore_index=True)
    
    # Converting the dataset to the format required by mbart for finetuning
    # Define language codes
    lang_codes = {
        'en': 'en_XX', 
        'fr': 'fr_XX',
        'es': 'es_XX'
    }

    # Add language tokens to existing instruction and response columns
    multilingual_df['instruction'] = multilingual_df.apply(lambda row: f"{lang_codes[row['language']]} {row['instruction']} </s>", axis=1)
    multilingual_df['response'] = multilingual_df.apply(lambda row: f"{lang_codes[row['language']]} {row['response']} </s>", axis=1)

    # Save the multilingual dataset for future use
    multilingual_df.to_csv(output_path, index=False)
    print(f"Multilingual dataset saved to {output_path}")

else:
    print(f"Multilingual dataset already exists at {output_path}. Skipping dataset creation.")

Combining Translated data into a single file
Multilingual dataset saved to data/Multilingual_Customer_Support_Training_Dataset.csv
