In [1]:
import random
train_data = []

data_point_num = 10000

# User Intent 1: Requesting cab availability
cab_availability_data = []

availability_questions = [
    "Is there a cab that can take me to {}?",
    "Are there any cabs going to {} soon?",
    "Can I get a ride to {}?",
    "How soon can I get a cab to {}?",
    "Any available cabs for {} now?",
    "What's the waiting time for a cab to {}?"
]

destinations = [
    "Central Park",
    "Times Square",
    "Madison Square Garden",
    "Empire State Building",
    "Museum of Modern Art",
    "Statue of Liberty",
    "JFK Airport",
    "LaGuardia Airport",
    "Brooklyn Bridge",
    "Wall Street"
]

for _ in range(data_point_num):
    destination = random.choice(destinations)
    user_query = random.choice(availability_questions).format(destination)
    
    # Introduce variety in response time details
    minutes = random.randint(1, 15)
    rough_times = ["a few", "several", "a couple of"]
    approx_time = random.choice(rough_times)
    
    positive_responses = [
        f"Yes, we have a cab {minutes} minutes away for {destination}.",
        f"A cab is on its way and should be there in about {minutes} minutes for {destination}.",
        f"There's a cab just around the corner! It'll take approximately {minutes} minutes to reach you for your trip to {destination}.",
        f"You're in luck! A cab is {approx_time} minutes away from {destination}."
    ]
    
    negative_responses = [
        f"I'm sorry, there are currently no cabs available for {destination}.",
        f"It seems all our cabs are booked at the moment for trips to {destination}. Can you wait a bit?",
        f"We're experiencing high demand right now for {destination}. It might take longer than usual."
    ]
    
    # Decide if a positive or negative response should be given (80-20 split for this example)
    response_type_choice = random.choices(
        ["positive", "negative"], 
        weights=[0.8, 0.2], 
        k=1
    )[0]
    
    if response_type_choice == "positive":
        response = random.choice(positive_responses)
    else:
        response = random.choice(negative_responses)
    
    cab_availability_data.append([user_query, response])


# User Intent 2: Inquiring about fare estimates
fare_estimate_data = []

fare_questions = [
    "What would be the fare to {}?",
    "How much would it cost to go to {}?",
    "Can you give me a price estimate for a trip to {}?",
    "What's the charge for a ride to {}?",
    "What's the expected fare for {}?",
    "Give me a ballpark figure for a ride to {}."
]

for _ in range(data_point_num):
    destination = random.choice(destinations)
    user_query = random.choice(fare_questions).format(destination)
    
    fare_low = random.randint(10, 40)
    fare_high = fare_low + random.randint(5, 15)  # Upper estimate range
    
    standard_responses = [
        f"The estimated fare to {destination} is approximately ${fare_low:.2f}.",
        f"You'd be looking at around ${fare_low:.2f} to get to {destination}.",
        f"Expect the fare to {destination} to be close to ${fare_low:.2f}."
    ]
    
    range_responses = [
        f"The estimated fare to {destination} is between ${fare_low:.2f} and ${fare_high:.2f} depending on traffic.",
        f"A trip to {destination} could cost anywhere from ${fare_low:.2f} to ${fare_high:.2f}, based on the route taken.",
        f"Based on current demand, the fare to {destination} is roughly ${fare_low:.2f} to ${fare_high:.2f}."
    ]
    
    uncertain_responses = [
        f"The fare to {destination} can vary, but it's usually around ${fare_low:.2f}.",
        f"It's hard to pinpoint an exact fare now, but I'd estimate near ${fare_low:.2f} for {destination}.",
        f"With the current traffic, it might be close to ${fare_low:.2f} to reach {destination}."
    ]
    
    # Decide the type of response (even distribution for this example, can be adjusted)
    response_type_choice = random.choices(
        ["standard", "range", "uncertain"], 
        weights=[0.5, 0.3, 0.2], 
        k=1
    )[0]
    
    if response_type_choice == "standard":
        response = random.choice(standard_responses)
    elif response_type_choice == "range":
        response = random.choice(range_responses)
    else:
        response = random.choice(uncertain_responses)
    
    fare_estimate_data.append([user_query, response])


# User Intent 3: Frequently Asked Questions
faq_data = []

faq_dictionary = {
    "What hours do you operate?": [
        "We're available 24/7.", 
        "Our cabs operate around the clock, 24/7.", 
        "Anytime! We're always operational."
    ],
    "Tell me your operating times.": [
        "Our services run 24/7 for your convenience.",
        "You can book with us any time of the day, 365 days a year.",
        "We don't close! Available every hour, every day."
    ],
    "When can I book a cab?": [
        "Feel free to book anytime, we're open 24/7.",
        "We're at your service around the clock. Book whenever you need.",
        "Day or night, we're here for you!"
    ],
    "Can I pay using my credit card?": [
        "Yes, we accept all major credit cards and online payment methods.",
        "Absolutely! All major cards are supported.",
        "Definitely! Pay the way you prefer, including credit cards."
    ],
    "Do you guys accept cards?": [
        "Yes, we take all major credit and debit cards.",
        "Cards? Absolutely! We support most of them.",
        "Of course! Whether it's Visa, Mastercard, or others, we've got you covered."
    ],
    "What's the process to cancel my booking?": [
        "It's simple. Use our app or call our customer service. Remember, cancel at least an hour ahead.",
        "Go to our app, find your booking, and hit 'Cancel'. Or, give us a ring!",
        "Through the app is the quickest. If not, our helpline is always available."
    ],
    "How do I go about canceling a ride?": [
        "Open our app, find your trip, and tap on 'Cancel'. Alternatively, you can always call us.",
        "Cancellations are easy! Just ensure you do it in advance to avoid charges.",
        "Either through the app or by calling our support. We recommend doing it an hour before your trip."
    ]
}

faq_questions = list(faq_dictionary.keys())

for _ in range(data_point_num):
    user_query = random.choice(faq_questions)
    response = random.choice(faq_dictionary[user_query])
    faq_data.append([user_query, response])


# User Intent 4: Personalized cab recommendations
personalized_recommendations_data = []

recommendations_dictionary = {
    "What cab suits my previous bookings?": [
        "Considering your past preferences, we'd suggest our executive sedan.",
        "From what you've chosen before, our standard sedan might be your go-to.",
        "How about our premium SUV? It aligns with your previous bookings."
    ],
    "Can you suggest a comfy ride?": [
        "Our plush sedan is known for its comfort. Give it a try!",
        "An executive SUV from us guarantees a smooth ride.",
        "For a relaxing journey, nothing beats our premium sedans."
    ],
    "I'm looking for eco-friendly options. Any recommendations?": [
        "Certainly! Our electric vehicle fleet is top-notch and eco-friendly.",
        "Our hybrid cabs are a perfect blend of efficiency and eco-friendliness.",
        "Go green with our latest electric sedan!"
    ],
    "I want a luxurious experience. What do you suggest?": [
        "Nothing speaks luxury like our exclusive luxury lineup.",
        "Why not try our luxury SUV? It's designed for superior experiences.",
        "Our premium class offers unparalleled luxury and comfort."
    ],
    "I need a cheap option. What do you have?": [
        "Our basic hatchback is not only reliable but also budget-friendly.",
        "For an economical trip, our standard sedan does wonders.",
        "Our compact cars are designed to be light on your pocket."
    ]
}

recommendation_questions = list(recommendations_dictionary.keys())

for _ in range(data_point_num):
    user_query = random.choice(recommendation_questions)
    response = random.choice(recommendations_dictionary[user_query])
    personalized_recommendations_data.append([user_query, response])

# User Intent 5: Specifying cab preferences
cab_preferences_data = []

preferences_dictionary = {
    "Do you have cabs with a child seat?": [
        "Certainly! We'll send a cab with a child seat for you.",
        "Of course, we can arrange cabs equipped with child seats.",
        "No worries, let's arrange a cab with a child seat for you."
    ],
    "I need a ride that's wheelchair accessible.": [
        "Absolutely, we prioritize accessibility. We'll arrange a wheelchair-accessible cab.",
        "Yes, we do have cabs that are wheelchair accessible. We'll send one right away.",
        "For sure, let us arrange a wheelchair-accessible cab for you."
    ],
    "Got any cabs with extra legroom?": [
        "Sure thing! Our SUVs have that extra space you're looking for.",
        "Absolutely, our executive sedans offer ample legroom.",
        "You'll love our spacious vehicles designed especially for that extra legroom."
    ],
    "Can I get a cab with a luggage rack?": [
        "Yes, let's get you a cab equipped with a luggage rack.",
        "Of course, we'll arrange a cab with a luggage rack for you.",
        "Absolutely, we have cabs ready with luggage racks."
    ],
    "Do you have cabs with tinted windows?": [
        "Certainly, for those who prefer added privacy, we have cabs with tinted windows.",
        "Yes, let us arrange a cab with tinted windows for you.",
        "For sure, we have a fleet with tinted windows. One's on its way."
    ]
}

non_availability_responses = [
    "I'm sorry, we don't have that feature available at the moment.",
    "Apologies, we're currently out of cabs with that specific feature.",
    "Regrettably, we can't accommodate that request right now."
]

preference_questions = list(preferences_dictionary.keys())

probability_feature_available = 0.8

for _ in range(data_point_num):
    user_query = random.choice(preference_questions)
    
    if random.random() < probability_feature_available:
        response = random.choice(preferences_dictionary[user_query])
    else:
        response = random.choice(non_availability_responses)
    
    cab_preferences_data.append([user_query, response])
"""
# Morning Noonj Evening
times_of_day = ["morning", "noon", "evening"]

morning_faq_responses = [
    "Our morning shift starts at 6 am.",
    "Cabs are readily available in the mornings!",
    "Morning fares are generally consistent and don't see much fluctuation.",
    "Our drivers in the morning shift are refreshed and ready to ensure a smooth ride.",
    "You can expect minimal traffic during early morning rides."
]

noon_faq_responses = [
    "During noon, there might be peak charges due to high demand.",
    "Noon rides, especially around lunchtime, can experience a bit of traffic.",
    "Our cabs are equipped with air conditioning to ensure a comfortable ride during the hot noon hours.",
    "If you're booking at noon, please allow a slightly longer waiting time due to demand.",
    "Many drivers take their breaks during noon, but we always have a fleet ready for service."
]

evening_faq_responses = [
    "Night charges start from 8 pm onwards.",
    "Evening hours, especially rush hour, might see some delays due to traffic.",
    "For late-night rides, we recommend pre-booking to ensure availability.",
    "Our drivers during the evening shift are experienced with nighttime driving.",
    "We prioritize safety; our evening and night cabs are all equipped with GPS tracking."
]


faq_data = []
for _ in range(data_point_num):
    index = random.randrange(len(faq_questions_templates))
    user_query = faq_questions_templates[index]
    time_slot = random.choice(times_of_day)
    
    if time_slot == "morning":
        response = random.choice(morning_faq_responses)
    elif time_slot == "noon":
        response = random.choice(noon_faq_responses)
    else:  # evening
        response = random.choice(evening_faq_responses)
    
    faq_data.append([user_query, response])
"""

# Frequently Asked Questions with Time-specific Responses
faq_data = []
faq_questions_templates = [
    "What hours do you operate?",
    "Tell me your operating times.",
    "When can I book a cab?",
    "Can I pay using my credit card?",
    "Do you guys accept cards?",
    "What's the process to cancel my booking?",
    "How do I go about canceling a ride?"
]

times_of_day = ["morning", "noon", "evening"]

morning_faq_responses = [
    "Our morning shift starts at 6 am.",
    "Cabs are readily available in the mornings!",
    "Morning fares are generally consistent and don't see much fluctuation.",
    "Our drivers in the morning shift are refreshed and ready to ensure a smooth ride.",
    "You can expect minimal traffic during early morning rides."
]

noon_faq_responses = [
    "During noon, there might be peak charges due to high demand.",
    "Noon rides, especially around lunchtime, can experience a bit of traffic.",
    "Our cabs are equipped with air conditioning to ensure a comfortable ride during the hot noon hours.",
    "If you're booking at noon, please allow a slightly longer waiting time due to demand.",
    "Many drivers take their breaks during noon, but we always have a fleet ready for service."
]

evening_faq_responses = [
    "Night charges start from 8 pm onwards.",
    "Evening hours, especially rush hour, might see some delays due to traffic.",
    "For late-night rides, we recommend pre-booking to ensure availability.",
    "Our drivers during the evening shift are experienced with nighttime driving.",
    "We prioritize safety; our evening and night cabs are all equipped with GPS tracking."
]

for _ in range(data_point_num):
    index = random.randrange(len(faq_questions_templates))
    user_query = faq_questions_templates[index]
    time_slot = random.choice(times_of_day)
    
    if time_slot == "morning":
        response = random.choice(morning_faq_responses)
    elif time_slot == "noon":
        response = random.choice(noon_faq_responses)
    else:  # evening
        response = random.choice(evening_faq_responses)
    
    faq_data.append([user_query, response])



# Combining datasets
datasets = [cab_availability_data, fare_estimate_data, faq_data, personalized_recommendations_data, cab_preferences_data]

for dataset in datasets:
    train_data += dataset

# Shuffle the train_data to mix the different user intents
random.shuffle(train_data)
from datasets import Dataset, DatasetDict

# Sample train_data for the example
train_data

# Split data into separate questions and answers lists
questions, answers = zip(*train_data)

# Convert to Dataset
full_dataset = Dataset.from_dict({
    "question": list(questions),
    "answer": list(answers)
})

# Splitting the dataset into train and validation sets in 4 to 1 ratio
train_size = 0.8  # 80% for training
split_datasets = full_dataset.train_test_split(test_size=1-train_size)

# Creating DatasetDict
dataset = DatasetDict({
    "train": split_datasets["train"],
    "validation": split_datasets["test"]
})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['question', 'answer'],
        num_rows: 10000
    })
})


In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    # Tokenize the questions and answers
    output = tokenizer(examples['question'], examples['answer'], truncation=True, padding='max_length', max_length=128)
    # For GPT-2, the labels are the same as the input_ids
    output["labels"] = output["input_ids"].copy()
    return output

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [3]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')


In [4]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',  # <-- This is the line you need to add.
    per_device_train_batch_size=16,  # adjust based on your GPU memory
    per_device_eval_batch_size=16,   # adjust based on your GPU memory
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=500,
    save_steps=500,
    eval_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    learning_rate=1e-5,
    weight_decay=0.01,
    push_to_hub=False,
    logging_first_step=True,
    load_best_model_at_end=True,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)


In [5]:
import torch
print(torch.cuda.is_available())


False


In [8]:
# Start training
trainer.train()


Step,Training Loss,Validation Loss
500,0.0599,0.051062
1000,0.0596,0.051062


TrainOutput(global_step=1250, training_loss=0.05986613878011703, metrics={'train_runtime': 43674.2333, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.029, 'total_flos': 2612920320000000.0, 'train_loss': 0.05986613878011703, 'epoch': 1.0})

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer


In [3]:
model_path = './my_finetuned_gpt2'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)


In [14]:
def generate_text(prompt, max_length=100, temperature=1.0, top_k=50):
    # Encode the input prompt to tensor
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate text
    output = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k
    )
    
    # Decode only the newly generated tokens (excluding the input prompt)
    generated_tokens = output[0][input_ids.shape[1]:] 
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return generated_text


In [16]:
prompt = "Do you have a cab to Turkey?"
result = generate_text(prompt)
print(prompt)
print(result)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Do you have a cab to Turkey?
Yes, let us arrange a cab for you.
