In [4]:
%pip install transformers pandas

Collecting transformers
  Downloading transformers-4.45.2-py3-none-any.whl (9.9 MB)
Collecting huggingface-hub<1.0,>=0.23.2
  Downloading huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.1-cp39-none-win_amd64.whl (2.4 MB)
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.5-cp39-none-win_amd64.whl (286 kB)
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec, huggingface-hub, tokenizers, safetensors, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2021.10.1
    Uninstalling fsspec-2021.10.1:
      Successfully uninstalled fsspec-2021.10.1
Successfully installed fsspec-2024.9.0 huggingface-hub-0.25.2 safetensors-0.4.5 tokenizers-0.20.1 transformers-4.45.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import ipywidgets as widgets
from IPython.display import display
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import pandas as pd
import re

In [7]:
%pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.4.1-cp39-cp39-win_amd64.whl (199.3 MB)
Collecting torchvision
  Downloading torchvision-0.19.1-cp39-cp39-win_amd64.whl (1.3 MB)
Collecting torchaudio
  Downloading torchaudio-2.4.1-cp39-cp39-win_amd64.whl (2.4 MB)
Collecting typing-extensions>=4.8.0
  Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, torch, torchvision, torchaudio
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.10.0.2
    Uninstalling typing-extensions-3.10.0.2:
      Successfully uninstalled typing-extensions-3.10.0.2
Successfully installed torch-2.4.1 torchaudio-2.4.1 torchvision-0.19.1 typing-extensions-4.12.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [5]:
# Set the pad_token_id to eos_token_id to avoid warnings
tokenizer.pad_token_id = tokenizer.eos_token_id

In [15]:
# Data cleansing function
def clean_text(text):
    if pd.isna(text):
        return ""
    # Removing unwanted characters like multiple commas, quotes, and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s-]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  
    return text

In [16]:
# Step to remove duplicate product names in the same record
def remove_duplicate_product_names(product_name):
    # Split the product name into words
    words = product_name.split()

    # Create an empty list to store the non-repeated words
    seen_words = []
    for word in words:
        # Add word to seen_words if it's not already added
        if ' '.join(seen_words).find(word) == -1:
            seen_words.append(word)
        else:
            break  # Break the loop if repetition is detected

    # Join the unique words to recreate the cleaned product name
    return ' '.join(seen_words)

In [17]:
# Step 1: Load and preprocess the product names from CSV
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)  # Assuming the file is a CSV
    # Clean the dataset (remove duplicates, NaN values, etc.)
    df = df.dropna().drop_duplicates()

    # Apply cleaning to the 'Product Name' column
    df['Product Name'] = df['Product Name'].apply(clean_text)

    # Remove empty or invalid entries after cleaning
    df = df[df['Product Name'].str.len() > 0]
    
    # Apply the cleaning function to each product name in the dataframe
    df['Product Name'] = df['Product Name'].apply(remove_duplicate_product_names)
    product_names = df['Product Name'].tolist()  # Get the list of product names
    return product_names

# Load your dataset (replace 'product_names.csv' with your actual file path)
product_names = load_and_preprocess_data('dataset/product_names.csv')

In [18]:
product_names

['All-New Fire HD 8 Tablet',
 'Kindle Oasis E-reader with Leather Charging Cover',
 'Amazon Kindle Lighted Leather Cover',
 'Amazon Kindle Lighted Leather Cover',
 'Kindle Keyboard',
 'All-New Fire HD 8 Tablet',
 'Fire HD 8 Tablet with Alexa',
 'Amazon 5W USB Official OEM Charger and Power Adapter for Fire Tablets',
 'All-New Kindle E-reader',
 'Amazon Kindle Fire Hd 3rd Generation 8gb',
 'Fire Tablet 7 Display Wi-Fi 8 GB',
 'Kindle Oasis E-reader with Leather Charging Cover',
 'Amazon - Kindle Voyage',
 'Amazon - Kindle Voyage',
 'Fire HD 8 Tablet with Alexa',
 'Amazon Standing Protective Case for Fire HD 6 4th Generation - Black',
 'Certified Refurbished Amazon Fire TV Previous Generation - 1st',
 'Brand New Amazon Kindle Fire 16gb 7 Ips Display Tablet Wifi',
 'Amazon Kindle Touch Leather Case 4th Generation - 2011 Release Olive Green',
 'Fire Kids Edition Tablet 7 Display Wi-Fi 16 GB Green Kid-Proof Case',
 'Amazon Kindle Paperwhite - eBook reader',
 'Kindle Voyage E-reader 6 High-R

In [19]:
# Step 2: Implement the retrieval system
def retrieve_similar(input_text, product_list, max_results=5):
    input_text_lower = input_text.lower()  # Convert input to lowercase for comparison
    retrieved_products = [name for name in product_list if input_text_lower in name.lower()]  # Simple substring matching
    return retrieved_products[:max_results]  # Limit to top max_results

In [20]:
# Step 3: Generate text from retrieved products
def generate_completion(input_text, retrieved_products, max_new_tokens=30, num_return_sequences=1):
    # Join retrieved product names with a newline for better separation
    input_with_context = "\n".join(retrieved_products)

    # Tokenize input
#     input_ids = tokenizer.encode(input_with_context, return_tensors="pt")

    # Tokenize input with attention mask
    inputs = tokenizer(input_with_context, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']  # Set attention mask

    # Generate auto-completion suggestions
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Pass the attention mask to the model
        max_new_tokens=max_new_tokens,  # Set the number of new tokens to generate
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id  # Resolve the padding issue
    )

    # Decode and return generated text
    suggestions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    # Post-process suggestions: Split by newlines, remove duplicates, and clean up
    processed_suggestions = []
    for suggestion in suggestions:
        # Split the suggestion into product names
        product_names = suggestion.split("\n")

        # Deduplicate and clean up the product names
        unique_names = list(dict.fromkeys(product_names))  # Remove duplicates while preserving order

        # Join unique product names back together
        cleaned_suggestion = "\n".join(unique_names).strip()  # Ensure each name is on a new line and clean trailing spaces

        processed_suggestions.append(cleaned_suggestion)

    return processed_suggestions

In [21]:
# Step 4: Update the UI to display suggestions dynamically
def on_text_change(change):
    # Get the input text
    input_text = change['new'].strip()

    # If input text is not empty, retrieve suggestions and display
    if input_text:
        retrieved = retrieve_similar(input_text, product_names)  # Retrieve similar products from the dataset
        if retrieved:
            suggestions = generate_completion(input_text, retrieved)  # Get suggestions using model
            # Update output box
            suggestions_output.value = "\n\n".join(suggestions)
        else:
            suggestions_output.value = "No suggestions found."
    else:
        suggestions_output.value = ""  # Clear the output if input is empty

# Create a text input box
text_input = widgets.Text(
    value='',
    placeholder='Type a product name...',
    description='Search:',
    disabled=False
)

# Create a text area for displaying suggestions
suggestions_output = widgets.Textarea(
    value='',
    placeholder='Suggestions will appear here...',
    description='Suggestions:',
    layout=widgets.Layout(width='80%', height='150px')
)

# Attach the text input box to a callback function to dynamically update suggestions
text_input.observe(on_text_change, names='value')

# Display the input box and output area in the notebook
display(text_input, suggestions_output)


Text(value='', description='Search:', placeholder='Type a product name...')

Textarea(value='', description='Suggestions:', layout=Layout(height='150px', width='80%'), placeholder='Sugges…