In [1]:
# Install necessary libraries
!pip install pandas scikit-learn transformers gradio torch --quiet

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import gradio as gr

# Step 1: Load Dataset from CSV
# Assuming the CSV file is in the same directory, you can replace 'your_file.csv' with the actual path to your CSV file
data = pd.read_csv('/content/user_personalized_features.csv')

# Combine features into a single text input for LLM
data["input_text"] = (
    "Age: " + data["Age"].astype(str) + ", " +
    "Gender: " + data["Gender"] + ", " +
    "Location: " + data["Location"] + ", " +
    "Income: " + data["Income"].astype(str) + ", " +
    "Interests: " + data["Interests"] + ", " +
    "Purchase Frequency: " + data["Purchase_Frequency"].astype(str) + ", " +
    "Total Spending: " + data["Total_Spending"].astype(str)
)

# Target variable for recommendations
data["output_text"] = data["Product_Category_Preference"]

# Step 2: Train-Test Split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Fine-tune LLM for Recommendations
# Use Hugging Face's transformers library
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize data
train_encodings = tokenizer(list(train_data["input_text"]), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_data["input_text"]), truncation=True, padding=True, max_length=128)

# Prepare dataset
class RecommendationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Convert text labels to numerical indices
label_mapping = {label: idx for idx, label in enumerate(data["output_text"].unique())}
train_labels = train_data["output_text"].map(label_mapping).tolist()
test_labels = test_data["output_text"].map(label_mapping).tolist()

train_dataset = RecommendationDataset(train_encodings, train_labels)
test_dataset = RecommendationDataset(test_encodings, test_labels)

# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_mapping))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Step 4: Gradio Interface
def recommend(user_input):
    try:
        # Preprocess input
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Ensure model is in evaluation mode and doesn't update gradients
        model.eval()

        with torch.no_grad():
            outputs = model(**inputs)

        # Apply softmax to logits to get probabilities
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted category index
        prediction = torch.argmax(probabilities, dim=-1).item()

        # Map the predicted index to the corresponding category label
        predicted_category = list(label_mapping.keys())[prediction]

        return predicted_category

    except Exception as e:
        return f"Error: {str(e)}"

# Create Gradio interface
interface = gr.Interface(
    fn=recommend,
    inputs=gr.Textbox(label="Enter user details (e.g., Age, Gender, Location, Income, Interests, Purchase Frequency, Total Spending)"),
    outputs="text",
    title="Personalized Product Recommendation",
    description="Enter user details in the following format:\n"
                "Age: 25, Gender: Male, Location: Suburban, Income: 50000, Interests: Sports, "
                "Purchase Frequency: 5, Total Spending: 2000"
)

# Launch Gradio app
interface.launch(debug=True)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.3/73.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,1.6063,1.619072
2,1.6074,1.623101
3,1.6131,1.619429


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://6504bbf37dd950fd65.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://6504bbf37dd950fd65.gradio.live




In [2]:
# Step 4: Gradio Interface
def recommend(user_input):
    try:
        # Detect if CUDA (GPU) is available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to the correct device
        model.to(device)

        # Preprocess input
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Ensure model is in evaluation mode and doesn't update gradients
        model.eval()

        with torch.no_grad():
            # Forward pass (no gradients needed)
            outputs = model(**inputs)

        # Apply softmax to logits to get probabilities
        logits = outputs.logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)

        # Get the predicted category index
        prediction = torch.argmax(probabilities, dim=-1).item()

        # Map the predicted index to the corresponding category label
        predicted_category = list(label_mapping.keys())[prediction]

        return predicted_category

    except Exception as e:
        return f"Error: {str(e)}"

# Create Gradio interface
interface = gr.Interface(
    fn=recommend,
    inputs=gr.Textbox(label="Enter user details (e.g., Age, Gender, Location, Income, Interests, Purchase Frequency, Total Spending)"),
    outputs="text",
    title="Personalized Product Recommendation",
    description="Enter user details in the following format:\n"
                "Age: 25, Gender: Male, Location: Suburban, Income: 50000, Interests: Sports, "
                "Purchase Frequency: 5, Total Spending: 2000"
)

# Launch Gradio app
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b6aaa1c1455b93d209.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [3]:
trainer.evaluate()

{'eval_loss': 1.6194294691085815,
 'eval_runtime': 0.8876,
 'eval_samples_per_second': 225.319,
 'eval_steps_per_second': 28.165,
 'epoch': 3.0}