In [1]:
pip install pandas transformers torch google-colab


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [2]:
import pandas as pd
from google.colab import files
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

def upload_and_read_csv():
    uploaded = files.upload()
    for filename in uploaded.keys():
        df = pd.read_csv(filename)
        print(f"Dataset '{filename}' successfully loaded.")
        print(f"Columns: {df.columns.tolist()}")
        return df

def initialize_model():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    return tokenizer, model

def encode_text(texts, tokenizer, model):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def generate_recommendation(query, df, embeddings, model, tokenizer):
    query_embedding = encode_text([query], tokenizer, model)
    similarities = np.dot(embeddings, query_embedding.T).flatten()
    most_similar_idx = similarities.argmax()
    if similarities[most_similar_idx] < 0.5:
        return "Sorry, I couldn't find a strong recommendation based on your query."
    best_match = df.iloc[most_similar_idx]
    return best_match.to_dict(), similarities[most_similar_idx]

def main():
    df = upload_and_read_csv()
    tokenizer, model = initialize_model()
    combined_text = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    embeddings = encode_text(combined_text.tolist(), tokenizer, model)
    while True:
        user_query = input("\nWhat do you need a recommendation for? (type 'exit' to quit): ").strip()
        if user_query.lower() == 'exit':
            break
        recommendation = generate_recommendation(user_query, df, embeddings, model, tokenizer)

        if isinstance(recommendation, str):
            print(f"\n{recommendation}")
        else:
            print(f"\nAI Recommendation (Confidence {recommendation[1]:.2f}):\n{recommendation[0]}")

if __name__ == "__main__":
    main()


Saving blazers_large.csv to blazers_large.csv
Dataset 'blazers_large.csv' successfully loaded.
Columns: ['item_name', 'size', 'color', 'style', 'description']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


What do you need a recommendation for? (type 'exit' to quit): A blazer for an important meeting

AI Recommendation (Confidence 72.06):
{'item_name': 'Blazer', 'size': 'M', 'color': 'White', 'style': 'Semi-Formal', 'description': 'A White semi-formal blazer suitable for business meetings.'}

What do you need a recommendation for? (type 'exit' to quit): Something more formal

AI Recommendation (Confidence 49.47):
{'item_name': 'Blazer', 'size': 'M', 'color': 'Navy', 'style': 'Semi-Formal', 'description': 'A Navy blazer for a sophisticated look.'}

What do you need a recommendation for? (type 'exit' to quit): How about for a party

AI Recommendation (Confidence 53.37):
{'item_name': 'Blazer', 'size': 'L', 'color': 'Pink', 'style': 'Semi-Formal', 'description': 'A versatile Pink blazer that can be dressed up or down.'}

What do you need a recommendation for? (type 'exit' to quit): Something other than pink

AI Recommendation (Confidence 49.37):
{'item_name': 'Blazer', 'size': 'S', 'color'

In [6]:
import pandas as pd
from google.colab import files
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import re

def upload_and_read_csv():
    uploaded = files.upload()
    for filename in uploaded.keys():
        df = pd.read_csv(filename)
        print(f"Dataset '{filename}' successfully loaded.")
        print(f"Columns: {df.columns.tolist()}")
        return df

def initialize_model():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    return tokenizer, model

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

def encode_text(texts, tokenizer, model):
    texts = [clean_text(text) for text in texts]
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def generate_recommendation(query, df, embeddings, model, tokenizer):
    query_embedding = encode_text([query], tokenizer, model)
    similarities = np.dot(embeddings, query_embedding.T).flatten()
    if similarities.max() < 0.5:
        return "Sorry, I couldn't find a strong recommendation based on your query."
    most_similar_idx = similarities.argmax()
    best_match = df.iloc[most_similar_idx]
    return best_match.to_dict(), similarities[most_similar_idx]

def main():
    df = upload_and_read_csv()
    tokenizer, model = initialize_model()
    combined_text = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    embeddings = encode_text(combined_text.tolist(), tokenizer, model)
    while True:
        user_query = input("\nWhat do you need a recommendation for? (type 'exit' to quit): ").strip()
        if user_query.lower() == 'exit':
            break
        recommendation = generate_recommendation(user_query, df, embeddings, model, tokenizer)

        if isinstance(recommendation, str):
            print(f"\n{recommendation}")
        else:
            print(f"\nAI Recommendation (Confidence {recommendation[1]:.2f}):\n{recommendation[0]}")

if __name__ == "__main__":
    main()


Saving blazers_large.csv to blazers_large (4).csv
Dataset 'blazers_large (4).csv' successfully loaded.
Columns: ['item_name', 'size', 'color', 'style', 'description']

What do you need a recommendation for? (type 'exit' to quit): A blazer for wedding

AI Recommendation (Confidence 76.26):
{'item_name': 'Blazer', 'size': 'L', 'color': 'White', 'style': 'Semi-Formal', 'description': 'A versatile White blazer that can be dressed up or down.'}

What do you need a recommendation for? (type 'exit' to quit): exit
