# 1. Importing the CSV

In [5]:
import pandas as pd
# Read the raw dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Dataset.csv')
# Preview the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Unnamed: 0,Prompt,Feature Vector
0,I want to open a new cake shop. Suggest me reg...,"[ 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,..."
1,Can you recommend areas with a strong market f...,"[ 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,..."
2,What are some prime locations for opening a ne...,"[ 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,..."
3,Where should I open a grocery shop?,"[ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,..."
4,What are the best regions to open a grocery st...,"[ 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,..."


# 2. Preprocessing

In [32]:
import pandas as pd
import ast

# The df.head() preview of the raw CSV shows the columns 'Prompt' and
# 'Feature Vector', while every later cell reads 'query' and 'label'. Without
# this rename a fresh-kernel run stops with a KeyError. rename() skips keys
# that are absent, so re-running the cell after the rename is harmless.
# NOTE(review): mapping inferred from the preview output - confirm against Dataset.csv.
df = df.rename(columns={'Prompt': 'query', 'Feature Vector': 'label'})

# Remove double quotes from the query text (literal match, not a regex)
df['query'] = df['query'].str.replace('"', '', regex=False)

# Parse stringified label vectors such as "[1, 0, 1]" into Python lists;
# values that are not strings are passed through unchanged
df['label'] = df['label'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Turn the label lists into comma-separated strings only in the exported copy.
# df['label'] itself keeps the list form because the training cell passes each
# entry to torch.tensor(), which cannot convert a string such as "1,0,1".
preprocessed = df.assign(label=df['label'].apply(lambda x: ','.join(map(str, x))))
preprocessed.to_csv("Preprocessed.csv", index=False)



# 3. Implementing BERT Model

In [29]:
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, Dataset

# Custom dataset class for shop data
class ShopDataset(Dataset):
    """Pairs each shop query with its binary label vector for multi-label training.

    Args:
        queries: Indexable sequence of query strings.
        labels: Indexable sequence with one label vector per query. An entry may
            be a sequence of numbers or a string of comma-separated numbers,
            optionally wrapped in square brackets (e.g. ``"1,0,1"``).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading dimension of size 1.
        max_length: Length every tokenized query is padded/truncated to.
    """

    def __init__(self, queries, labels, tokenizer, max_length=128):
        self.queries = queries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of queries."""
        return len(self.queries)

    @staticmethod
    def _parse_label(label):
        """Decode a comma-separated label string into floats; pass other values through."""
        if isinstance(label, str):
            tokens = label.strip().strip('[]').split(',')
            return [float(token) for token in tokens if token.strip()]
        return label

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for the example at position ``idx``.

        ``inputs`` maps each tokenizer output name to its tensor with the leading
        size-1 dimension removed; ``label`` is a float tensor of the label vector.
        """
        # Tokenize the query
        inputs = self.tokenizer(self.queries[idx], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        # return_tensors="pt" yields a leading dimension of 1; drop it so the
        # DataLoader can stack examples into a batch
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}
        # Labels exported by the preprocessing step are strings like "1,0,1",
        # which torch.tensor() cannot convert directly
        label = torch.tensor(self._parse_label(self.labels[idx]), dtype=torch.float)  # Binary vector as tensor
        return inputs, label

# Initialize the tokenizer and the pretrained BERT encoder
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Number of binary outputs predicted per query. The `semantics` list in this
# notebook, which names each output position, has 37 entries.
NUM_LABELS = 37

# Linear head mapping the encoder's hidden vector to one logit per binary output
classification_head = torch.nn.Linear(model.config.hidden_size, NUM_LABELS)

# Example training function
def train_model(model, classification_head, dataset, epochs=9):
    """Jointly fine-tune the encoder and the linear head with a multi-label BCE loss.

    Args:
        model: Encoder called as ``model(**inputs)``; its output must expose
            ``last_hidden_state``.
        classification_head: Module mapping the first-position embedding to logits.
        dataset: Dataset yielding ``(inputs, labels)`` pairs.
        epochs: Number of full passes over ``dataset``.

    Prints the mean batch loss after every epoch and returns nothing.
    """
    loader = DataLoader(dataset, batch_size=8, shuffle=True)
    trainable = [*model.parameters(), *classification_head.parameters()]
    optimizer = torch.optim.Adam(trainable, lr=2e-5)
    criterion = BCEWithLogitsLoss()

    # Put both modules in training mode before the first batch
    for module in (model, classification_head):
        module.train()

    for epoch_number in range(1, epochs + 1):
        batch_losses = []
        for inputs, targets in loader:
            # Sequence position 0 carries the [CLS] embedding
            cls_embedding = model(**inputs).last_hidden_state[:, 0, :]
            loss = criterion(classification_head(cls_embedding), targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        print(f"Epoch {epoch_number}/{epochs}, Loss: {sum(batch_losses) / len(loader)}")

# Pull the query texts and their label vectors out of the DataFrame as plain lists
queries, labels = df['query'].tolist(), df['label'].tolist()

# Wrap them in a dataset and fine-tune with the default number of epochs
dataset = ShopDataset(queries=queries, labels=labels, tokenizer=tokenizer)
train_model(model=model, classification_head=classification_head, dataset=dataset)

# Predict function
def get_binary_embedding(query, threshold=0.5, max_length=128):
    """Predict the binary attribute vector for a query.

    Uses the module-level ``tokenizer``, ``model`` and ``classification_head``,
    and switches both modules to evaluation mode as a side effect.

    Args:
        query: Query text to classify.
        threshold: An output is 1 when its sigmoid probability is strictly
            greater than this value, otherwise 0.
        max_length: Length the tokenized query is padded/truncated to.

    Returns:
        A list of 0/1 integers, one per output of ``classification_head``.
    """
    model.eval()
    classification_head.eval()

    inputs = tokenizer(query, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    with torch.no_grad():
        # Sequence position 0 is used as the query representation
        cls_embedding = model(**inputs).last_hidden_state[:, 0, :]
        probabilities = torch.sigmoid(classification_head(cls_embedding))
        binary_embedding = (probabilities > threshold).int()  # Thresholding to get binary output

    # Drop only the batch dimension so the result stays a list even when the
    # head has a single output
    return binary_embedding.squeeze(0).tolist()

# Example prediction: run one free-text shop description through the trained model
query = "A family-friendly cafe with outdoor seating, free Wi-Fi, and takeaway options, located near a park in an upper-middle-class neighborhood."
embedding = get_binary_embedding(query=query)
# Show the predicted 0/1 vector
print("Binary embedding:", embedding)




Epoch 1/9, Loss: 0.6849106550216675
Epoch 2/9, Loss: 0.6167730569839478
Epoch 3/9, Loss: 0.5944967985153198
Epoch 4/9, Loss: 0.5813985228538513
Epoch 5/9, Loss: 0.5678684949874878
Epoch 6/9, Loss: 0.5568337559700012
Epoch 7/9, Loss: 0.5438450694084167
Epoch 8/9, Loss: 0.5299333095550537
Epoch 9/9, Loss: 0.5150486111640931
Binary embedding: [1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1]


## 3.1. Semantics of the Output

In [30]:
# Human-readable question for each position of the predicted binary vector;
# entry i describes output i.
semantics = [
    "Is it located in residential area?",
    "Is it located in commercial/industrial area?",
    "Are the people living in the area of High Class?",
    "Are the people living in the area of Upper Middle Class?",
    "Are the people living in the area of Lower Middle Class?",
    "Are the people living in the area of Lower Class?",
    "Are the people living in the area of age range 18-22?",
    "Are the people living in the area of age range 22-60?",
    "Are the people living in the area of age greater than 60?",
    "Is the area located in rural region?",
    "Is the area located in urban region?",
    "Is it located in a mall ",
    "Is it stand-alone?",
    "Does it offer essential goods ?",
    "Does it offer luxury items ?",
    "Is it a hardware shop?",
    "Is it an apparel shop?",
    "Does it focus on eco-friendly products?",
    "Is it a wholesale shop?",
    "Does it sell seasonal products?",
    "Is it open 24/7?",
    "Is the shop part of a chain ",
    "Is the shop independent ? ",
    "Is it accessible for people with disabilities?",
    "Is it a tourist shop?",
    "custom or personalized products?",
    "Generic products?",
    "Is it located near educational institutions?",
    "Does it offer an in-store pickup for online orders?",
    "Does it offer express delivery services?",
    "Does it have a social media presence?",
    "Is it child-friendly (e.g., has play areas)?",
    "Does it sell imported products?",
    "Does it cater to a niche market?",
    "Does it feature local products?",
    "Is it a pet-friendly store?",
    "Does it operate only during specific seasons?",
]

# Every question must have exactly one predicted bit; fail loudly on a mismatch
# instead of indexing with a hard-coded count.
assert len(semantics) == len(embedding), (
    f"expected {len(semantics)} predictions, got {len(embedding)}"
)

# Print each question next to its predicted 0/1 answer
for question, answer in zip(semantics, embedding):
  print(f'{question}: {answer}')

Is it located in residential area?: 1
Is it located in commercial/industrial area?: 1
Are the people living in the area of High Class?: 1
Are the people living in the area of Upper Middle Class?: 1
Are the people living in the area of Lower Middle Class?: 1
Are the people living in the area of Lower Class?: 0
Are the people living in the area of age range 18-22?: 1
Are the people living in the area of age range 22-60?: 1
Are the people living in the area of age greater than 60?: 0
Is the area located in rural region?: 0
Is the area located in urban region?: 1
Is it located in a mall : 0
Is it stand-alone?: 1
Does it offer essential goods ?: 0
Does it offer luxury items ?: 0
Is it a hardware shop?: 0
Is it an apparel shop?: 0
Does it focus on eco-friendly products?: 1
Is it a wholesale shop?: 1
Does it sell seasonal products?: 1
Is it open 24/7?: 0
Is the shop part of a chain : 0
Is the shop independent ? : 1
Is it accessible for people with disabilities?: 1
Is it a tourist shop?: 0
cus

## 3.2. Saving the Model

In [34]:
import torch

# Save the fine-tuned BERT encoder weights (file name and contents unchanged)
torch.save(model.state_dict(), "bert_classifier_state_dict.pth")
print("Model state dictionary saved to 'bert_classifier_state_dict.pth'")

# classification_head is a separate torch.nn.Linear and is not included in
# model.state_dict(); without its weights the trained classifier cannot be
# restored, so store them in a companion file.
torch.save(classification_head.state_dict(), "classification_head_state_dict.pth")
print("Classification head state dictionary saved to 'classification_head_state_dict.pth'")


Model state dictionary saved to 'bert_classifier_state_dict.pth'
