In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset



In [4]:
# Read the fashion-items CSV into a DataFrame; later cells read its
# 'Description' and 'Recommended Matches' columns.
data = pd.read_csv('Final_Fashion_Items_Dataset_50k.csv')


In [5]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-uncased' vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every description, truncating and padding each one to max_length=512.
# NOTE(review): `tokenizer` and `encodings` are both reassigned by the
# DistilBERT preprocessing cell that follows, so this result does not appear
# to be consumed anywhere in the visible notebook.
descriptions = data['Description'].tolist()
encodings = tokenizer(descriptions, max_length=512, padding='max_length', truncation=True)



In [6]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import DistilBertTokenizer

# Read the raw CSV; the model input is 'Description' and the target is
# 'Recommended Matches' (a ', '-separated list of item names per row).
data = pd.read_csv('Final_Fashion_Items_Dataset_50k.csv')

# Peek at the raw target column before it is parsed.
recommended_matches = data['Recommended Matches']
print(recommended_matches.head())

# DistilBERT tokenizer; sequences are truncated to at most 128 tokens and
# returned as PyTorch tensors.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
descriptions = data['Description'].tolist()
encodings = tokenizer(descriptions, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Turn each row's list of matches into a multi-hot vector (one column per class).
mlb = MultiLabelBinarizer()
match_lists = recommended_matches.str.split(', ')
labels = mlb.fit_transform(match_lists)

# Sanity-check the encoded targets.
print(f"Shape of labels array: {labels.shape}")
print(f"Label classes: {mlb.classes_}")

# Float32 targets, one row per description.
labels_tensor = torch.tensor(labels, dtype=torch.float32)
print(f"Labels tensor shape: {labels_tensor.shape}")

# One (input_ids, attention_mask, label_vector) tuple per example, so the
# three pieces stay aligned when the examples are shuffled and split.
combined_data = list(zip(encodings['input_ids'], encodings['attention_mask'], labels_tensor))


0    High-Waisted Skinny Jeans, Faux Leather Mini S...
1    High-Waisted Skinny Jeans, Faux Leather Mini S...
2    Knit Pullover Sweater, Floral Maxi Dress, Athl...
3    Cropped Denim Jacket, Mesh Sports Leggings, Hi...
4     Graphic Print T-shirt, High-Top Basketball Shoes
Name: Recommended Matches, dtype: object
Shape of labels array: (50000, 23)
Label classes: ['Ankle Strap Heels' 'Athletic Track Jacket' 'Beaded Evening Clutch'
 'Boho Style Fringed Vest' 'Canvas Sneakers' 'Casual Linen Blouse'
 'Chunky Gold Hoop Earrings' 'Classic Trench Coat' 'Cropped Denim Jacket'
 'Elegant Satin Blouse' 'Faux Leather Mini Skirt' 'Floral Maxi Dress'
 'Graphic Print T-shirt' 'High-Top Basketball Shoes'
 'High-Waisted Skinny Jeans' 'Knit Pullover Sweater'
 'Leather Crossbody Bag' 'Mesh Sports Leggings'
 'Pleated Chiffon Wide-Leg Pants' 'Silk Evening Gown'
 'Sleeveless Skater Dress' 'Soft Cotton Scarf' 'Wool Fedora Hat']
Labels tensor shape: torch.Size([50000, 23])


In [8]:
# Hold out 20% of the examples for validation (fixed seed for repeatability).
train_data, val_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Each split is a list of (input_ids, attention_mask, labels) tuples:
# transpose it into three columns and stack every column back into one tensor.
train_input_ids, train_attention_mask, train_labels = (
    torch.stack(column) for column in zip(*train_data)
)
val_input_ids, val_attention_mask, val_labels = (
    torch.stack(column) for column in zip(*val_data)
)



In [18]:
class FashionDataset(Dataset):
    """Index-aligned view over token ids, attention masks and label vectors.

    The three containers are stored as given; item ``idx`` is assembled by
    indexing each of them with ``idx``.
    """

    def __init__(self, input_ids, attention_mask, labels):
        """Keep references to the three parallel, indexable containers."""
        self.labels = labels
        self.attention_mask = attention_mask
        self.input_ids = input_ids

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict keyed 'input_ids', 'attention_mask', 'labels'."""
        example = dict(
            input_ids=self.input_ids[idx],
            attention_mask=self.attention_mask[idx],
            labels=self.labels[idx],
        )
        return example

In [19]:
# Wrap the stacked train / validation tensors so the Trainer can index them.
train_dataset = FashionDataset(
    input_ids=train_input_ids,
    attention_mask=train_attention_mask,
    labels=train_labels,
)
val_dataset = FashionDataset(
    input_ids=val_input_ids,
    attention_mask=val_attention_mask,
    labels=val_labels,
)


In [10]:
# DistilBERT with a fresh classification head: one output per label class.
# The targets are multi-hot float vectors (several matches can be active for
# one description), so the multi-label objective is requested explicitly
# instead of being inferred from the label dtype at the first forward pass.
# The setting is stored in the model config and therefore travels with every
# saved checkpoint.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import Trainer, TrainingArguments

# Three epochs, 8 examples per device per step, validation at the end of each
# epoch; checkpoints and logs go under ./results. Every other setting is left
# at the TrainingArguments default.
training_args = TrainingArguments(
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    output_dir='./results',
)

# The Trainer owns the optimisation loop over the train / validation datasets.
trainer = Trainer(eval_dataset=val_dataset, train_dataset=train_dataset, args=training_args, model=model)

# Kept as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()


  0%|          | 0/15000 [08:25<?, ?it/s]
  3%|▎         | 500/15000 [00:53<22:23, 10.80it/s] 

{'loss': 0.3043, 'grad_norm': 0.7466734647750854, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.1}


  7%|▋         | 1000/15000 [01:38<19:53, 11.73it/s] 

{'loss': 0.2661, 'grad_norm': 0.6128305196762085, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}


 10%|█         | 1500/15000 [02:23<19:37, 11.46it/s]  

{'loss': 0.2636, 'grad_norm': 0.5651799440383911, 'learning_rate': 4.5e-05, 'epoch': 0.3}


 13%|█▎        | 2000/15000 [03:08<19:04, 11.36it/s]

{'loss': 0.2633, 'grad_norm': 0.6109074354171753, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}


 17%|█▋        | 2500/15000 [03:54<18:11, 11.45it/s]

{'loss': 0.2627, 'grad_norm': 0.5381370782852173, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}


 20%|██        | 3000/15000 [04:39<17:36, 11.35it/s]

{'loss': 0.2618, 'grad_norm': 0.5983831286430359, 'learning_rate': 4e-05, 'epoch': 0.6}


 23%|██▎       | 3500/15000 [05:27<17:10, 11.16it/s]

{'loss': 0.2609, 'grad_norm': 0.5886273980140686, 'learning_rate': 3.8333333333333334e-05, 'epoch': 0.7}


 27%|██▋       | 4000/15000 [06:14<16:25, 11.16it/s]

{'loss': 0.2597, 'grad_norm': 0.5998498797416687, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}


 30%|███       | 4500/15000 [06:59<15:28, 11.31it/s]

{'loss': 0.2592, 'grad_norm': 0.6379507780075073, 'learning_rate': 3.5e-05, 'epoch': 0.9}


 33%|███▎      | 5000/15000 [07:44<14:41, 11.34it/s]

{'loss': 0.2606, 'grad_norm': 0.6207805871963501, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


                                                    
 33%|███▎      | 5001/15000 [08:52<33:37:50, 12.11s/it]

{'eval_loss': 0.2578980028629303, 'eval_runtime': 67.0385, 'eval_samples_per_second': 149.168, 'eval_steps_per_second': 18.646, 'epoch': 1.0}


 37%|███▋      | 5500/15000 [09:36<13:36, 11.64it/s]   

{'loss': 0.2596, 'grad_norm': 0.5176799297332764, 'learning_rate': 3.1666666666666666e-05, 'epoch': 1.1}


 40%|████      | 6000/15000 [10:21<13:02, 11.50it/s]

{'loss': 0.2588, 'grad_norm': 0.5850674510002136, 'learning_rate': 3e-05, 'epoch': 1.2}


 43%|████▎     | 6500/15000 [11:06<12:33, 11.28it/s]

{'loss': 0.2597, 'grad_norm': 0.49838075041770935, 'learning_rate': 2.8333333333333335e-05, 'epoch': 1.3}


 47%|████▋     | 7000/15000 [11:51<11:40, 11.42it/s]

{'loss': 0.2593, 'grad_norm': 0.49713367223739624, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.4}


 50%|█████     | 7500/15000 [12:36<11:10, 11.18it/s]

{'loss': 0.2587, 'grad_norm': 0.4166128933429718, 'learning_rate': 2.5e-05, 'epoch': 1.5}


 53%|█████▎    | 8000/15000 [13:20<09:59, 11.68it/s]

{'loss': 0.2586, 'grad_norm': 0.4045802354812622, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}


 57%|█████▋    | 8500/15000 [14:05<09:30, 11.40it/s]

{'loss': 0.2584, 'grad_norm': 0.38991448283195496, 'learning_rate': 2.1666666666666667e-05, 'epoch': 1.7}


 60%|██████    | 9000/15000 [14:50<08:36, 11.62it/s]

{'loss': 0.2585, 'grad_norm': 0.4205186069011688, 'learning_rate': 2e-05, 'epoch': 1.8}


 63%|██████▎   | 9500/15000 [15:34<08:02, 11.39it/s]

{'loss': 0.2577, 'grad_norm': 0.4141974151134491, 'learning_rate': 1.8333333333333333e-05, 'epoch': 1.9}


 67%|██████▋   | 10000/15000 [16:20<07:30, 11.10it/s]

{'loss': 0.2582, 'grad_norm': 0.4140099883079529, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


                                                     
 67%|██████▋   | 10001/15000 [16:35<3:44:23,  2.69s/it]

{'eval_loss': 0.25672781467437744, 'eval_runtime': 13.5031, 'eval_samples_per_second': 740.569, 'eval_steps_per_second': 92.571, 'epoch': 2.0}


 70%|███████   | 10500/15000 [17:20<06:48, 11.01it/s]  

{'loss': 0.2574, 'grad_norm': 0.5119720101356506, 'learning_rate': 1.5e-05, 'epoch': 2.1}


 73%|███████▎  | 11000/15000 [18:06<05:50, 11.40it/s]

{'loss': 0.2578, 'grad_norm': 0.3671681880950928, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.2}


 77%|███████▋  | 11500/15000 [18:51<05:07, 11.38it/s]

{'loss': 0.2595, 'grad_norm': 0.4591037333011627, 'learning_rate': 1.1666666666666668e-05, 'epoch': 2.3}


 80%|████████  | 12000/15000 [19:36<04:26, 11.27it/s]

{'loss': 0.2571, 'grad_norm': 0.3896959722042084, 'learning_rate': 1e-05, 'epoch': 2.4}


 83%|████████▎ | 12500/15000 [20:21<03:41, 11.27it/s]

{'loss': 0.2566, 'grad_norm': 0.3329217731952667, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


 87%|████████▋ | 13000/15000 [21:07<02:57, 11.24it/s]

{'loss': 0.2582, 'grad_norm': 0.37055760622024536, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.6}


 90%|█████████ | 13500/15000 [21:53<02:14, 11.13it/s]

{'loss': 0.2578, 'grad_norm': 0.4167674481868744, 'learning_rate': 5e-06, 'epoch': 2.7}


 93%|█████████▎| 14000/15000 [22:39<01:28, 11.31it/s]

{'loss': 0.2567, 'grad_norm': 0.37434881925582886, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}


 97%|█████████▋| 14500/15000 [23:25<00:43, 11.38it/s]

{'loss': 0.258, 'grad_norm': 0.40314170718193054, 'learning_rate': 1.6666666666666667e-06, 'epoch': 2.9}


100%|██████████| 15000/15000 [24:10<00:00, 11.18it/s]

{'loss': 0.2568, 'grad_norm': 0.3662957549095154, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 15000/15000 [24:24<00:00, 10.24it/s]

{'eval_loss': 0.2561521828174591, 'eval_runtime': 13.124, 'eval_samples_per_second': 761.961, 'eval_steps_per_second': 95.245, 'epoch': 3.0}
{'train_runtime': 1464.8859, 'train_samples_per_second': 81.918, 'train_steps_per_second': 10.24, 'train_loss': 0.2608523142496745, 'epoch': 3.0}





TrainOutput(global_step=15000, training_loss=0.2608523142496745, metrics={'train_runtime': 1464.8859, 'train_samples_per_second': 81.918, 'train_steps_per_second': 10.24, 'total_flos': 341645412240000.0, 'train_loss': 0.2608523142496745, 'epoch': 3.0})

In [23]:
# Run evaluation with the trainer's configured eval dataset and print the
# returned metrics dict (only loss and throughput figures are reported here,
# since no compute_metrics function was passed to the Trainer).
results = trainer.evaluate()
print(results)


100%|██████████| 1250/1250 [00:13<00:00, 91.57it/s]

{'eval_loss': 0.2561521828174591, 'eval_runtime': 13.6681, 'eval_samples_per_second': 731.63, 'eval_steps_per_second': 91.454, 'epoch': 3.0}





In [26]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Directory of the checkpoint written at the final training step.
model_path = './results/checkpoint-15000'  # Adjust if your model is saved elsewhere

# Restore the fine-tuned classifier, then the tokenizer of the base model it
# was fine-tuned from.
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [27]:
import pandas as pd

# Re-read the dataset and pull the description column out as a plain Python
# list, the input format passed to the tokenizer elsewhere in this notebook.
data = pd.read_csv('Final_Fashion_Items_Dataset_50k.csv')
texts = data['Description'].tolist()  # Adjust the column name if your CSV uses a different one


In [28]:
def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` containing at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; the final slice is shorter
    when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    chunk_starts = range(0, total, n)
    yield from (iterable[start:min(start + n, total)] for start in chunk_starts)

# Number of items per chunk; adjust based on your memory constraints.
batch_size = 32

In [32]:
# Load your dataset
data = pd.read_csv('Final_Fashion_Items_Dataset_50k.csv')
descriptions = data['Description'].tolist()

# Tokenizer matching the base model the checkpoint was fine-tuned from
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the fine-tuned model onto the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DistilBertForSequenceClassification.from_pretrained('./results/checkpoint-15000').to(device)
model.eval()

# Recover the index -> label mapping by fitting the binarizer exactly as in
# the preprocessing cell; classes_ holds the labels in output-column order.
mlb = MultiLabelBinarizer()
mlb.fit(data['Recommended Matches'].str.split(', '))
classes = mlb.classes_
if len(classes) != model.config.num_labels:
    # Fail loudly rather than silently attaching the wrong names to the outputs.
    raise ValueError(
        f"Checkpoint has {model.config.num_labels} outputs but the dataset "
        f"yields {len(classes)} label classes"
    )
index_to_label = {i: label for i, label in enumerate(classes)}

# Make predictions in mini-batches. Tokenizing and running every description
# in a single forward pass needs memory proportional to the whole dataset;
# `batch` / `batch_size` from the helper cell above bound it to one chunk.
# Only the small per-chunk logits are kept, and they are moved back to the CPU.
logit_chunks = []
with torch.no_grad():
    for text_chunk in batch(descriptions, batch_size):
        tokenized_data = tokenizer(text_chunk, padding=True, truncation=True, max_length=128, return_tensors="pt")
        input_ids = tokenized_data['input_ids'].to(device)
        attention_mask = tokenized_data['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logit_chunks.append(outputs.logits.cpu())

# An empty dataset yields an empty (0, n_classes) result instead of a torch.cat error.
logits = torch.cat(logit_chunks) if logit_chunks else torch.empty((0, len(classes)))

# The head was trained with independent multi-hot targets, so each output is
# its own yes/no decision: sigmoid gives per-label probabilities, whereas
# softmax would force the labels to compete for a single unit of mass.
probabilities = torch.sigmoid(logits)

# Single best label per row. argmax is taken over the raw logits, which gives
# the same ranking softmax or sigmoid would and avoids saturation ties.
predicted_indices = logits.argmax(dim=-1).numpy()
predicted_labels = [index_to_label[int(idx)] for idx in predicted_indices]

# Full multi-label prediction: every label whose probability clears the threshold.
PREDICTION_THRESHOLD = 0.5
above_threshold = (probabilities >= PREDICTION_THRESHOLD).numpy()
predicted_matches = [', '.join(classes[row]) for row in above_threshold]

# Add predictions to the original dataframe. 'Predicted Category' keeps its
# original meaning (top-1 label); 'Predicted Matches' is the additional
# ', '-joined multi-label output (empty string when nothing clears the threshold).
data['Predicted Category'] = predicted_labels
data['Predicted Matches'] = predicted_matches

# Save or process further as needed
data.to_csv('predictions.csv', index=False)

