In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import os
from tqdm import tqdm

# Pin the NumPy and PyTorch random number generators so repeated runs
# draw the same random values.
np.random.seed(42)
torch.manual_seed(42)

# Load the data
def load_data(file_path):
    """Load the retail dataset and normalise its product descriptions.

    The file is parsed as an Excel workbook first; if that fails it is
    re-read as CSV. Rows without a description are dropped, every remaining
    description is passed through ``clean_text``, and rows whose cleaned
    description is empty are removed.

    Args:
        file_path: Path to the Excel (or CSV) file to load.

    Returns:
        pandas.DataFrame: An independent copy of the data with a cleaned,
        non-empty 'Description' column.

    Raises:
        ValueError: If no description-like column can be found.
        Exception: Whatever the CSV reader raised when both the Excel and
            the CSV attempt fail.
    """
    try:
        # Try reading as Excel file
        df = pd.read_excel(file_path)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        try:
            # If Excel reading fails, try as CSV (the original re-ran
            # read_excel here, so the fallback could never succeed)
            df = pd.read_csv(file_path)
        except Exception as e2:
            print(f"Error reading CSV file: {e2}")
            raise

    print(f"Original data shape: {df.shape}")

    # Display the schema
    print("Dataset columns:")
    print(df.columns.tolist())

    if 'Description' not in df.columns:
        # Try to find a column that might contain product descriptions.
        # str() keeps the search working when a header is not a string.
        text_columns = [
            col for col in df.columns
            if any(x in str(col).lower() for x in ['desc', 'prod', 'item', 'name'])
        ]
        if not text_columns:
            raise ValueError("Could not find a description column in the dataset")
        print(f"Using '{text_columns[0]}' as the description column")
        df = df.rename(columns={text_columns[0]: 'Description'})

    # Drop rows with missing descriptions
    df = df.dropna(subset=['Description'])

    # Clean descriptions; assign() builds a new frame instead of writing
    # into the result of dropna()
    df = df.assign(Description=df['Description'].astype(str).map(clean_text))

    # Remove empty descriptions. The explicit copy detaches the result from
    # the filtered view so later column assignments do not emit
    # chained-assignment warnings.
    df = df[df['Description'].str.strip() != ''].copy()

    print(f"Processed data shape: {df.shape}")
    return df

def clean_text(text):
    """Normalise a description: lowercase it, turn every character that is
    neither a word character nor whitespace into a space, then collapse
    whitespace runs to single spaces and trim both ends."""
    lowered = text.lower()
    without_symbols = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_symbols)
    return collapsed.strip()

def create_categories(df, n_clusters=18, max_features=500, model_dir='models'):
    """Derive pseudo-categories by clustering TF-IDF vectors of the descriptions.

    The fitted vectorizer and clustering model are also written to
    ``model_dir`` so they can be reloaded later.

    Args:
        df: DataFrame with a text 'Description' column.
        n_clusters: Number of clusters (and therefore categories) to create.
        max_features: Vocabulary size limit passed to the TF-IDF vectorizer.
        model_dir: Directory that receives 'tfidf_vectorizer.pkl' and
            'kmeans_model.pkl'; created if missing.

    Returns:
        tuple: (copy of ``df`` with a string 'Category' column of the form
        'Category_<cluster id>', fitted TfidfVectorizer, fitted
        MiniBatchKMeans).
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.cluster import MiniBatchKMeans
    import joblib

    print("Creating categories using clustering...")

    # Work on a copy: the input may be a filtered view, and the caller's
    # frame should not silently gain a column.
    df = df.copy()

    # Create TF-IDF features over a capped vocabulary
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['Description'])

    # Cluster with MiniBatchKMeans, fitting on small batches of rows
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=100)
    df['Category'] = kmeans.fit_predict(tfidf_matrix)

    # Convert numeric clusters to string categories
    df['Category'] = 'Category_' + df['Category'].astype(str)

    # Save the vectorizer and kmeans model for later use
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(vectorizer, os.path.join(model_dir, 'tfidf_vectorizer.pkl'))
    joblib.dump(kmeans, os.path.join(model_dir, 'kmeans_model.pkl'))

    # Print category distribution
    print("Category distribution:")
    print(df['Category'].value_counts())

    return df, vectorizer, kmeans

# Create PyTorch dataset with reduced sequence length
class ProductDataset(Dataset):
    """Map-style dataset that tokenizes product descriptions on access.

    Args:
        descriptions: Sequence of description strings (list, NumPy array or
            pandas Series).
        labels: Sequence of integer class ids, one per description.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``descriptions`` and ``labels`` differ in length.
    """

    def __init__(self, descriptions, labels, tokenizer, max_length=64):
        # Materialise both inputs as plain lists so that __getitem__ is
        # always positional; indexing a pandas Series with an integer is
        # label-based and breaks on a non-default index.
        self.descriptions = list(descriptions)
        self.labels = list(labels)
        if len(self.descriptions) != len(self.labels):
            raise ValueError(
                f"descriptions and labels must have the same length "
                f"({len(self.descriptions)} != {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx`` as a dict of tensors."""
        text = self.descriptions[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer is asked for 'pt' tensors for a single text; flatten
        # so the DataLoader can stack the samples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def _train_one_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0

    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad()

        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device)
        )

        loss = outputs.loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

    return running_loss / len(loader)


def _evaluate_epoch(model, loader, device):
    """Score ``model`` on ``loader`` without gradients; return (mean batch loss, accuracy)."""
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=labels
            )

            running_loss += outputs.loss.item()

            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return running_loss / len(loader), correct / total


def train_model(df, num_epochs=2, batch_size=32, learning_rate=5e-5,
                max_length=64, max_samples=10000, model_dir='models'):
    """Fine-tune DistilBERT to predict 'Category' from 'Description'.

    Side effects: adds a 'label_encoded' column to ``df``, and writes the
    label encoder, the tokenizer and the best checkpoint (lowest validation
    loss) under ``model_dir``.

    Args:
        df: DataFrame with 'Description' (text) and 'Category' columns.
        num_epochs: Number of passes over the training split.
        batch_size: Training batch size; validation uses twice this value.
        learning_rate: AdamW learning rate.
        max_length: Token length each description is padded/truncated to.
        max_samples: If ``df`` has more rows than this, a random subset of
            this size is used; ``None`` disables subsampling.
        model_dir: Directory that receives the saved artifacts.

    Returns:
        tuple: (model, tokenizer, label_encoder). The returned model carries
        the weights of the final epoch, which may differ from the best
        checkpoint saved on disk.
    """
    import joblib

    # Encode categories
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['Category'])

    # Save the label encoder
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(label_encoder, os.path.join(model_dir, 'label_encoder.pkl'))

    # Sample a subset of data to speed up training (optional)
    if max_samples is not None and len(df) > max_samples:
        df = df.sample(max_samples, random_state=42)
        print(f"Using a subset of {len(df)} samples for faster training")

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['Description'].values,
        df['label_encoded'].values,
        test_size=0.2,
        random_state=42
    )

    # Load DistilBERT tokenizer and model; the classification head is sized
    # from every class the encoder saw, even if the subset lacks some
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(label_encoder.classes_)
    )

    # Save the tokenizer for later use
    tokenizer.save_pretrained(os.path.join(model_dir, 'tokenizer'))

    # Create datasets
    train_dataset = ProductDataset(train_texts, train_labels, tokenizer, max_length=max_length)
    val_dataset = ProductDataset(val_texts, val_labels, tokenizer, max_length=max_length)

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size * 2  # No gradients are kept, so a larger batch fits
    )

    # Initialize optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model = model.to(device)

    # Training loop
    best_val_loss = float('inf')
    checkpoint_dir = os.path.join(model_dir, 'distilbert_model')

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        avg_train_loss = _train_one_epoch(model, train_loader, optimizer, device)
        print(f"Average training loss: {avg_train_loss:.4f}")

        avg_val_loss, accuracy = _evaluate_epoch(model, val_loader, device)
        print(f"Validation loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            print("Saving best model...")
            model.save_pretrained(checkpoint_dir)

    print("Training complete!")
    return model, tokenizer, label_encoder

def main():
    """Run the pipeline end to end and print a few sample predictions.

    Loads 'Online Retail.xlsx' from the working directory, derives cluster
    categories, fine-tunes the classifier, then classifies up to five
    randomly chosen descriptions with the trained model. Returns early with
    a message when the input file does not exist.
    """
    file_path = 'Online Retail.xlsx'
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found")
        return
    os.makedirs('models', exist_ok=True)
    df = load_data(file_path)
    # The fitted vectorizer and KMeans model are persisted by
    # create_categories and not needed again here.
    df, _, _ = create_categories(df)
    model, tokenizer, label_encoder = train_model(df)

    print("Process completed successfully!")
    print("Models saved in 'models/' directory")

    # sample(5) raises when fewer than five rows exist, so cap the request
    sample_texts = df['Description'].sample(min(5, len(df))).tolist()
    print("\nSample predictions:")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    for text in sample_texts:
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=64,  # Use the same max_length as training
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            category = label_encoder.inverse_transform([predicted.item()])[0]

        print(f"Text: {text}")
        print(f"Predicted category: {category}\n")

# Run the training pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Original data shape: (541909, 8)
Dataset columns:
['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']
Processed data shape: (540400, 8)
Creating categories using clustering...
Category distribution:
Category
Category_14    254706
Category_13     50216
Category_17     25354
Category_1      24989
Category_11     24109
Category_0      23910
Category_7      20033
Category_3      19421
Category_9      17618
Category_2      17545
Category_15     15059
Category_6       9941
Category_16      9844
Category_5       9676
Category_8       5183
Category_12      4736
Category_10      4188
Category_4       3872
Name: count, dtype: int64
Using a subset of 10000 samples for faster training


tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.65MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 13.4MB/s]
config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 
model.safetensors: 100%|██████████| 268M/268M [00:03<00:00, 88.0MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu
Epoch 1/2


Training: 100%|██████████| 250/250 [17:51<00:00,  4.29s/it]


Average training loss: 0.6310


Validation: 100%|██████████| 32/32 [01:22<00:00,  2.57s/it]


Validation loss: 0.0793, Accuracy: 0.9855
Saving best model...
Epoch 2/2


Training: 100%|██████████| 250/250 [19:08<00:00,  4.59s/it]


Average training loss: 0.0491


Validation: 100%|██████████| 32/32 [01:37<00:00,  3.06s/it]


Validation loss: 0.0413, Accuracy: 0.9925
Saving best model...
Training complete!
Process completed successfully!
Models saved in 'models/' directory

Sample predictions:
Text: peg bag apples design
Predicted category: Category_17

Text: vintage paisley stationery set
Predicted category: Category_13

Text: water damaged
Predicted category: Category_6

Text: vintage doily travel sewing kit
Predicted category: Category_4

Text: french enamel candleholder
Predicted category: Category_14



In [None]:
import torch
import joblib
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import re

class ProductCategorizer:
    """Predict product categories from free-text descriptions.

    Two back ends are available: the fine-tuned DistilBERT classifier
    ('transformer') and the TF-IDF + KMeans clusterer ('clustering').
    """

    # Prediction back ends accepted by predict_category / batch_predict
    _METHODS = ('transformer', 'clustering')

    def __init__(self, model_dir='models', max_length=64):
        """
        Initialize the product categorizer by loading all required models.

        Args:
            model_dir (str): Directory containing the saved models
            max_length (int): Token length inputs are padded/truncated to;
                should match the value used during training
        """
        # Load DistilBERT model and tokenizer
        self.model = DistilBertForSequenceClassification.from_pretrained(f"{model_dir}/distilbert_model")
        self.tokenizer = DistilBertTokenizer.from_pretrained(f"{model_dir}/tokenizer")

        # NOTE: joblib.load unpickles arbitrary Python objects, so model_dir
        # must only ever point at artifacts from a trusted source.
        # Load label encoder
        self.label_encoder = joblib.load(f"{model_dir}/label_encoder.pkl")

        # Load TF-IDF vectorizer and KMeans model (for alternative clustering-based categorization)
        self.vectorizer = joblib.load(f"{model_dir}/tfidf_vectorizer.pkl")
        self.kmeans = joblib.load(f"{model_dir}/kmeans_model.pkl")

        self.max_length = max_length

        # Set device (GPU if available, else CPU)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()  # Set model to evaluation mode

        print(f"Models loaded successfully. Using device: {self.device}")
        print(f"Available categories: {', '.join(map(str, self.label_encoder.classes_))}")

    def clean_text(self, text):
        """Lowercase ``text``, replace every character that is neither a word
        character nor whitespace with a space, collapse whitespace runs and
        trim the ends."""
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and extra spaces
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def predict_category(self, product_description, method='transformer'):
        """
        Predict the category for a given product description.

        Args:
            product_description (str): The product description text
            method (str): Either 'transformer' to use the DistilBERT model or 'clustering' to use TF-IDF + KMeans

        Returns:
            tuple: (predicted_category, confidence_score) for transformer method
                   (predicted_category, None) for clustering method

        Raises:
            ValueError: If ``method`` is neither 'transformer' nor 'clustering'
        """
        # Clean text
        clean_description = self.clean_text(product_description)

        if method == 'transformer':
            # Tokenize input
            encoding = self.tokenizer(
                clean_description,
                add_special_tokens=True,
                max_length=self.max_length,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

            # Move tensors to device
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            # Get prediction
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
                confidence, predicted_idx = torch.max(probabilities, 1)

            # Convert prediction to category label
            category = self.label_encoder.inverse_transform([predicted_idx.item()])[0]
            return category, confidence.item()

        if method == 'clustering':
            # Transform text using TF-IDF
            text_vector = self.vectorizer.transform([clean_description])

            # Predict cluster
            cluster_id = self.kmeans.predict(text_vector)[0]
            return f"Category_{cluster_id}", None

        raise ValueError("Method must be either 'transformer' or 'clustering'")

    def batch_predict(self, descriptions_list, method='transformer'):
        """
        Predict categories for a list of product descriptions.

        Args:
            descriptions_list (list): List of product description strings
            method (str): Either 'transformer' or 'clustering'

        Returns:
            pandas.DataFrame: One row per description with columns
            'description' and 'category', plus 'confidence' for the
            transformer method. The columns are present even when
            ``descriptions_list`` is empty.

        Raises:
            ValueError: If ``method`` is neither 'transformer' nor 'clustering'
        """
        if method not in self._METHODS:
            raise ValueError("Method must be either 'transformer' or 'clustering'")

        # Fix the schema up front so an empty input still yields the columns
        # callers index into.
        columns = ['description', 'category']
        if method == 'transformer':
            columns.append('confidence')

        results = []
        for desc in descriptions_list:
            category, confidence = self.predict_category(desc, method)

            result = {
                'description': desc,
                'category': category
            }

            if confidence is not None:
                result['confidence'] = confidence

            results.append(result)

        return pd.DataFrame(results, columns=columns)


def _find_description_column(df):
    """Return the first column whose name hints at product text, falling
    back to the first column; None when the frame has no columns."""
    for col in df.columns:
        # str() keeps the search working for non-string headers
        if any(x in str(col).lower() for x in ['desc', 'prod', 'item', 'name']):
            return col
    if len(df.columns) > 0:
        return df.columns[0]  # Use first column as fallback
    return None


def _categorize_file(categorizer, file_path, method):
    """Categorize every description in a CSV/Excel file and save the result
    next to it as '<name>_categorized.<ext>'."""
    import os

    root, ext = os.path.splitext(file_path)
    ext = ext.lower()  # accept upper-case extensions such as '.CSV'

    if ext == '.csv':
        df = pd.read_csv(file_path)
    elif ext in ('.xlsx', '.xls'):
        df = pd.read_excel(file_path)
    else:
        print("Unsupported file format. Please use CSV or Excel.")
        return

    desc_col = _find_description_column(df)
    if desc_col is None:
        print("Could not find a suitable column for descriptions")
        return

    print(f"Using column '{desc_col}' for descriptions")
    descriptions = df[desc_col].astype(str).tolist()

    # Predict categories
    results = categorizer.batch_predict(descriptions, method)

    # Combine with original dataframe. Positional assignment keeps rows
    # paired even if the input frame's index is not 0..n-1.
    if 'category' in results.columns:
        df['predicted_category'] = results['category'].to_numpy()
    else:
        df['predicted_category'] = None  # empty input produced no predictions
    if 'confidence' in results.columns:
        df['confidence'] = results['confidence'].to_numpy()

    # Save to file. Recent pandas releases no longer include a writer for
    # the legacy '.xls' format, so such inputs are saved as '.xlsx'.
    if ext == '.csv':
        output_path = root + '_categorized.csv'
        df.to_csv(output_path, index=False)
    else:
        output_path = root + '_categorized.xlsx'
        df.to_excel(output_path, index=False)

    print(f"Results saved to {output_path}")


def main():
    """Interactive prompt for category prediction.

    Commands: 'quit'/'exit'/'q' leave the loop, 'method' toggles between the
    transformer and clustering back ends, 'file' categorizes a whole CSV or
    Excel file; any other non-empty input is classified as a single product
    description.
    """
    # Initialize categorizer
    categorizer = ProductCategorizer()

    # Interactive mode
    print("\nProduct Category Prediction Tool")
    print("================================")
    print("Enter product descriptions to get category predictions.")
    print("Type 'quit', 'exit', or 'q' to exit.")
    print("Type 'file' to load descriptions from a file.")
    print("Type 'method' to switch between transformer and clustering methods.")

    method = 'transformer'
    print(f"Current method: {method}")

    while True:
        print("\nEnter a product description:")
        try:
            user_input = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of crashing
            print("Exiting...")
            break

        command = user_input.lower()

        if command in ('quit', 'exit', 'q'):
            print("Exiting...")
            break

        if command == 'method':
            method = 'clustering' if method == 'transformer' else 'transformer'
            print(f"Switched to {method} method")
            continue

        if command == 'file':
            file_path = input("Enter the path to your CSV or Excel file: ").strip()
            try:
                _categorize_file(categorizer, file_path, method)
            except Exception as e:
                print(f"Error processing file: {e}")
            continue

        if not user_input:
            # A blank line carries nothing to classify
            print("Please enter a non-empty description.")
            continue

        # Process single description
        try:
            category, confidence = categorizer.predict_category(user_input, method)

            print(f"Predicted category: {category}")
            if confidence is not None:
                print(f"Confidence: {confidence:.4f}")

        except Exception as e:
            print(f"Error: {e}")


# Start the interactive prediction tool only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

Models loaded successfully. Using device: cpu
Available categories: Category_0, Category_1, Category_10, Category_11, Category_12, Category_13, Category_14, Category_15, Category_16, Category_17, Category_2, Category_3, Category_4, Category_5, Category_6, Category_7, Category_8, Category_9

Product Category Prediction Tool
Enter product descriptions to get category predictions.
Type 'quit', 'exit', or 'q' to exit.
Type 'file' to load descriptions from a file.
Type 'method' to switch between transformer and clustering methods.
Current method: transformer

Enter a product description:
Predicted category: Category_14
Confidence: 0.9489

Enter a product description:
Predicted category: Category_14
Confidence: 0.9973

Enter a product description:
Predicted category: Category_14
Confidence: 0.9489

Enter a product description:
Predicted category: Category_14
Confidence: 0.9797

Enter a product description:
Predicted category: Category_14
Confidence: 0.9990

Enter a product description:
Predi