<a href="https://colab.research.google.com/github/Papa-Panda/random_thoughts/blob/main/ML_interivew_FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1

In [3]:
# 3 Feature hashing

In [14]:
import torch
import torch.nn as nn

# Example data
data = [
    {'feature1': 'cat', 'feature2': 'red'},
    {'feature1': 'dog', 'feature2': 'blue'},
    {'feature1': 'bird', 'feature2': 'green'},
]

# Define the hash function
def feature_hashing(text, num_buckets):
    return hash(text) % num_buckets

# Get unique feature values
feature1_values = set(sample['feature1'] for sample in data)
feature2_values = set(sample['feature2'] for sample in data)

# Define the number of hash buckets
num_buckets = 5

# Create a mapping of feature values to hash buckets
feature1_hash_map = {value: feature_hashing(value, num_buckets) for value in feature1_values}
feature2_hash_map = {value: feature_hashing(value, num_buckets) for value in feature2_values}

# Convert the data into hashed features
hashed_data = [
    {'feature1': feature1_hash_map[sample['feature1']], 'feature2': feature2_hash_map[sample['feature2']]}
    for sample in data
]

# Convert the data into PyTorch tensors
feature1_tensor = torch.tensor([sample['feature1'] for sample in hashed_data], dtype=torch.long)
feature2_tensor = torch.tensor([sample['feature2'] for sample in hashed_data], dtype=torch.long)

# Create a tensor of indices for the EmbeddingBag layer
indices = torch.tensor([0, 1, 2])  # Assuming three samples in the data

# Concatenate the tensors along the second dimension
input_tensor = torch.cat([feature1_tensor.unsqueeze(1), feature2_tensor.unsqueeze(1)], dim=1)

# Define the embedding layer
embedding_layer = nn.EmbeddingBag(num_buckets, embedding_dim=5, sparse=True)

# # Forward pass
# output = embedding_layer(input_tensor, indices)

# Print the result
print("Input Tensor:")
print(input_tensor)
# print("\nOutput Tensor:")
# print(output)

Input Tensor:
tensor([[4, 1],
        [4, 1],
        [1, 0]])


In [2]:
# Normalized Cross Entropy

In [4]:
import torch
import torch.nn.functional as F

class NormalizedCrossEntropyLoss(torch.nn.Module):
    def __init__(self, num_classes):
        super(NormalizedCrossEntropyLoss, self).__init__()
        self.num_classes = num_classes

    def forward(self, logits, targets):
        """
        :param logits: Raw scores from the model (before softmax)
        :param targets: Ground truth class indices
        :return: Normalized Cross Entropy loss
        """
        # Apply softmax to the logits
        probabilities = F.softmax(logits, dim=1)

        # Create a one-hot encoding of the targets
        one_hot_targets = F.one_hot(targets, num_classes=self.num_classes).float()

        # Calculate the log probabilities for the true classes
        log_probabilities = torch.log(probabilities + 1e-10)  # Adding a small epsilon to avoid numerical instability

        # Calculate the cross-entropy loss
        cross_entropy_loss = -torch.sum(log_probabilities * one_hot_targets, dim=1)

        # Calculate the normalization term
        normalization_term = -torch.sum(one_hot_targets * log_probabilities, dim=1)

        # Normalize the cross-entropy loss
        normalized_cross_entropy_loss = cross_entropy_loss / normalization_term

        # Take the mean over the batch
        mean_loss = torch.mean(normalized_cross_entropy_loss)

        return mean_loss

# Example usage:
num_classes = 5
logits = torch.randn(32, num_classes)  # Assuming a batch size of 32
targets = torch.randint(0, num_classes, (32,))

nce_loss = NormalizedCrossEntropyLoss(num_classes)
loss = nce_loss(logits, targets)

print("Normalized Cross Entropy Loss:", loss.item())


Normalized Cross Entropy Loss: 1.0
