# Training a Sparse Autoencoder on TF-IDF Vectors

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Define Sparse Autoencoder
class SparseAutoencoder(nn.Module):
    """Single-hidden-layer autoencoder with a KL-divergence sparsity penalty.

    The encoder is a linear layer followed by ReLU; the decoder is a linear
    layer mapping the hidden code back to the input dimension.

    Args:
        input_dim: Size of each input vector.
        hidden_dim: Number of hidden (code) units.
        sparsity_lambda: Weight applied to the sparsity penalty.
        sparsity_target: Desired mean activation (rho) of each hidden unit;
            must lie strictly between 0 and 1.
        eps: Margin used to keep the measured mean activation inside
            (0, 1) so the logarithms in the penalty stay finite. It must be
            larger than float32 machine epsilon, otherwise ``1 - eps``
            rounds to 1.0 and the guard has no effect.

    Raises:
        ValueError: If ``sparsity_target`` is not strictly inside (0, 1).
    """

    def __init__(self, input_dim, hidden_dim, sparsity_lambda=1e-3,
                 sparsity_target=0.05, eps=1e-6):
        super().__init__()
        if not 0.0 < sparsity_target < 1.0:
            raise ValueError(
                f"sparsity_target must be in (0, 1), got {sparsity_target}"
            )
        # Encoder layer: maps the input to the hidden code
        self.encoder = nn.Linear(input_dim, hidden_dim)
        # Decoder layer: reconstructs the input from the hidden code
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.sparsity_lambda = sparsity_lambda  # Regularization strength
        self.sparsity_target = sparsity_target  # Desired average activation
        self.eps = eps

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        # ReLU keeps the hidden code non-negative
        encoded = torch.relu(self.encoder(x))
        # Linear decode back to input space (no output activation)
        decoded = self.decoder(encoded)
        return encoded, decoded

    def sparsity_loss(self, encoded):
        """Return the weighted KL sparsity penalty for a batch of codes.

        For every hidden unit the mean activation over the batch (dim 0),
        rho_hat, is compared with the target rho via
        ``rho*log(rho/rho_hat) + (1-rho)*log((1-rho)/(1-rho_hat))``; the
        per-unit terms are summed and scaled by ``sparsity_lambda``.

        ReLU activations are not bounded to (0, 1): a dead unit yields
        rho_hat == 0 and a strongly active one can exceed 1, which would
        make the unclamped formula return inf or NaN. rho_hat is therefore
        clamped to ``[eps, 1 - eps]`` first. Note that a clamped unit
        contributes a constant (zero-gradient) penalty term.
        """
        rho = self.sparsity_target
        rho_hat = torch.mean(encoded, dim=0).clamp(
            min=self.eps, max=1.0 - self.eps
        )
        kl_per_unit = (
            rho * torch.log(rho / rho_hat)
            + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))
        )
        return self.sparsity_lambda * torch.sum(kl_per_unit)

In [3]:
# Training configuration
# -- optimisation schedule --
epochs = 100      # Number of full-batch passes over the data
lr = 1e-2         # Step size handed to the optimizer
# -- network dimensions --
input_dim = 100   # Length of each input vector (TF-IDF feature count)
hidden_dim = 20   # Number of units in the hidden (code) layer

In [29]:
# Sample text data: a toy corpus of 52 one-sentence documents on
# machine-learning topics. Each string is treated as one document when the
# corpus is vectorized.
corpus = [
    "Machine learning is amazing",
    "Deep learning is a subset of machine learning",
    "Neural networks are powerful models",
    "Autoencoders are used for unsupervised learning",
    "Sparse autoencoders enforce sparsity constraints",
    "Artificial intelligence is transforming industries",
    "Data science involves statistics and programming",
    "Natural language processing enables machines to understand text",
    "Reinforcement learning is useful for decision making",
    "Gradient descent is an optimization algorithm",
    "Feature engineering improves machine learning models",
    "Convolutional neural networks are used for image processing",
    "Supervised learning uses labeled data",
    "Unsupervised learning finds hidden patterns in data",
    "Generative adversarial networks create new data",
    "Support vector machines are powerful classifiers",
    "Random forests are robust machine learning models",
    "XGBoost is a popular gradient boosting algorithm",
    "Overfitting occurs when a model learns noise",
    "Underfitting happens when a model is too simple",
    "Cross-validation helps assess model performance",
    "Dimensionality reduction reduces the number of features",
    "Principal component analysis is used for dimensionality reduction",
    "K-means clustering groups data points into clusters",
    "K-nearest neighbors is a simple classification algorithm",
    "Decision trees are interpretable machine learning models",
    "Bagging improves model stability by training multiple models",
    "Boosting combines weak learners to create a strong learner",
    "Hyperparameter tuning improves model performance",
    "Data normalization scales features to a similar range",
    "One-hot encoding is used for categorical variables",
    "Transfer learning leverages pre-trained models",
    "Recurrent neural networks are used for sequence data",
    "Long short-term memory networks handle long-range dependencies",
    "Word embeddings represent words as vectors",
    "Attention mechanisms help models focus on important parts of input",
    "BERT is a transformer-based model for NLP",
    "GPT is a large language model for text generation",
    "Explainable AI aims to make models more transparent",
    "Clustering is an unsupervised learning technique",
    "Model evaluation metrics include accuracy, precision, and recall",
    "Data preprocessing is crucial for machine learning success",
    "Outliers can negatively affect model performance",
    "Ensemble methods combine predictions from multiple models",
    "SVMs can handle both classification and regression tasks",
    "Reinforcement learning agents learn by interacting with environments",
    "Neural architecture search automates the design of neural networks",
    "Model interpretability helps to understand decision-making",
    "Self-supervised learning generates labels from the data itself",
    "Data augmentation creates additional training data",
    "Convolutional layers detect local patterns in images",
    "Batch normalization improves training speed and stability"
]

In [30]:
# Build the TF-IDF design matrix and hand it to PyTorch.
# The vocabulary is capped at `input_dim` terms via `max_features`.
# NOTE(review): `max_features` is only an upper bound; presumably the corpus
# has at least `input_dim` distinct terms so the matrix width matches the
# model's input size -- confirm if the corpus changes.
vectorizer = TfidfVectorizer(max_features=input_dim)
tfidf_sparse = vectorizer.fit_transform(corpus)  # sparse matrix, one row per document
X_tfidf = tfidf_sparse.toarray()                 # dense NumPy array
X = torch.tensor(X_tfidf, dtype=torch.float32)   # float32 copy for the model

In [31]:
# Build the three training components: network, reconstruction loss, optimizer.
model = SparseAutoencoder(input_dim=input_dim, hidden_dim=hidden_dim)
# Reconstruction error: element-wise squared error averaged over all entries
criterion = nn.MSELoss(reduction='mean')
# Adam updates every trainable parameter of the model with step size `lr`
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [32]:
# Training loop: full-batch optimisation of reconstruction + sparsity loss.
#
# The shape handling is done once, up front. Previously the reshape ran inside
# the loop *after* the forward pass, so it was repeated every epoch and could
# never protect `model(X)` from a badly shaped input.
if X.shape[-1] != input_dim:
    # Fail with a clear message instead of letting view() silently regroup
    # elements of a matrix whose width does not match the model.
    raise ValueError(
        f'Expected last dimension of X to be {input_dim}, got {tuple(X.shape)}'
    )
X = X.view(-1, input_dim)  # Flatten any leading dims to (n_samples, input_dim)

for epoch in range(epochs):
    optimizer.zero_grad()  # Reset gradients accumulated by the previous step
    encoded, decoded = model(X)  # Forward pass
    # Total loss = reconstruction error + weighted sparsity penalty
    loss = criterion(decoded, X) + model.sparsity_loss(encoded)
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')  # Print progress

Epoch 0, Loss: 0.0289
Epoch 10, Loss: 0.0092
Epoch 20, Loss: 0.0076
Epoch 30, Loss: 0.0064
Epoch 40, Loss: 0.0054
Epoch 50, Loss: 0.0049
Epoch 60, Loss: 0.0045
Epoch 70, Loss: 0.0043
Epoch 80, Loss: 0.0041
Epoch 90, Loss: 0.0040
