<a href="https://colab.research.google.com/github/Vector42-rev/learning_to_make_machine_learn/blob/main/NN_SCB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Unigram frequency table (word -> count).
# Key order is significant: the feature extractors below use each key's
# position as its column index.
vocabulary = {
    "this": 50,
    "is": 45,
    "a": 60,
    "test": 35,
    "sentence": 40,
    "machine": 25,
    "learning": 30,
    "natural": 20,
    "language": 22,
    "processing": 18,
    "deep": 15,
    "neural": 17,
    "network": 16,
    "model": 25,
    "data": 40,
    "science": 22,
    "python": 30,
    "programming": 20,
    "artificial": 15,
    "intelligence": 18,
}

# Bigram frequency table ((first_word, second_word) -> count).
# Key order likewise fixes the bigram feature columns.
bigram_counts = {
    ("this", "is"): 15,
    ("is", "a"): 16,
    ("a", "test"): 14,
    ("machine", "learning"): 10,
    ("natural", "language"): 8,
    ("deep", "neural"): 6,
    ("neural", "network"): 7,
    ("artificial", "intelligence"): 5,
    ("data", "science"): 9,
}

# Feature Extraction Functions
def generate_unigram_features(sentence, vocab):
    """Build a (num_words, len(vocab)) unigram feature matrix.

    Row ``i`` describes the i-th whitespace-separated word of ``sentence``.
    For a known word, the column at that word's position in ``vocab`` holds
    the word's frequency; all other entries stay zero.

    Out-of-vocabulary words put a 1 in the ``'<UNK>'`` column when ``vocab``
    has that key, wherever it sits in the key order. When there is no
    ``'<UNK>'`` key the last column is used, as before; in that case the
    unknown-word marker shares a column with the last vocabulary word.

    Args:
        sentence: Text to featurize; split on whitespace.
        vocab: Mapping word -> frequency. Key order defines column order.

    Returns:
        torch.Tensor with one row per word and one column per vocab entry.
    """
    words = sentence.split()
    features = torch.zeros(len(words), len(vocab))

    # Build word -> column once instead of calling list.index() per word.
    column_of = {word: col for col, word in enumerate(vocab)}
    # Dedicated unknown-word column if present, otherwise the last column.
    unk_column = column_of.get('<UNK>', -1)

    for row, word in enumerate(words):
        col = column_of.get(word)
        if col is not None:
            features[row, col] = vocab[word]
        else:
            features[row, unk_column] = 1

    return features

def generate_bigram_features(sentence, bigram_counts):
    """Build a (num_words, len(bigram_counts)) bigram feature matrix.

    Row ``i`` describes the pair (word i, word i+1). If that pair is a key of
    ``bigram_counts``, the column at the pair's position in the mapping holds
    its count. The last row is always zero because the final word has no
    successor.

    Args:
        sentence: Text to featurize; split on whitespace.
        bigram_counts: Mapping (first_word, second_word) -> count. Key order
            defines column order.

    Returns:
        torch.Tensor with one row per word and one column per known bigram.
    """
    words = sentence.split()
    features = torch.zeros(len(words), len(bigram_counts))

    # Single dict lookup per pair instead of `in list` plus `list.index`.
    column_of = {bigram: col for col, bigram in enumerate(bigram_counts)}

    for row, bigram in enumerate(zip(words, words[1:])):
        col = column_of.get(bigram)
        if col is not None:
            features[row, col] = bigram_counts[bigram]

    return features

# Model
class SegmentationModel(nn.Module):
    """Feed-forward scorer: input_size -> 32 -> 16 -> 1, followed by a sigmoid.

    Each input row is scored independently; the output has the same leading
    dimensions as the input and a trailing dimension of 1.
    """

    def __init__(self, input_size):
        super().__init__()
        widths = (input_size, 32, 16)

        # Hidden stack: Linear + ReLU for each consecutive pair of widths.
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())

        # Output head: one probability-like score per row.
        layers.append(nn.Linear(widths[-1], 1))
        layers.append(nn.Sigmoid())

        self.fc = nn.Sequential(*layers)

    def forward(self, features):
        scores = self.fc(features)
        return scores

# Training Preparation
def prepare_model():
    """Train a SegmentationModel on a small built-in corpus.

    The features for each training sentence are the unigram and bigram
    matrices concatenated column-wise (one row per word). Every target is 1.

    NOTE(review): because all targets are 1, the BCE loss only pushes the
    model's outputs toward 1; no negative examples are seen.

    Returns:
        tuple: ``(model, vocab, bigram_counts, device)`` where ``vocab`` is a
        copy of the module-level ``vocabulary`` with an added ``'<UNK>'``
        entry and ``bigram_counts`` is the module-level table.
    """
    # Work on a copy so the module-level vocabulary is left untouched.
    vocab = dict(vocabulary)
    vocab['<UNK>'] = 1

    # One input column per vocabulary entry plus one per known bigram.
    feature_width = len(vocab) + len(bigram_counts)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SegmentationModel(input_size=feature_width).to(device)

    corpus = [
        "this is a test",
        "this is a sentence",
        "machine learning is good",
        "natural language processing is complex",
        "deep neural network model",
        "artificial intelligence advances quickly",
        "data science with python"
    ]

    # (space-free text, original text) pairs; only the first five are trained on.
    train_data = [(text.replace(" ", ""), text) for text in corpus][:5]

    class MyDataset(Dataset):
        """Yields per-sentence feature matrices with all-ones targets."""

        def __init__(self, data, vocab, bigram_counts):
            self.data = data
            self.vocab = vocab
            self.bigram_counts = bigram_counts
            self.vocab_list = list(vocab)
            self.bigram_list = list(bigram_counts)

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            concatenated, sentence = self.data[idx]
            parts = [
                generate_unigram_features(sentence, self.vocab),
                generate_bigram_features(sentence, self.bigram_counts),
            ]
            return {
                "features": torch.cat(parts, dim=1),
                "target": torch.ones(len(sentence.split()), 1),
                "concatenated": concatenated
            }

    loader = DataLoader(
        MyDataset(train_data, vocab, bigram_counts),
        batch_size=1,
        shuffle=True,
    )

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Nothing switches the model to eval mode during training, so one call suffices.
    model.train()
    for _ in range(10):
        for batch in loader:
            inputs = batch["features"].float().to(device)
            labels = batch["target"].float().to(device)

            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

    return model, vocab, bigram_counts, device

# Segmentation Function
def segment_sentence(sentence, model, vocab, bigram_counts, device, threshold=0.5):
    """Split the space-free form of ``sentence`` at model-predicted word ends.

    The model scores one row per whitespace-separated word of ``sentence``.
    A score above ``threshold`` for word ``i`` places a break after that
    word. Word indices are converted to character offsets (the running sum
    of word lengths) before slicing; the previous code used the word index
    directly as a character offset, which cut the text after its first
    characters.

    Args:
        sentence: Input text; features are built from its whitespace tokens.
        model: Callable module mapping a (num_words, num_features) tensor to
            (num_words, 1) scores. It is switched to eval mode.
        vocab: Vocabulary passed to ``generate_unigram_features``.
        bigram_counts: Table passed to ``generate_bigram_features``.
        device: Device the feature tensor is moved to.
        threshold: Minimum score (exclusive) for a break.

    Returns:
        list[str]: Consecutive slices of the space-free text. Empty when
        ``sentence`` contains no tokens.
    """
    words = sentence.split()
    # Join the same tokens the features are built from so offsets line up
    # even when the separators are tabs or repeated spaces.
    concatenated = "".join(words)

    # Prepare model for inference
    model.eval()

    if not words:
        return []

    # Prepare features (one row per word)
    unigram_features = generate_unigram_features(sentence, vocab)
    bigram_features = generate_bigram_features(sentence, bigram_counts)
    features = torch.cat([unigram_features, bigram_features], dim=1).float().to(device)

    # Get one score per word
    with torch.no_grad():
        probabilities = model(features).cpu().flatten().tolist()

    # Walk the words, tracking the character offset where each one ends.
    segmented_words = []
    start = 0
    end = 0
    for word, prob in zip(words, probabilities):
        end += len(word)
        if prob > threshold:
            segmented_words.append(concatenated[start:end])
            start = end

    # Whatever follows the last predicted break forms the final segment.
    if start < len(concatenated):
        segmented_words.append(concatenated[start:])

    return segmented_words

# Train once when this cell/module runs. The four names bound here are the
# module-level state that predict_segmentation() reads.
trained_model, vocab, bigram_counts, device = prepare_model()

# Example usage function
def predict_segmentation(input_sentence):
    """Segment ``input_sentence`` with the module-level model and print it.

    Uses the module-level ``trained_model``, ``vocab``, ``bigram_counts`` and
    ``device``.

    Returns:
        The list of segments produced by ``segment_sentence``.
    """
    segmented_result = segment_sentence(
        input_sentence, trained_model, vocab, bigram_counts, device
    )

    # One print call, two lines: the input followed by its segmentation.
    print(
        f"Original Sentence: {input_sentence}",
        f"Segmented Result: {segmented_result}",
        sep="\n",
    )
    return segmented_result

# Demonstration
if __name__ == "__main__":
    # Demo inputs. None of them contains a space, so the whitespace split
    # inside segment_sentence() yields a single token for each input.
    test_sentences = [
        "thisisatest",
        "artificialintelligenceadvancesquickly",
        "datasciencewithpython"
    ]

    # Prints the original and segmented form of each input.
    for sentence in test_sentences:
        predict_segmentation(sentence)

Original Sentence: thisisatest
Segmented Result: ['t', 'hisisatest']
Original Sentence: artificialintelligenceadvancesquickly
Segmented Result: ['a', 'rtificialintelligenceadvancesquickly']
Original Sentence: datasciencewithpython
Segmented Result: ['d', 'atasciencewithpython']


In [None]:
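# NOTE: THIS DOESN'T WORK CORRECTLY. The training sentences below contain no
# word boundaries, so every position after the first is labelled as a break
# and the model ends up splitting after every character (see the output).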


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Define the LSTM-based model
class SegmentationLSTM(nn.Module):
    """Batch-first LSTM tagger producing one sigmoid score per time step.

    Input:  (batch, seq_len, input_size)
    Output: (batch, seq_len, 1), each value in (0, 1)
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Only the per-step hidden states are needed; the final (h, c) is dropped.
        hidden_states, _ = self.lstm(x)
        return self.sigmoid(self.fc(hidden_states))

# Dataset class
class SegmentationDataset(Dataset):
    """Character-level dataset of one-hot inputs and per-character break labels.

    Each item is ``(input_features, target)``:
      * ``input_features``: (num_chars, len(vocab)); row ``i`` is the one-hot
        encoding of character ``i``. Characters missing from ``vocab`` use
        the ``'<UNK>'`` index.
      * ``target``: (num_chars, 1); ``target[i] == 1`` marks a break after
        character ``i``.

    Two kinds of sentence are accepted:
      * With whitespace (e.g. ``"this is"``): the whitespace is removed from
        the input characters and the last character of every word is
        labelled 1, everything else 0.
      * Without whitespace: there is no boundary information, so the original
        placeholder labelling is kept (every position after the first is 1).

    Args:
        sentences: Sequence of strings.
        vocab: Mapping character -> column index; needs ``'<UNK>'`` whenever
            a sentence contains a character that is not a key.
    """

    def __init__(self, sentences, vocab):
        self.sentences = sentences
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]

        if any(ch.isspace() for ch in sentence):
            # Annotated sentence: derive labels from the word boundaries.
            words = sentence.split()
            chars = "".join(words)
            target = torch.zeros((len(chars), 1))
            end = 0
            for word in words:
                end += len(word)
                target[end - 1] = 1  # last character of this word
        else:
            # Unannotated sentence: keep the legacy placeholder labels.
            chars = sentence
            target = torch.zeros((len(chars), 1))
            target[1:] = 1

        input_features = torch.zeros((len(chars), len(self.vocab)))
        for i, char in enumerate(chars):
            if char in self.vocab:
                input_features[i, self.vocab[char]] = 1
            else:
                input_features[i, self.vocab['<UNK>']] = 1  # Unknown character handling

        return input_features, target

# Prepare the vocabulary: one column index per lowercase ASCII letter (0-25)
vocab = {char: idx for idx, char in enumerate('abcdefghijklmnopqrstuvwxyz')}
vocab['<UNK>'] = len(vocab)  # Next free index (26) is reserved for unknown characters

# Training data. These strings contain no spaces, i.e. they carry no
# word-boundary information.
corpus = [
    "thisisatest",
    "artificialintelligenceadvancesquickly",
    "datasciencewithpython"
]

# Model initialization: the input width equals the one-hot size (27 columns)
input_size = len(vocab)
hidden_size = 64
num_layers = 2
model = SegmentationLSTM(input_size, hidden_size, num_layers)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare the dataset and dataloader.
# batch_size=1 means sentences of different lengths never have to be padded
# into a common batch.
dataset = SegmentationDataset(corpus, vocab)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Training setup: the model already ends in a sigmoid, so plain BCELoss is used
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):  # Reduce epochs if needed
    model.train()
    for inputs, targets in dataloader:
        inputs = inputs.float().to(device)
        targets = targets.float().to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # NOTE: `loss` here is the loss of the last batch of the epoch only,
    # not an average over the epoch.
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Prediction function
def segment_sentence(sentence, model, vocab, device, threshold=0.5):
    """Split ``sentence`` after every character whose score exceeds ``threshold``.

    The sentence is one-hot encoded per character (unknown characters use the
    ``'<UNK>'`` index), run through ``model`` as a batch of one, and cut after
    each position ``i`` with ``score[i] > threshold``.

    Args:
        sentence: Text to segment, character by character.
        model: Module mapping (1, num_chars, len(vocab)) to
            (1, num_chars, 1) scores. It is switched to eval mode.
        vocab: Mapping character -> column index, including ``'<UNK>'`` if
            unknown characters can occur.
        device: Device the input tensor is moved to.
        threshold: Minimum score (exclusive) for a break.

    Returns:
        list[str]: The segments in order; ``[]`` for an empty sentence.
    """
    model.eval()

    # An empty string would reach the recurrent layer as a zero-length
    # sequence; there is nothing to segment, so answer directly.
    if not sentence:
        return []

    input_features = torch.zeros((len(sentence), len(vocab)))
    for i, char in enumerate(sentence):
        if char in vocab:
            input_features[i, vocab[char]] = 1
        else:
            input_features[i, vocab['<UNK>']] = 1

    # Add the batch dimension expected by the batch-first model.
    input_features = input_features.unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(input_features).cpu().flatten().tolist()

    segmented_words = []
    start = 0
    for i, prob in enumerate(output):
        if prob > threshold:
            # Break after character i: the segment ends at offset i + 1.
            segmented_words.append(sentence[start:i + 1])
            start = i + 1

    # Trailing characters after the last break form the final segment.
    if start < len(sentence):
        segmented_words.append(sentence[start:])

    return segmented_words

# Demonstration
if __name__ == "__main__":
    # These are the same three strings the model was trained on above, so
    # this demo shows behaviour on training data only.
    test_sentences = [
        "thisisatest",
        "artificialintelligenceadvancesquickly",
        "datasciencewithpython"
    ]

    # Print each input next to the segments the model produces for it.
    for sentence in test_sentences:
        result = segment_sentence(sentence, model, vocab, device)
        print(f"Original: {sentence}")
        print(f"Segmented: {result}")


Epoch 1, Loss: 0.6578748822212219
Epoch 2, Loss: 0.6249921917915344
Epoch 3, Loss: 0.6026002168655396
Epoch 4, Loss: 0.5199169516563416
Epoch 5, Loss: 0.5057605504989624
Original: thisisatest
Segmented: ['t', 'h', 'i', 's', 'i', 's', 'a', 't', 'e', 's', 't']
Original: artificialintelligenceadvancesquickly
Segmented: ['a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', 'i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', 'a', 'd', 'v', 'a', 'n', 'c', 'e', 's', 'q', 'u', 'i', 'c', 'k', 'l', 'y']
Original: datasciencewithpython
Segmented: ['d', 'a', 't', 'a', 's', 'c', 'i', 'e', 'n', 'c', 'e', 'w', 'i', 't', 'h', 'p', 'y', 't', 'h', 'o', 'n']


In [None]:
#small corpus

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Toy frequency tables. Key order defines the feature column order.
vocabulary = {
    "this": 10,
    "is": 15,
    "a": 20,
    "test": 8,
    "sentence": 5,
}
bigram_counts = {
    ("this", "is"): 5,
    ("is", "a"): 6,
    ("a", "test"): 4,
}
corpus = ["this is a test", "this is a sentence"]

# (space-free text, original text) pairs derived from the corpus
data = [(text.replace(" ", ""), text) for text in corpus]

# First example trains the model, the second one evaluates it
train_data, test_data = data[:1], data[1:]

# Feature Extraction Functions
def generate_unigram_features(sentence, vocab):
    """Create a (num_words, len(vocab)) unigram feature matrix.

    Row ``i`` describes the i-th whitespace-separated word of ``sentence``;
    the column at that word's position in ``vocab`` holds its frequency.

    Words that are not in ``vocab`` no longer raise ``ValueError``. They put
    a 1 in the ``'<UNK>'`` column if ``vocab`` has that key, and otherwise
    leave their row all zeros.

    Args:
        sentence: Text to featurize; split on whitespace.
        vocab: Mapping word -> frequency. Key order defines column order.

    Returns:
        torch.Tensor with one row per word and one column per vocab entry.
    """
    words = sentence.split()
    features = torch.zeros(len(words), len(vocab))

    # Build word -> column once instead of rebuilding the key list per word.
    column_of = {word: col for col, word in enumerate(vocab)}
    unk_column = column_of.get('<UNK>')

    for row, word in enumerate(words):
        col = column_of.get(word)
        if col is not None:
            features[row, col] = vocab[word]  # Frequency from vocabulary
        elif unk_column is not None:
            features[row, unk_column] = 1
    return features

def generate_bigram_features(sentence, bigram_counts):
    """Create a (num_words, len(bigram_counts)) bigram feature matrix.

    Row ``i`` describes the pair (word i, word i+1). When that pair is a key
    of ``bigram_counts``, the column at the pair's position in the mapping
    holds its count. The final row stays zero because the last word has no
    successor.

    Args:
        sentence: Text to featurize; split on whitespace.
        bigram_counts: Mapping (first_word, second_word) -> count. Key order
            defines column order.

    Returns:
        torch.Tensor with one row per word and one column per known bigram.
    """
    words = sentence.split()
    features = torch.zeros(len(words), len(bigram_counts))

    # Pair -> column, built once; avoids two list scans per bigram.
    column_of = {pair: col for col, pair in enumerate(bigram_counts)}

    for row in range(len(words) - 1):
        pair = (words[row], words[row + 1])
        if pair in column_of:
            features[row, column_of[pair]] = bigram_counts[pair]
    return features

# Dataset
class MyDataset(Dataset):
    """Wraps (concatenated, sentence) pairs as per-word feature/target items.

    Each item is a dict with:
      * "features": unigram and bigram matrices joined column-wise
      * "target": a (num_words, 1) tensor of ones
      * "concatenated": the space-free text from the pair
    """

    def __init__(self, data, vocab, bigram_counts):
        self.data = data
        self.vocab = vocab
        self.bigram_counts = bigram_counts
        # Key orderings kept as attributes.
        self.vocab_list = list(vocab)
        self.bigram_list = list(bigram_counts)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        concatenated, sentence = self.data[idx]

        parts = [
            generate_unigram_features(sentence, self.vocab),
            generate_bigram_features(sentence, self.bigram_counts),
        ]
        num_words = len(sentence.split())

        return {
            "features": torch.cat(parts, dim=1),
            # One label per word, shape [seq_len, 1]
            "target": torch.ones(num_words, 1),
            "concatenated": concatenated
        }

# Model
class SegmentationModel(nn.Module):
    """Single-hidden-layer scorer: input_size -> 16 -> 1, then a sigmoid."""

    def __init__(self, input_size):
        super().__init__()
        hidden = nn.Linear(input_size, 16)
        head = nn.Linear(16, 1)
        # The sigmoid lives inside the model, so outputs are already in (0, 1).
        self.fc = nn.Sequential(hidden, nn.ReLU(), head, nn.Sigmoid())

    def forward(self, features):
        scores = self.fc(features)
        return scores

# Hyperparameters
batch_size = 1
num_epochs = 5
learning_rate = 0.001

# Calculate total feature size: one column per vocabulary word plus one per
# known bigram, matching the column-wise concatenation done in MyDataset
vocab_size = len(vocabulary)
bigram_size = len(bigram_counts)
total_feature_size = vocab_size + bigram_size

# Dataset and DataLoader (a single training example and a single test example)
train_dataset = MyDataset(train_data, vocabulary, bigram_counts)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = MyDataset(test_data, vocabulary, bigram_counts)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model, Loss, and Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SegmentationModel(input_size=total_feature_size).to(device)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss; the model already applies the sigmoid
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training Loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    epoch_loss = 0.0
    for batch in train_dataloader:
        features = batch["features"].float().to(device)
        target = batch["target"].float().to(device)

        optimizer.zero_grad()
        output = model(features)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Accumulate per-batch losses so the epoch average can be reported
        epoch_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_dataloader):.4f}")

# Testing Loop
# NOTE: the targets produced by MyDataset are all ones, so this loss only
# measures how close the model's outputs are to 1 on the held-out example.
model.eval()  # Set model to evaluation mode
with torch.no_grad():  # No gradients are needed for evaluation
    total_loss = 0.0
    for batch in test_dataloader:
        features = batch["features"].float().to(device)
        target = batch["target"].float().to(device)

        output = model(features)
        loss = criterion(output, target)
        total_loss += loss.item()

    print(f"Test Loss: {total_loss/len(test_dataloader):.4f}")

Epoch [1/5], Loss: 1.7484
Epoch [2/5], Loss: 1.7189
Epoch [3/5], Loss: 1.6895
Epoch [4/5], Loss: 1.6603
Epoch [5/5], Loss: 1.6314
Test Loss: 1.4445
