In [None]:
# Mount Google Drive at /content/drive/ so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:

from datasets import load_dataset
import numpy as np
import re
import copy
import matplotlib.pyplot as plt
from collections import Counter

class NexxtWordPredictor:
    """Build next-word-prediction training pairs from whitespace-tokenised sentences.

    For every sentence, each non-empty proper prefix of its tokens becomes one
    fixed-length input sequence and the token following that prefix becomes
    the matching target. Pairs accumulate in ``input_sequences`` and
    ``output_sequences``; a frequency-filtered word -> index map is kept in
    ``source_vocab``.

    Input sequences hold the raw string tokens (not vocabulary indices)
    followed by ``padding_token`` fill values.
    """

    def __init__(self, dataset_path=None, batch_size=32, max_sequence_length=50, padding_token=0, seed=42):
        """Store the configuration and, when a dataset is given, load it immediately.

        Args:
            dataset_path: Despite the name, this is forwarded unchanged to
                ``load_custom_dataset`` and is therefore expected to be an
                iterable of sentence strings. Falsy values load nothing.
            batch_size: Stored on the instance; not used by this class itself.
            max_sequence_length: Exact length of every generated input sequence.
            padding_token: Value appended to short sequences to reach
                ``max_sequence_length``.
            seed: Stored on the instance; not used by this class itself.
        """
        self.dataset_path = dataset_path
        self.batch_size = batch_size
        self.max_sequence_length = max_sequence_length
        self.padding_token = padding_token
        self.seed = seed
        # Initialised up front so build_vocab() / generate_input_output_sequences()
        # are safe to call even before any dataset has been loaded.
        self.sentences = []
        self.input_sequences = []
        self.output_sequences = []
        self.source_vocab = {}

        if dataset_path:
            self.load_custom_dataset(dataset_path)
        else:
            print("No dataset provided. Please load a dataset using load_custom_dataset() method.")

    def __repr__(self):
        """Return a short summary instead of the default memory-address repr."""
        return (
            f"{type(self).__name__}("
            f"sentences={len(self.sentences)}, "
            f"vocab_size={len(self.source_vocab)}, "
            f"sequences={len(self.input_sequences)}, "
            f"max_sequence_length={self.max_sequence_length})"
        )

    def load_custom_dataset(self, sentences):
        """Load sentences, extend the vocabulary and append their training pairs.

        Args:
            sentences: Iterable of sentence strings. A single bare string is
                treated as one sentence; iterating it directly would split it
                into individual characters.

        Calling this more than once replaces ``self.sentences`` but keeps the
        vocabulary and the already generated sequences, so new pairs are
        appended to the existing ones.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        self.sentences = [sentence.strip() for sentence in sentences]
        self.build_vocab()
        self.generate_input_output_sequences()

    def tokenize_sentence(self, sentence):
        """Split ``sentence`` on runs of whitespace and return the token list."""
        return sentence.split()

    def build_vocab(self):
        """Add sufficiently frequent words of ``self.sentences`` to ``source_vocab``.

        A word is added only when it occurs strictly more than
        ``min_token_freq`` times across the loaded sentences, so with the
        threshold of 1 used here, words seen exactly once are left out.
        Indices are assigned in first-seen order, continuing from the current
        vocabulary size.
        """
        word_counts = Counter(
            word
            for sentence in self.sentences
            for word in self.tokenize_sentence(sentence)
        )

        min_token_freq = 1  # Adjust as needed
        for word, count in word_counts.items():
            if count > min_token_freq and word not in self.source_vocab:
                # NOTE: the first index handed out is 0, which coincides with
                # the default padding_token of 0.
                self.source_vocab[word] = len(self.source_vocab)

    def pad_sequence(self, sequence):
        """Return ``sequence`` as a list of exactly ``max_sequence_length`` items.

        Shorter sequences are right-padded with ``self.padding_token``.
        Longer sequences keep only their last ``max_sequence_length`` items
        (the tokens closest to the word being predicted), so every generated
        row has the same length and the rows can be stacked into one array.
        """
        overflow = len(sequence) - self.max_sequence_length
        if overflow > 0:
            sequence = sequence[overflow:]
        missing = self.max_sequence_length - len(sequence)
        return sequence + [self.padding_token] * missing

    def generate_input_output_sequences(self):
        """Append one (padded prefix, next word) pair per token position.

        For a sentence with tokens ``t0 .. tn`` this appends the inputs
        ``[t0]``, ``[t0, t1]``, ... (each passed through ``pad_sequence``) and
        the targets ``t1``, ``t2``, ... Sentences with fewer than two tokens
        contribute nothing.
        """
        for sentence in self.sentences:
            tokens = self.tokenize_sentence(sentence)
            for i in range(1, len(tokens)):
                self.input_sequences.append(self.pad_sequence(tokens[:i]))
                self.output_sequences.append(tokens[i])

# Toy corpus: a single whitespace-separated sentence used to exercise the predictor.
custom_dataset = ["This is Team Transformer from scratch we have made a model from scratch ."]

# Handing the sentences to the constructor loads them right away, so the
# vocabulary and the input/output sequences are ready once this line returns.
next_word_predictor = NexxtWordPredictor(dataset_path=custom_dataset)
print(next_word_predictor)

# Materialise the accumulated training pairs as NumPy arrays for the cells below.
input_sequences, output_sequences = (
    np.array(next_word_predictor.input_sequences),
    np.array(next_word_predictor.output_sequences),
)

<__main__.NexxtWordPredictor object at 0x7b187430ebf0>


In [None]:
# Show the padded input sequences. The padding value prints as the string '0'
# because each row mixes str tokens with int padding, which NumPy stores as a single string array.
print(input_sequences)

[['This' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['This' 'is' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['This' 'is' 'Team' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
 ['This' 'is' 'Team' 'Transformer' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
  '0']
 ['This' 'is' 'Team' 'Transformer' 'from' '0' '0' '0' '0' '0' '0' '0' '0'
  '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'