# Transformer Based Model for Chat-Bot
Using Pytorch
  

In [32]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [33]:
!nvidia-smi

Thu Mar 27 00:46:58 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.36                 Driver Version: 566.36         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   41C    P8              1W /   60W |    1969MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [34]:
import warnings
warnings.filterwarnings('ignore')

### Import Statements

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import re

### Set Huperparameters for model

- MAX_LEN = 40: The maximum length of input/output sequences (in tokens) for the model.
- BATCH_SIZE = 64: The number of training samples processed in one forward/backward pass.
- NUM_HEADS = 8: The number of attention heads in the multi-head attention mechanism.
- D_MODEL = 512: The dimensionality of the model’s hidden layer representations.
- FFN_UNITS = 2048: The number of units in the feed-forward neural network after each attention layer.
- DROPOUT = 0.1: The fraction of units to drop during training to prevent overfitting.
- NUM_LAYERS = 4: The number of layers in the encoder and decoder of the Transformer.
- EPOCHS = 300: The number of full passes through the training dataset during training.
- VOCAB_SIZE = 8000: The number of unique tokens in the model's vocabulary.


In [36]:
# Hyperparameters
MAX_LEN = 40
BATCH_SIZE = 64
NUM_HEADS = 8
D_MODEL = 512
FFN_UNITS = 2048
DROPOUT = 0.1
NUM_LAYERS = 4
EPOCHS = 110
VOCAB_SIZE = 8000

#### Setup decide for training

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


#### load CSV file

In [38]:
file_path = 'c:/Users/sabhi/Downloads/Cleaned_Wyckoff_QA_Dataset.csv'
data = pd.read_csv(file_path)
data.columns = data.columns.str.strip()

In [39]:
print(data.columns.tolist())

['Question', 'Answer']


In [40]:
questions = data['Question'].astype(str).tolist()
answers = data['Answer'].astype(str).tolist()

In [41]:
print(questions[1])
print(answers[1])

What is an Upthrust (UT)?
An Upthrust is a false breakout to the upside during distribution, designed to trap breakout traders before the price falls.


### Custom Tokenizer to keepp track of vocab and word to index

In [42]:
class CustomTokenizer:
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
        self.word2idx = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.idx2word = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.word_count = {}

    def fit_on_texts(self, texts):
        for text in texts:
            for word in text.split():
                self.word_count[word] = self.word_count.get(word, 0) + 1
        sorted_vocab = sorted(self.word_count.items(), key=lambda x: x[1], reverse=True)[:self.vocab_size - 4]
        for idx, (word, _) in enumerate(sorted_vocab, start=4):
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            seq = [self.word2idx.get(word, self.word2idx["<unk>"]) for word in text.split()]
            sequences.append(seq)
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for seq in sequences:
            text = " ".join([self.idx2word.get(idx, "<unk>") for idx in seq])
            texts.append(text)
        return texts


In [43]:
tokenizer = CustomTokenizer(VOCAB_SIZE)
tokenizer.fit_on_texts(questions + answers)

### Initialize the dataset by converting the questions and answers to sequences of integers.

In [44]:
class ChatDataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_len):
        self.questions = tokenizer.texts_to_sequences(questions)
        self.answers = tokenizer.texts_to_sequences(answers)
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]

        question = [1] + question[:self.max_len - 2] + [2]
        answer = [1] + answer[:self.max_len - 2] + [2]

        question = question + [0] * (self.max_len - len(question))
        answer = answer + [0] * (self.max_len - len(answer))

        return torch.tensor(question), torch.tensor(answer)

### Compute the scaled dot-product attention.

In [45]:
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = torch.matmul(q, k.transpose(-2, -1))
    dk = q.size(-1)
    scaled_attention_logits = matmul_qk / torch.sqrt(torch.tensor(dk, dtype=torch.float32, device=q.device))
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)
    attention_weights = F.softmax(scaled_attention_logits, dim=-1)
    output = torch.matmul(attention_weights, v)
    return output

### Multi Head Atention 

In [46]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.dense = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        q = self.wq(q).view(batch_size, -1, self.num_heads, self.depth).transpose(1, 2)
        k = self.wk(k).view(batch_size, -1, self.num_heads, self.depth).transpose(1, 2)
        v = self.wv(v).view(batch_size, -1, self.num_heads, self.depth).transpose(1, 2)

        attention = scaled_dot_product_attention(q, k, v, mask)
        attention = attention.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.dense(attention)

### Initialize the feed-forward network with two linear layers, ReLU activation, and dropout regularization

In [47]:
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, ffn_units, dropout_rate):
        super(FeedForwardNetwork, self).__init__()
        self.linear1 = nn.Linear(d_model, ffn_units)
        self.linear2 = nn.Linear(ffn_units, d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.dropout(x)
        return self.linear2(x)

### Encoder: Multi Head Attention with Layer normalization

In [48]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, ffn_units, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, ffn_units, dropout_rate)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        attn_output = self.attention(x, x, x, mask)
        out1 = self.layernorm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(out1)
        return self.layernorm2(out1 + self.dropout(ffn_output))

### Decoder: With look ahead mask and Cross attention

In [49]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, ffn_units, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.attention1 = MultiHeadAttention(d_model, num_heads)
        self.attention2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, ffn_units, dropout_rate)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)
        self.layernorm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        attn1 = self.attention1(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(x + self.dropout(attn1))

        attn2 = self.attention2(out1, enc_output, enc_output, padding_mask)
        out2 = self.layernorm2(out1 + self.dropout(attn2))

        ffn_output = self.ffn(out2)
        return self.layernorm3(out2 + self.dropout(ffn_output))

In [50]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        return x + self.encoding[:, :x.size(1), :].to(x.device)

In [51]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, ffn_units, num_layers, dropout_rate, max_len):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        # Encoder
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, ffn_units, dropout_rate)
            for _ in range(num_layers)
        ])

        # Decoder
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, ffn_units, dropout_rate)
            for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, vocab_size)

    def create_look_ahead_mask(self, size):
        mask = torch.triu(torch.ones(size, size), diagonal=1)
        return mask == 1

    def forward(self, encoder_input, decoder_input, encoder_mask=None, decoder_mask=None):
        # Encoder
        encoder_embedded = self.embedding(encoder_input)
        encoder_embedded = self.positional_encoding(encoder_embedded)

        encoder_output = encoder_embedded
        for layer in self.encoder_layers:
            encoder_output = layer(encoder_output, encoder_mask)

        # Decoder
        decoder_embedded = self.embedding(decoder_input)
        decoder_embedded = self.positional_encoding(decoder_embedded)

        look_ahead_mask = self.create_look_ahead_mask(decoder_input.size(1)).to(decoder_input.device)

        decoder_output = decoder_embedded
        for layer in self.decoder_layers:
            decoder_output = layer(decoder_output, encoder_output, look_ahead_mask, encoder_mask)

        return self.fc(decoder_output)

In [52]:
model = Transformer(VOCAB_SIZE, D_MODEL, NUM_HEADS, FFN_UNITS, NUM_LAYERS, DROPOUT, MAX_LEN)
model = model.to(device)

In [53]:
dataset = ChatDataset(questions, answers, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

### Optimizer and loss functions

In [54]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

### Model Training

In [55]:
# Training Loop with Loss and Accuracy
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    total_accuracy = 0
    total_tokens = 0

    for questions_batch, answers_batch in dataloader:
        questions_batch = questions_batch.to(device)
        decoder_input = answers_batch[:, :-1].to(device)  # Input for decoder
        target = answers_batch[:, 1:].to(device)  # Target for training

        optimizer.zero_grad()

        output = model(questions_batch, decoder_input)
        loss = criterion(output.view(-1, VOCAB_SIZE), target.view(-1))

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Calculate accuracy
        predicted_tokens = torch.argmax(output, dim=-1)
        correct_tokens = (predicted_tokens == target).sum().item()
        total_accuracy += correct_tokens
        total_tokens += target.numel()

        total_loss += loss.item()

    epoch_loss = total_loss / len(dataloader)
    epoch_accuracy = total_accuracy / total_tokens

    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch 1/110, Loss: 4.3153, Accuracy: 0.5216
Epoch 2/110, Loss: 3.3063, Accuracy: 0.5690
Epoch 3/110, Loss: 3.0019, Accuracy: 0.5889
Epoch 4/110, Loss: 2.8196, Accuracy: 0.5959
Epoch 5/110, Loss: 2.7036, Accuracy: 0.6048
Epoch 6/110, Loss: 2.6044, Accuracy: 0.6143
Epoch 7/110, Loss: 2.4919, Accuracy: 0.6254
Epoch 8/110, Loss: 2.3878, Accuracy: 0.6333
Epoch 9/110, Loss: 2.2898, Accuracy: 0.6414
Epoch 10/110, Loss: 2.1843, Accuracy: 0.6520
Epoch 11/110, Loss: 2.0985, Accuracy: 0.6637
Epoch 12/110, Loss: 2.0063, Accuracy: 0.6741
Epoch 13/110, Loss: 1.9251, Accuracy: 0.6849
Epoch 14/110, Loss: 1.8394, Accuracy: 0.6964
Epoch 15/110, Loss: 1.7626, Accuracy: 0.7051
Epoch 16/110, Loss: 1.6848, Accuracy: 0.7151
Epoch 17/110, Loss: 1.6059, Accuracy: 0.7235
Epoch 18/110, Loss: 1.5341, Accuracy: 0.7341
Epoch 19/110, Loss: 1.4580, Accuracy: 0.7426
Epoch 20/110, Loss: 1.3835, Accuracy: 0.7523
Epoch 21/110, Loss: 1.3189, Accuracy: 0.7609
Epoch 22/110, Loss: 1.2410, Accuracy: 0.7723
Epoch 23/110, Loss:

### Export the Model

In [57]:
torch.save(model.state_dict(), 'transformer_chatbot_gpu_deco_1.pth')

### Load and Test the model

In [58]:
# Load the Trained Model
model.load_state_dict(torch.load("transformer_chatbot_gpu_deco_1.pth"))
model.eval()

# Chat Function
def chat_response(question, tokenizer, model, max_len=40):
    model.eval()
    question_seq = tokenizer.texts_to_sequences([question])[0]
    question_seq = [1] + question_seq[:max_len - 2] + [2]  # Add <start> and <end> tokens
    question_seq = question_seq + [0] * (max_len - len(question_seq))  # Pad to max_len
    question_tensor = torch.tensor([question_seq]).to(device)

    decoder_input = torch.tensor([[1]]).to(device)  # Start token
    response = []

    for _ in range(max_len):
        with torch.no_grad():
            output = model(question_tensor, decoder_input)
        predicted_id = torch.argmax(output[:, -1, :], dim=-1).item()
        if predicted_id == 2:  # End token
            break
        response.append(predicted_id)
        decoder_input = torch.cat([decoder_input, torch.tensor([[predicted_id]]).to(device)], dim=-1)

    return tokenizer.sequences_to_texts([response])[0].replace("<start>", "").replace("<end>", "").strip()

# Interactive Chat Loop
while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("Exiting chatbot. Goodbye!")
        break
    bot_response = chat_response(user_input, tokenizer, model)
    print(f"Bot: {bot_response}")


Bot: If price breaks below support, it quickly or breaks resistance to trap behavior.
Bot: A failed rally during Distribution or price breaks key levels that quickly or strong supply has been absorbed.
Bot: If buyers step in that the structure may signal in Accumulation or the CO may soon often a
Bot: An upthrust breaks resistance and reverses on structure support with a structure precedes to trap
Bot: This law states that price movement should be proportional to volume, a divergence signals a potential change.
Bot: The Wyckoff Method is a technical analysis approach focused on identifying the intentions of large institutional investors through price and volume analysis.
Bot: Accumulation is a phase where large players buy assets at low prices, absorbing supply before driving the price higher.
Bot: If structure is unclear, tests fail repeatedly, and price movement suggests trend upward.
Bot: If structure is unclear, tests fail repeatedly, and price movement suggests trend upward.
Exiti