### Preprocessing the dataset

In [7]:
# Clear the directory that holds the preprocessed data so the next
# preprocessing run starts from a clean slate.
import shutil

DATA_DIR = '/home/ubuntu/Research/Topic_Modelling/SLM2_v2/data'

try:
    shutil.rmtree(DATA_DIR)
except FileNotFoundError:
    # Nothing to clear (fresh checkout, or the cell was already run once);
    # any other failure (permissions, path is a file) is still raised.
    pass

In [8]:
# Run the preprocessing script as a shell command. Per its log below, it loads
# 20 Newsgroups, tokenizes into sentences and words, builds the word/POS/rule
# vocabularies, encodes the texts, splits train/test and saves the result.
!python preprocess.py

Loading 20 Newsgroups dataset...
Tokenizing texts into sentences and words...
Tokenizing: 100%|█████████████████████████| 18846/18846 [05:44<00:00, 54.64it/s]
Building vocabularies...
Word Vocabulary Size: 28017
POS Vocabulary Size: 19
Rule Vocabulary Size: 4
Encoding texts...
Splitting data into train and test sets...
Saving processed data...
Preprocessing completed successfully.


### Loading the model 

In [9]:
# Smoke-test the model definition: build it from example hyperparameters, push
# one random batch through it, and print a layer-by-layer summary.
import torch
from model import get_model
from torchinfo import summary
import warnings

# Installs a blanket "ignore" filter: every warning raised in this kernel from
# here on is suppressed.
warnings.filterwarnings('ignore')

# Define model parameters
# NOTE(review): these are example sizes. The preprocessing log above reports
# word/POS/rule vocabulary sizes of 28017/19/4 -- confirm that train.py builds
# the model from the real vocabularies rather than from these values.
vocab_size = 30522        # Example vocab size (e.g., from BERT tokenizer)
pos_vocab_size = 50       # Number of unique POS tags
rule_vocab_size = 6       # Number of unique rules (including padding)
num_classes = 20          # Number of output classes
embed_dim = 100
pos_embed_dim = 25
rule_embed_dim = 25
fusion_dim = 128
max_word_len = 128        # Maximum number of tokens per sentence
max_sent_len = 32         # Maximum number of sentences per document
max_rules_per_word = 3

# Instantiate the model
model = get_model(
    vocab_size=vocab_size,
    pos_vocab_size=pos_vocab_size,
    rule_vocab_size=rule_vocab_size,
    num_classes=num_classes,
    embed_dim=embed_dim,
    pos_embed_dim=pos_embed_dim,
    rule_embed_dim=rule_embed_dim,
    fusion_dim=fusion_dim,
    max_word_len=max_word_len,
    max_sent_len=max_sent_len,
    max_rules_per_word=max_rules_per_word
)

# Create dummy inputs to test the model: every document is padded to the
# maximum number of sentences and every sentence to the maximum token count.
batch_size = 2
num_sentences = max_sent_len
seq_length = max_word_len

# Dummy input tensors (all-ones masks mean "nothing is padding")
input_ids = torch.randint(0, vocab_size, (batch_size, num_sentences, seq_length))
attention_mask = torch.ones(batch_size, num_sentences, seq_length, dtype=torch.long)
pos_tags = torch.randint(0, pos_vocab_size, (batch_size, num_sentences, seq_length))
rules = torch.randint(0, rule_vocab_size, (batch_size, num_sentences, seq_length, max_rules_per_word))
sentence_masks = torch.ones(batch_size, num_sentences, dtype=torch.long)

# Move model and data to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
pos_tags = pos_tags.to(device)
rules = rules.to(device)
sentence_masks = sentence_masks.to(device)

# Forward pass. This is only a shape check, so run it in eval mode (dropout
# off) and without autograd bookkeeping, then restore whatever mode the model
# was in so the rest of the cell sees the same state as before.
was_training = model.training
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask, pos_tags, rules, sentence_masks)
model.train(was_training)
print("Model output shape:", outputs.shape)

# Print model summary (last expression, so the notebook renders its return value)
summary(model, input_data=(input_ids, attention_mask, pos_tags, rules, sentence_masks), depth=4)


Model output shape: torch.Size([2, 20])


Layer (type:depth-idx)                             Output Shape              Param #
HANTransformer                                     [2, 20]                   --
├─Embedding: 1-1                                   [64, 128, 100]            3,052,200
├─Embedding: 1-2                                   [64, 128, 25]             1,250
├─Embedding: 1-3                                   [64, 384, 25]             150
├─WordEncoder: 1-4                                 [64, 128, 128]            --
│    └─FusionLayer: 2-1                            [64, 128, 128]            --
│    │    └─Linear: 3-1                            [64, 128, 150]            22,650
│    │    └─Sigmoid: 3-2                           [64, 128, 150]            --
│    │    └─Linear: 3-3                            [64, 128, 128]            19,328
│    └─PositionalEncoding: 2-2                     [64, 128, 128]            --
│    │    └─Embedding: 3-4                         [1, 128, 128]             16,384
│    └─Dropo

### Training the SLM

In [None]:
# Clear the directory that holds model checkpoints so the next training run
# does not mix its checkpoints with older ones.
import shutil

MODEL_DIR = '/home/ubuntu/Research/Topic_Modelling/SLM2_v2/model'

try:
    shutil.rmtree(MODEL_DIR)
except FileNotFoundError:
    # Nothing to clear (no previous training run, or the cell was already run);
    # any other failure (permissions, path is a file) is still raised.
    pass

In [11]:
# Launch training as a shell command. The log below shows a 50-epoch schedule;
# each epoch reports train loss/accuracy, then test loss/accuracy, and prints
# "Best model saved." after each of the epochs shown.
!python train.py 

Loading preprocessed data...
Creating datasets and dataloaders...
Initializing the model...
Starting training...

Epoch 1/50
Training: 100%|███████████████████████████████| 472/472 [00:35<00:00, 13.23it/s]
Train Loss: 2.7484 | Train Acc: 0.1100
Evaluating: 100%|█████████████████████████████| 118/118 [00:03<00:00, 38.44it/s]
Test Loss: 2.4701 | Test Acc: 0.1679
Best model saved.

Epoch 2/50
Training: 100%|███████████████████████████████| 472/472 [00:34<00:00, 13.49it/s]
Train Loss: 2.2740 | Train Acc: 0.2228
Evaluating: 100%|█████████████████████████████| 118/118 [00:03<00:00, 38.48it/s]
Test Loss: 2.1707 | Test Acc: 0.2703
Best model saved.

Epoch 3/50
Training: 100%|███████████████████████████████| 472/472 [00:35<00:00, 13.48it/s]
Train Loss: 1.8951 | Train Acc: 0.3423
Evaluating: 100%|█████████████████████████████| 118/118 [00:03<00:00, 38.43it/s]
Test Loss: 1.8794 | Test Acc: 0.3515
Best model saved.

Epoch 4/50
Training: 100%|███████████████████████████████| 472/472 [00:34<00:00, 1

### Evaluating the model

In [13]:
# Evaluate the saved model on the test set as a shell command. Per the log
# below, the script loads the model weights, then prints test loss, test
# accuracy and a per-class classification report.
!python evaluate.py

Loading preprocessed data...
Creating dataset and dataloader...
Initializing the model...
  model.load_state_dict(torch.load(model_path, map_location=DEVICE))
Model loaded successfully.
Evaluating the model on the test set...
Evaluating: 100%|█████████████████████████████| 118/118 [00:03<00:00, 32.28it/s]

Test Loss: 1.9363 | Test Accuracy: 0.6228

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.49      0.51      0.50       151
           comp.graphics       0.54      0.54      0.54       202
 comp.os.ms-windows.misc       0.58      0.59      0.59       195
comp.sys.ibm.pc.hardware       0.58      0.54      0.56       183
   comp.sys.mac.hardware       0.71      0.63      0.67       205
          comp.windows.x       0.73      0.77      0.75       215
            misc.forsale       0.65      0.65      0.65       193
               rec.autos       0.42      0.68      0.52       196
         rec.motorcycles       

### Predictions 
#### Run `python predict.py` in a terminal — the script prompts for input interactively

In [14]:
# NOTE: predict.py loads the model and then blocks on an interactive
# input(">> ") prompt. In the recorded run below no text was entered and the
# cell was interrupted (KeyboardInterrupt), so run the script from a terminal.
!python predict.py

  model.load_state_dict(torch.load(model_path, map_location=DEVICE))
Model loaded successfully.

Enter text to classify (type 'exit' to quit):

>> ^C
Traceback (most recent call last):
  File "/home/ubuntu/Research/Topic_Modelling/SLM2_v2/predict.py", line 254, in <module>
    main()
  File "/home/ubuntu/Research/Topic_Modelling/SLM2_v2/predict.py", line 223, in main
    user_input = input(">> ")
                 ^^^^^^^^^^^^
KeyboardInterrupt
