In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the course
# data files can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the project folder on the mounted Drive to confirm the data files are visible.
!ls drive/MyDrive/INFO\ 617/


'INFO 617_Group Project Test Set.xlsx'	 'INFO 617 Mental Health QA_LIWC.csv'
'INFO 617 Group Project Train Val.xlsx'  'INFO 617_QA_Question_Category.csv'
'INFO 617_Mental Health_QA.csv'


In [None]:
import pandas as pd
# Read the train/validation workbook and the held-out test workbook from Drive.
train_df = pd.read_excel('/content/drive/MyDrive/INFO 617/INFO 617 Group Project Train Val.xlsx')
test_df = pd.read_excel('/content/drive/MyDrive/INFO 617/INFO 617_Group Project Test Set.xlsx')

# Pull the sentence and label columns out of each frame as plain Python lists.
train_texts, train_labels = (train_df[column].tolist() for column in ('Sentence', 'Label'))
test_texts, test_labels = (test_df[column].tolist() for column in ('Sentence', 'Label'))

In [None]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,Sentence,Label
0,"I have calculated it for you, and your BMI is ...",DIAGNOISE
1,"In this case, you need to eat more carbohydrat...",TREAT
2,"Normally, you should consume 6-8 liang of stap...",TREAT
3,"With this, your weight should increase by 8 li...",EXPLAIN
4,"When dining outside or buying takeaway, you sh...",TREAT


In [None]:
# Preview the first rows of the train/validation set.
train_df.head()

Unnamed: 0,Sentence,Label
0,"Hello,",GREET
1,the rubella virus IgM was detected at around ...,REPEAT
2,which has a 50% chance of causing intrauterine...,DIAGNOISE
3,Intrauterine infection can lead to risks such ...,EXPLAIN
4,Please note that these risks may or may not oc...,EXPLAIN


In [None]:
import torch

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by the model-building, training and evaluation code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
from sklearn.model_selection import train_test_split

# Always split from the untouched DataFrame columns rather than from
# `train_texts` / `train_labels`. The previous version fed the cell's own outputs
# back in as inputs, so every re-run of the cell carved another 10% off the
# training set. Splitting from `train_df` makes the cell idempotent; on a first
# run the result is identical because the inputs and random_state are the same.
all_texts = train_df['Sentence'].tolist()
all_labels = train_df['Label'].tolist()

# Stratified 90/10 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels,
    test_size=0.1,
    stratify=all_labels,
    random_state=42
)


In [None]:
import numpy as np

def tokenize(texts):
    """Tokenize texts into lowercased word lists and build a vocabulary.

    Tokenization is ``text.lower().split()``, i.e. lowercasing followed by a
    split on runs of whitespace.

    Args:
        texts: Iterable of strings.

    Returns:
        A tuple ``(tokenized_texts, word2idx, max_len)`` where
        ``tokenized_texts`` is a list of token lists (one per input text),
        ``word2idx`` maps ``"<PAD>"`` to 0, ``"<UNK>"`` to 1 and every distinct
        token to consecutive ids in sorted order, and ``max_len`` is the length
        of the longest token list (0 when ``texts`` is empty).
    """
    tokenized_texts = [text.lower().split() for text in texts]
    vocab = {token for tokens in tokenized_texts for token in tokens}

    # Reserved ids first; real tokens follow in sorted order so the mapping is deterministic.
    word2idx = {"<PAD>": 0, "<UNK>": 1}
    for word in sorted(vocab):
        word2idx[word] = len(word2idx)

    # default=0 keeps an empty corpus from raising ValueError inside max().
    max_len = max((len(tokens) for tokens in tokenized_texts), default=0)
    return tokenized_texts, word2idx, max_len

def encode(tokenized_texts, word2idx, max_len):
    """Convert token lists to a fixed-width matrix of vocabulary ids.

    Each token list is truncated to ``max_len`` tokens, mapped through
    ``word2idx`` (unknown tokens become the ``"<UNK>"`` id) and right-padded
    with the ``"<PAD>"`` id.

    Args:
        tokenized_texts: Iterable of token lists.
        word2idx: Mapping from token to integer id. ``"<UNK>"`` and ``"<PAD>"``
            default to 1 and 0 respectively when missing from the mapping.
        max_len: Number of columns in the output.

    Returns:
        ``numpy.ndarray`` of dtype int64 and shape ``(len(tokenized_texts), max_len)``.
        The shape is 2-D even for empty input, where the previous list-based
        construction produced a 1-D float array.
    """
    unk_idx = word2idx.get("<UNK>", 1)
    pad_idx = word2idx.get("<PAD>", 0)

    tokenized_texts = list(tokenized_texts)  # allow any iterable, we need len()
    # Start from an all-padding matrix and overwrite the leading positions of each row.
    encoded = np.full((len(tokenized_texts), max_len), pad_idx, dtype=np.int64)
    for row, tokens in enumerate(tokenized_texts):
        ids = [word2idx.get(token, unk_idx) for token in tokens[:max_len]]
        if ids:
            encoded[row, :len(ids)] = ids

    return encoded


In [None]:
# Build the vocabulary and maximum sentence length from the training split.
# NOTE(review): the next cell repeats this exact call as its first line, so this
# cell is redundant when the notebook is run top to bottom.
tokenized_train, word2idx, max_len = tokenize(train_texts)


In [None]:
# The vocabulary and the padded width come from the training split only.
tokenized_train, word2idx, max_len = tokenize(train_texts)
train_input_ids = encode(tokenized_train, word2idx, max_len)

# Validation and test sentences are lowercased and whitespace-split the same way,
# then encoded with the training vocabulary: unseen words map to <UNK> and
# sentences longer than max_len are truncated by encode().
tokenized_val, tokenized_test = (
    [sentence.lower().split() for sentence in split_texts]
    for split_texts in (val_texts, test_texts)
)
val_input_ids = encode(tokenized_val, word2idx, max_len)
test_input_ids = encode(tokenized_test, word2idx, max_len)


In [None]:
import numpy as np

def load_pretrained_vectors(word2idx, vector_path, embedding_dim=300):
    """Load pre-trained word vectors and return an embedding matrix.

    The file is read as text with one ``word v1 v2 ... vN`` entry per line; the
    first line is treated as a header and skipped. Rows for words that do not
    occur in the file keep their random initialisation, drawn uniformly from
    [-0.25, 0.25).

    Args:
        word2idx: Mapping from token to row index in the returned matrix.
        vector_path: Path to the text-format vector file.
        embedding_dim: Dimensionality of the vectors in the file. Defaults to
            300 (FastText crawl-300d).

    Returns:
        ``numpy.ndarray`` of dtype float32 and shape ``(len(word2idx), embedding_dim)``.

    Raises:
        ValueError: If the vector of a vocabulary word cannot be parsed or does
            not have ``embedding_dim`` components.
    """
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), embedding_dim)).astype(np.float32)

    found = 0
    with open(vector_path, encoding="utf-8", errors="ignore") as f:
        next(f, None)  # skip header; the default avoids StopIteration on an empty file
        for line in f:
            # Split off only the word first. Parsing the floats is by far the
            # expensive part, so do it only for words that are in the vocabulary
            # instead of for every line of the file.
            word, _, rest = line.rstrip().partition(" ")
            idx = word2idx.get(word)
            if idx is None:
                continue
            embeddings[idx] = np.asarray(rest.split(" "), dtype="float32")
            found += 1

    print(f"✅ Loaded {found} vectors from {vector_path}")
    return embeddings


In [None]:
!mkdir -p fastText
!wget -O fastText/crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip fastText/crawl-300d-2M.vec.zip -d fastText


--2025-04-26 16:16:14--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.15, 13.226.210.78, 13.226.210.25, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘fastText/crawl-300d-2M.vec.zip’


2025-04-26 16:16:24 (136 MB/s) - ‘fastText/crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]

Archive:  fastText/crawl-300d-2M.vec.zip
  inflating: fastText/crawl-300d-2M.vec  


In [None]:
# Build the embedding matrix for the training vocabulary from the unzipped fastText file.
FT_PATH = "fastText/crawl-300d-2M.vec"  # or your Drive path
embeddings_np = load_pretrained_vectors(word2idx, FT_PATH)
# Float32 torch copy of the matrix; the model cells pass it as `pretrained_embedding`.
embeddings = torch.tensor(embeddings_np, dtype=torch.float32)


✅ Loaded 5053 vectors from fastText/crawl-300d-2M.vec


In [None]:
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# Map string labels to integer class ids. The encoder is fitted on the training
# labels only and then applied unchanged to the validation and test labels.
le = LabelEncoder()
train_lbls = le.fit_transform(train_labels)
val_lbls, test_lbls = (le.transform(split_labels) for split_labels in (val_labels, test_labels))

# Token-id matrices and label vectors as int64 tensors.
Xtr, Xvl, Xte = (
    torch.tensor(id_matrix.tolist(), dtype=torch.long)
    for id_matrix in (train_input_ids, val_input_ids, test_input_ids)
)
Ytr, Yvl, Yte = (
    torch.tensor(label_ids, dtype=torch.long)
    for label_ids in (train_lbls, val_lbls, test_lbls)
)

# Batches of 50: training batches are drawn in random order, validation and
# test batches in their original order.
train_loader = DataLoader(TensorDataset(Xtr, Ytr), batch_size=50, sampler=RandomSampler(Xtr))
val_loader = DataLoader(TensorDataset(Xvl, Yvl), batch_size=50, sampler=SequentialSampler(Xvl))
test_loader = DataLoader(TensorDataset(Xte, Yte), batch_size=50, sampler=SequentialSampler(Xte))


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_NLP(nn.Module):
    """1-D convolutional text classifier over word embeddings.

    Token ids are embedded, passed through one ``Conv1d`` per entry of
    ``filter_sizes``, ReLU-activated, max-pooled over the sequence dimension,
    concatenated, put through dropout and projected to ``num_classes`` logits.
    """

    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """Build the embedding, convolution, dropout and output layers.

        Args:
            pretrained_embedding: Optional 2-D tensor ``(vocab, dim)`` of initial
                embedding weights. When given, ``vocab_size`` and ``embed_dim``
                are ignored and the dimension is taken from the tensor.
            freeze_embedding: If True, pre-trained embeddings are not updated
                during training. Only used with ``pretrained_embedding``.
            vocab_size: Number of embedding rows; required when no pre-trained
                embedding is supplied.
            embed_dim: Embedding width when no pre-trained embedding is supplied.
            filter_sizes: Kernel size of each convolution.
            num_filters: Output channels of each convolution; must have the same
                length as ``filter_sizes``.
            num_classes: Number of output logits.
            dropout: Dropout probability applied before the output layer.

        Raises:
            ValueError: If ``filter_sizes`` and ``num_filters`` differ in length,
                or if neither ``pretrained_embedding`` nor ``vocab_size`` is given.
        """
        super(CNN_NLP, self).__init__()

        if len(filter_sizes) != len(num_filters):
            raise ValueError("filter_sizes and num_filters must have the same length.")

        if pretrained_embedding is not None:
            # from_pretrained uses the tensor it is given as the weight storage.
            # Without the clone, fine-tuning this model on CPU would modify the
            # caller's tensor in place and contaminate any other model that is
            # later built from the same tensor.
            self.embedding = nn.Embedding.from_pretrained(
                pretrained_embedding.detach().clone(), freeze=freeze_embedding)
            self.embed_dim = pretrained_embedding.shape[1]
        else:
            if vocab_size is None:
                raise ValueError("vocab_size is required when pretrained_embedding is None.")
            self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
            self.embed_dim = embed_dim

        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=n_out,
                      kernel_size=kernel)
            for kernel, n_out in zip(filter_sizes, num_filters)
        ])

        # Built-in sum gives a plain int and needs no global numpy import.
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Return class logits of shape (B, num_classes) for token ids of shape (B, L)."""
        x_embed = self.embedding(input_ids).float()     # (B, L, D)
        x_reshaped = x_embed.permute(0, 2, 1)           # (B, D, L) as expected by Conv1d
        x_conv = [F.relu(conv(x_reshaped)) for conv in self.conv1d_list]
        # Max over the whole remaining sequence length: one value per filter.
        x_pool = [F.max_pool1d(c, kernel_size=c.shape[2]).squeeze(2) for c in x_conv]
        x_cat = torch.cat(x_pool, dim=1)
        x_out = self.dropout(x_cat)
        logits = self.fc(x_out)
        return logits


In [None]:
import torch.optim as optim

def initilize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=None,
                    dropout=0.5,
                    learning_rate=0.01):
    """Create a CNN_NLP on the global ``device`` together with an Adadelta optimizer.

    All arguments except ``learning_rate`` are forwarded unchanged to ``CNN_NLP``.
    ``num_classes`` has no usable default and must be supplied.

    Returns:
        Tuple ``(model, optimizer)``; the optimizer is Adadelta with
        ``lr=learning_rate`` and ``rho=0.95`` over all model parameters.
    """
    assert num_classes is not None, "You must specify num_classes."

    cnn_kwargs = {
        "pretrained_embedding": pretrained_embedding,
        "freeze_embedding": freeze_embedding,
        "vocab_size": vocab_size,
        "embed_dim": embed_dim,
        "filter_sizes": filter_sizes,
        "num_filters": num_filters,
        "num_classes": num_classes,
        "dropout": dropout,
    }
    net = CNN_NLP(**cnn_kwargs)
    net.to(device)

    adadelta = optim.Adadelta(net.parameters(), lr=learning_rate, rho=0.95)
    return net, adadelta


In [None]:
import time
import random

# Cross-entropy over raw logits; read as a global by evaluate() and train().
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and torch (CPU and all CUDA devices) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def evaluate(model, dataloader):
    """Compute the mean loss and the accuracy (in percent) of ``model`` over ``dataloader``.

    Both metrics are averaged over examples, not over batches. Averaging the
    per-batch means (as was done before) over-weights a smaller final batch and
    gives slightly biased numbers whenever the dataset size is not a multiple of
    the batch size.

    Uses the global ``device`` and the global ``loss_fn``; ``loss_fn`` is assumed
    to return the mean loss over the batch, which is the default reduction of
    ``nn.CrossEntropyLoss``.

    Args:
        model: Module mapping a batch of inputs to logits of shape (B, num_classes).
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        Tuple ``(mean_loss, accuracy_percent)`` of floats; both are NaN when the
        dataloader yields no examples.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_examples = 0
    with torch.no_grad():
        for Xb, yb in dataloader:
            Xb, yb = Xb.to(device), yb.to(device)
            logits = model(Xb)
            batch_size = yb.size(0)
            # Undo the per-batch mean so batches of different sizes are weighted correctly.
            total_loss += loss_fn(logits, yb).item() * batch_size
            total_correct += (torch.argmax(logits, dim=1) == yb).sum().item()
            total_examples += batch_size

    if total_examples == 0:
        return float("nan"), float("nan")
    return total_loss / total_examples, 100.0 * total_correct / total_examples

def train(model, optimizer, train_loader, val_loader, epochs=15, restore_best=False):
    """Train ``model`` for ``epochs`` epochs and print one progress row per epoch.

    After every epoch the model is scored on ``val_loader`` with ``evaluate``.
    The best validation accuracy seen is reported at the end.

    Uses the global ``device`` and the global ``loss_fn``.

    Args:
        model: Module to train in place.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        restore_best: If True, the weights from the epoch with the best
            validation accuracy are loaded back into ``model`` before returning.
            With the default (False) the model keeps its last-epoch weights, so
            the "best validation accuracy" printed at the end does not
            necessarily describe the model that is returned.
    """
    import copy

    best_val_acc = 0
    best_state = None
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-" * 60)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        start_time = time.time()

        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            model.zero_grad()
            logits = model(Xb)
            loss = loss_fn(logits, yb)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        val_loss, val_acc = evaluate(model, val_loader)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            if restore_best:
                # Snapshot by value; state_dict() alone returns references to live tensors.
                best_state = copy.deepcopy(model.state_dict())

        elapsed = time.time() - start_time  # includes the validation pass
        # max(..., 1) keeps an empty train_loader from raising ZeroDivisionError.
        avg_train_loss = total_loss / max(len(train_loader), 1)
        print(f"{epoch + 1:^7} | {avg_train_loss:^12.4f} | {val_loss:^10.4f} | {val_acc:^9.2f} | {elapsed:^9.2f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    print(f"\n✅ Training complete! Best validation accuracy: {best_val_acc:.2f}%")



In [None]:
# Seed Python, NumPy and torch before building the first model.
set_seed(42)

# Baseline run: pre-trained fastText embeddings that are fine-tuned during training.
# These are the same settings as the "CNN-nonstatic" cell further down.
cnn_model, optimizer = initilize_model(
    pretrained_embedding=embeddings,
    freeze_embedding=False,
    vocab_size=len(word2idx),
    embed_dim=embeddings.shape[1],
    num_classes=len(le.classes_),     # from LabelEncoder
    learning_rate=0.25,
    dropout=0.5
)

train(cnn_model, optimizer, train_loader, val_loader, epochs=20)


 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |    1.9625    |   1.7976   |   45.56   |   20.10  
   2    |    1.5454    |   1.4172   |   55.48   |   18.75  
   3    |    1.2472    |   1.1691   |   60.15   |   19.07  
   4    |    1.0474    |   1.0407   |   63.26   |   19.36  
   5    |    0.9169    |   0.9476   |   68.96   |   19.77  
   6    |    0.8120    |   0.8876   |   70.30   |   19.76  
   7    |    0.7295    |   0.8512   |   72.52   |   19.63  
   8    |    0.6424    |   0.8420   |   73.85   |   19.64  
   9    |    0.5892    |   0.7989   |   74.52   |   19.61  
  10    |    0.5314    |   0.7868   |   72.96   |   19.72  
  11    |    0.4645    |   0.7741   |   74.30   |   19.67  
  12    |    0.4244    |   0.8173   |   74.52   |   19.59  
  13    |    0.3711    |   0.8017   |   74.07   |   19.59  
  14    |    0.3451    |   0.7874   |   74.96   |   19.71  
  15    |    0.3065    |   0.8329   |  

In [None]:
# Score the baseline model on the held-out test set.
test_loss, test_acc = evaluate(cnn_model, test_loader)
print(f"\n Test Accuracy: {test_acc:.2f}% | Test Loss: {test_loss:.4f}")


 Test Accuracy: 73.58% | Test Loss: 0.7968


In [None]:
# CNN-rand (Random embeddings)
# Re-seed before building the model so this run does not depend on the RNG state
# left behind by earlier cells, and so re-running this cell alone reproduces it.
set_seed(42)
cnn_rand, optimizer_rand = initilize_model(
    pretrained_embedding=None,
    freeze_embedding=False,
    vocab_size=len(word2idx),
    embed_dim=300,
    num_classes=len(le.classes_),
    learning_rate=0.25,
    dropout=0.5
)
train(cnn_rand, optimizer_rand, train_loader, val_loader, epochs=20)
test_loss_rand, test_acc_rand = evaluate(cnn_rand, test_loader)
print(f"\n CNN-rand Test Accuracy: {test_acc_rand:.2f}% | Test Loss: {test_loss_rand:.4f}")

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |    1.7619    |   1.3530   |   54.37   |   19.56  
   2    |    1.1777    |   1.1900   |   58.74   |   19.95  
   3    |    0.8961    |   1.1011   |   66.96   |   19.74  
   4    |    0.7072    |   1.0308   |   70.07   |   19.53  
   5    |    0.5676    |   0.9632   |   71.85   |   19.53  
   6    |    0.4426    |   0.8991   |   70.07   |   19.66  
   7    |    0.3515    |   0.9641   |   70.96   |   19.70  
   8    |    0.2974    |   0.8964   |   70.30   |   19.64  
   9    |    0.2408    |   0.9686   |   70.07   |   19.63  
  10    |    0.2028    |   0.9733   |   72.07   |   19.65  
  11    |    0.1729    |   0.9854   |   70.07   |   19.64  
  12    |    0.1627    |   0.9513   |   71.19   |   19.61  
  13    |    0.1318    |   0.9434   |   71.63   |   19.58  
  14    |    0.1118    |   0.9910   |   70.74   |   19.64  
  15    |    0.1106    |   1.0183   |  

In [None]:
# CNN-static (Pre-trained embeddings, frozen)
# Re-seed before building the model so this run does not depend on the RNG state
# left behind by earlier cells, and so re-running this cell alone reproduces it.
set_seed(42)
cnn_static, optimizer_static = initilize_model(
    pretrained_embedding=embeddings,
    freeze_embedding=True,
    vocab_size=len(word2idx),
    embed_dim=embeddings.shape[1],
    num_classes=len(le.classes_),
    learning_rate=0.25,
    dropout=0.5
)
train(cnn_static, optimizer_static, train_loader, val_loader, epochs=20)
test_loss_static, test_acc_static = evaluate(cnn_static, test_loader)
print(f"\n CNN-static Test Accuracy: {test_acc_static:.2f}% | Test Loss: {test_loss_static:.4f}")

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |    1.9677    |   1.7965   |   45.78   |   19.49  
   2    |    1.5589    |   1.4440   |   54.37   |   19.64  
   3    |    1.2681    |   1.1828   |   59.70   |   19.47  
   4    |    1.0652    |   1.0652   |   64.52   |   19.39  
   5    |    0.9233    |   0.9714   |   68.07   |   19.40  
   6    |    0.8259    |   0.9009   |   72.30   |   19.47  
   7    |    0.7465    |   0.8623   |   72.74   |   19.48  
   8    |    0.6718    |   0.8718   |   72.30   |   19.43  
   9    |    0.6174    |   0.8329   |   75.41   |   19.43  
  10    |    0.5511    |   0.8091   |   74.52   |   19.46  
  11    |    0.4994    |   0.8022   |   74.96   |   19.46  
  12    |    0.4487    |   0.7888   |   75.19   |   19.40  
  13    |    0.4107    |   0.7915   |   74.07   |   19.40  
  14    |    0.3718    |   0.7863   |   76.30   |   19.44  
  15    |    0.3454    |   0.7451   |  

In [None]:
# CNN-nonstatic (Pre-trained embeddings, fine-tuned)
# Re-seed before building the model so this run does not depend on the RNG state
# left behind by earlier cells, and so re-running this cell alone reproduces it.
set_seed(42)
cnn_nonstatic, optimizer_nonstatic = initilize_model(
    pretrained_embedding=embeddings,
    freeze_embedding=False,
    vocab_size=len(word2idx),
    embed_dim=embeddings.shape[1],
    num_classes=len(le.classes_),
    learning_rate=0.25,
    dropout=0.5
)
train(cnn_nonstatic, optimizer_nonstatic, train_loader, val_loader, epochs=20)
test_loss_nonstatic, test_acc_nonstatic = evaluate(cnn_nonstatic, test_loader)
print(f"\n CNN-nonstatic Test Accuracy: {test_acc_nonstatic:.2f}% | Test Loss: {test_loss_nonstatic:.4f}")


 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |    1.9854    |   1.8136   |   42.22   |   19.61  
   2    |    1.5737    |   1.4372   |   55.48   |   19.95  
   3    |    1.2538    |   1.1710   |   60.59   |   19.73  
   4    |    1.0607    |   1.0355   |   63.93   |   19.51  
   5    |    0.9208    |   0.9280   |   71.41   |   19.53  
   6    |    0.8116    |   0.9377   |   68.96   |   19.65  
   7    |    0.7312    |   0.8597   |   72.30   |   19.70  
   8    |    0.6601    |   0.8414   |   73.19   |   19.64  
   9    |    0.5909    |   0.8163   |   74.30   |   19.63  
  10    |    0.5336    |   0.8289   |   72.30   |   19.65  
  11    |    0.4689    |   0.7951   |   74.07   |   19.59  
  12    |    0.4382    |   0.8373   |   73.85   |   19.60  
  13    |    0.3805    |   0.8045   |   73.41   |   19.62  
  14    |    0.3493    |   0.8018   |   73.85   |   19.61  
  15    |    0.3087    |   0.7685   |  

In [None]:
import pandas as pd

# One (name, test accuracy, test loss) triple per trained model.
model_runs = [
    ('Original CNN', test_acc, test_loss),
    ('CNN-rand', test_acc_rand, test_loss_rand),
    ('CNN-static', test_acc_static, test_loss_static),
    ('CNN-nonstatic', test_acc_nonstatic, test_loss_nonstatic),
]
run_names, run_accuracies, run_losses = zip(*model_runs)

# Column-oriented dictionary backing the summary table.
results = {
    'Model': list(run_names),
    'Test Accuracy (%)': list(run_accuracies),
    'Test Loss': list(run_losses),
}

# Build the table and order it from highest to lowest test accuracy.
results_df = pd.DataFrame(results).sort_values('Test Accuracy (%)', ascending=False)

# Show the table with the notebook's rich display.
print("\n📊 Summary of Model Performances:")
display(results_df)



📊 Summary of Model Performances:


Unnamed: 0,Model,Test Accuracy (%),Test Loss
0,Original CNN,73.578827,0.796784
2,CNN-static,73.020269,0.795674
3,CNN-nonstatic,72.686935,0.801812
1,CNN-rand,69.486485,1.0201
