In [1]:
import torch, torchdata, torchtext
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

In [2]:
torch.__version__, torchdata.__version__, torchtext.__version__

('2.2.2+cu121', '0.7.1', '0.17.2+cpu')

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
SEED = 1234 #change three times
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## 1. load dataset

In [5]:
df = pd.read_csv('data/pre-category/sample100percategoty.csv')
df.head(5)

Unnamed: 0,questionType,question
0,yes/no,Does this system filter gaillardia?
1,yes/no,Can you get a gas dryer instead of electric
2,yes/no,I have a 2010 maytag centennial washer w101409...
3,yes/no,CAN YOU JUST PLUG IT IN ONCE IT IS SEATED?
4,yes/no,does this replace a thermadore water filter ca...


In [6]:
# convert to lower case
df['question']  =  df['question'].apply(lambda x: x.lower() if isinstance(x, str) else x)

In [7]:
# tranform the text question type to integer
df['questionType']=df['questionType'].replace(['yes/no','open-ended'], [0,1])
df.head(5)

Unnamed: 0,questionType,question
0,0,does this system filter gaillardia?
1,0,can you get a gas dryer instead of electric
2,0,i have a 2010 maytag centennial washer w101409...
3,0,can you just plug it in once it is seated?
4,0,does this replace a thermadore water filter ca...


In [8]:
def data_cleaning(data):
    regex_s = re.sub("\\(.+?\\)|[\r\n|\n\r]|!", "", data)
    fin = " ".join(regex_s.split())
    return fin

In [9]:
df['question'] = df['question'].apply(data_cleaning)

In [10]:
train_df, val_df = train_test_split(df, test_size=0.15,stratify=df['questionType'], random_state=SEED)

In [11]:
train_df, test_df = train_test_split(train_df, test_size=0.15, stratify=train_df['questionType'],random_state=SEED)

In [12]:
train_df['questionType'].value_counts()

questionType
0    759
1    758
Name: count, dtype: int64

In [13]:
val_df['questionType'].value_counts()

questionType
1    158
0    157
Name: count, dtype: int64

In [14]:
test_df['questionType'].value_counts()

questionType
0    134
1    134
Name: count, dtype: int64

In [15]:
test_df['question'][1848]

'can you use this to get the temperature of wine through the bottle?'

## 2.Preprocessing

### Tokenizing

In [16]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
tokens    = tokenizer("What is the best product?")
tokens

['What', 'is', 'the', 'best', 'product', '?']

### Text to integers (numeral)

In [17]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data):
    for data_sample in data:
        yield tokenizer(data_sample) 
        
vocab = build_vocab_from_iterator(yield_tokens(train_df['question']), specials = ['<unk>', '<pad>', '<bos>', '<eos>'])
vocab.set_default_index(vocab["<unk>"])

In [18]:
vocab(['here', 'it', 'is'])

[842, 11, 7]

In [19]:
mapping = vocab.get_itos()
mapping[0]

'<unk>'

In [20]:
len(vocab)

3905

## 3. Data loader

### FastText Embedding

In [21]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple')

In [22]:
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [23]:
fast_embedding.shape

torch.Size([3905, 300])

In [24]:
class DataWrap(Dataset):

    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def __len__(self):
        return len(self.dataframe)
        
    def __getitem__(self, idx):
        return self.dataframe.iloc[idx]

In [25]:
train_df.iloc[0]

questionType                                                    1
question        do these things cover the front and the back o...
Name: 381, dtype: object

In [26]:
train = DataWrap(train_df)
valid = DataWrap(val_df)
test = DataWrap(test_df)

In [27]:
text_pipeline  = lambda x: vocab(tokenizer(x)) #{hello world this is yt} => {'hello', 'world', 'this', 'is', 'yt'} => {4, 88, 11, 22, 6}

In [28]:
text_pipeline("I am currently teaching LSTM")

[0, 66, 2222, 0, 0]

In [29]:
from torch.utils.data   import DataLoader
from torch.nn.utils.rnn import pad_sequence

pad_idx = vocab['<pad>'] 

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
    #criterion expects float labels
    return torch.tensor(label_list, dtype=torch.int64), pad_sequence(text_list, padding_value=pad_idx, batch_first=True)

In [30]:
batch_size = 64

train_loader = DataLoader(train, batch_size=batch_size, shuffle=True,  collate_fn=collate_batch) #num_workers to train faster
val_loader   = DataLoader(valid, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_loader  = DataLoader(test,  batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [31]:
for label, text in val_loader:
    break

In [32]:
label.shape #(batch_size, )

torch.Size([64])

In [33]:
text.shape #(batch_size, seq len)

torch.Size([64, 36])

## 4. Model (CNN)

In [34]:
class CNN(nn.Module): #more elegant version
    def __init__(self, input_dim, emb_dim, output_dim, dropout, n_filters, filter_sizes):
        
        super().__init__()
                
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, emb_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
                
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

## 5. Train

In [35]:
#explicitly initialize weights for better learning
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.Conv2d, nn.Conv2d)):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.kaiming_normal_(param) 

In [36]:
def accuracy(preds, y):
    
    predicted = torch.max(preds.data, 1)[1]
    batch_corr = (predicted == y).sum()
    acc = batch_corr / len(y)
    
    return acc

In [37]:
def train(model, loader, optimizer, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.train() #useful for batchnorm and dropout
    
    for i, (label, text) in enumerate(loader): 
        label = label.to(device) #(batch_size, )
        text = text.to(device) #(batch_size, seq len)
                
        #predict
        predictions = model(text).squeeze(1) #output by the fc is (batch_size, 1), thus need to remove this 1
        
        #calculate loss
        loss = criterion(predictions, label)
        acc  = accuracy(predictions, label)
        
        #backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
                        
    return epoch_loss / loader_length, epoch_acc / loader_length

In [38]:
def evaluate(model, loader, criterion, loader_length):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    
    with torch.no_grad():
        for i, (label, text) in enumerate(loader): 
            label = label.to(device) #(batch_size, )
            text  = text.to(device)  #(seq len, batch_size)

            predictions = model(text).squeeze(1) 
            
            loss = criterion(predictions, label)
            acc  = accuracy(predictions, label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / loader_length, epoch_acc / loader_length

### Actual training

In [39]:
train_loader_length = len(list(iter(train_loader)))
val_loader_length   = len(list(iter(val_loader)))
test_loader_length  = len(list(iter(test_loader)))

In [40]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [41]:
#experiment tracking
import mlflow
from mlflow.models import infer_signature
import os

# This the dockerized method.
# We build two docker containers, one for python/jupyter and another for mlflow.
# The url `mlflow` is resolved into another container within the same composer.
mlflow.set_tracking_uri("http://mlflow:5000")
# In the dockerized way, the user who runs this code will be `root`.
# The MLflow will also log the run user_id as `root`.
# To change that, we need to set this environ["LOGNAME"] to your name.
os.environ["LOGNAME"] = "noppawee"
#mlflow.create_experiment(name="noppawee-ML-project")  #create if you haven't create
mlflow.set_experiment(experiment_name="CNN_question_classification")



* 'schema_extra' has been renamed to 'json_schema_extra'


<Experiment: artifact_location='mlflow-artifacts:/317974692446439529', creation_time=1713187477885, experiment_id='317974692446439529', last_update_time=1713187477885, lifecycle_stage='active', name='CNN_question_classification', tags={}>

In [42]:
import time
import torch.optim as optim

num_epochs = [10,20,30]
n_filters = [50,100,150]


for num_epoch in num_epochs:
    for n_filter in n_filters:
        
        input_dim  = len(vocab)
        emb_dim    = 300
        output_dim = 2 #2 classes

        #for cnn
        dropout = 0.5
        filter_sizes = [3, 4, 5]

        params={"model":"CNN", "num_epochs":num_epoch, "n_filters":n_filter, "filter_sizes":filter_sizes, "input_dim":input_dim, "emb_dim":emb_dim, "output_dim":output_dim, "dropout":0.5}
        mlflow.start_run(run_name=f"CNN100-{params['num_epochs']}-epochs-{params['n_filters']}-n_filters")
        mlflow.log_params(params)

        print("="*5, f"CNN with {params['num_epochs']}-epochs-{params['n_filters']}-n_filters","="*5)

        model = CNN(input_dim, emb_dim, output_dim, dropout, n_filter, filter_sizes).to(device)
        model.apply(initialize_weights)
        model.embedding.weight.data = fast_embedding

        lr=0.05
        #training hyperparameters
        optimizer = optim.SGD(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        train_losses, train_accs, val_losses, val_accs = [],[],[],[]
        best_valid_loss = float('inf')

        for epoch in range(num_epoch):
            start_time = time.time()
                
            train_loss, train_acc = train(model, train_loader, optimizer, criterion, train_loader_length)
            valid_loss, valid_acc = evaluate(model, val_loader, criterion, val_loader_length)
                
            #for plotting
            train_losses.append(train_loss)
            train_accs.append(train_acc)
            val_losses.append(valid_loss)
            val_accs.append(valid_acc)
                
            end_time = time.time()
                
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            mlflow.log_metric(key="train_loss", value=train_loss, step=epoch)
            mlflow.log_metric(key="train_acc", value=train_acc, step=epoch)
            mlflow.log_metric(key="val_loss", value=valid_loss, step=epoch)
            mlflow.log_metric(key="val_acc", value=valid_acc, step=epoch)
                
            
            #early stopping
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                mlflow.pytorch.log_model(model, "model")
                
            print(f'Epoch: {epoch+1} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
            print(f'\tVal.  Loss: {valid_loss:.3f} | Val Acc: {valid_acc*100:.2f}%')
        mlflow.log_metric(key="min_val_loss", value=min(val_losses), step=epoch)    
        mlflow.end_run()
        

===== CNN with 10-epochs-50-n_filters =====
Epoch: 1 | Time: 0m 1s
	Train Loss: 0.778 | Train Acc: 49.49%
	Val.  Loss: 0.660 | Val Acc: 58.71%




Epoch: 2 | Time: 0m 1s
	Train Loss: 0.692 | Train Acc: 59.35%
	Val.  Loss: 0.624 | Val Acc: 68.82%
Epoch: 3 | Time: 0m 1s
	Train Loss: 0.652 | Train Acc: 62.48%
	Val.  Loss: 0.604 | Val Acc: 73.27%
Epoch: 4 | Time: 0m 1s
	Train Loss: 0.624 | Train Acc: 64.60%
	Val.  Loss: 0.582 | Val Acc: 73.56%
Epoch: 5 | Time: 0m 1s
	Train Loss: 0.564 | Train Acc: 72.28%
	Val.  Loss: 0.553 | Val Acc: 76.68%
Epoch: 6 | Time: 0m 1s
	Train Loss: 0.539 | Train Acc: 73.73%
	Val.  Loss: 0.524 | Val Acc: 77.99%
Epoch: 7 | Time: 0m 1s
	Train Loss: 0.508 | Train Acc: 76.61%
	Val.  Loss: 0.491 | Val Acc: 78.92%
Epoch: 8 | Time: 0m 1s
	Train Loss: 0.470 | Train Acc: 79.22%
	Val.  Loss: 0.479 | Val Acc: 77.36%
Epoch: 9 | Time: 0m 0s
	Train Loss: 0.445 | Train Acc: 80.08%
	Val.  Loss: 0.437 | Val Acc: 82.13%
Epoch: 10 | Time: 0m 1s
	Train Loss: 0.411 | Train Acc: 83.17%
	Val.  Loss: 0.416 | Val Acc: 81.14%
===== CNN with 10-epochs-100-n_filters =====
Epoch: 1 | Time: 0m 1s
	Train Loss: 0.795 | Train Acc: 52.30%
	