<a href="https://colab.research.google.com/github/Tomertech/BiLSTMDependencyParser/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import torch
import pickle
import os

In [33]:
is_colab=True
pres_of_1=0.104571953

# Training loop
num_epochs = 10
batch_size = 32

# Model
input_dim = 3  # number of input features
hidden_dim = 128  # number of hidden units in LSTM layer
output_dim = 1  # number of output classes
num_layers = 2  # number of LSTM layers

In [4]:
if is_colab:
    from google.colab import drive
    drive.mount('/content/gdrive')
    GDRIVE_DIR = '/content/gdrive/MyDrive/Technion/Cognition/project'
else:
    GDRIVE_DIR = './'

Mounted at /content/gdrive


In [5]:
def open_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_pickle(df, path):
    with open(path, 'wb') as f:
        pickle.dump(df, f)

In [6]:
def data_preprocessing(df):

    df['label'] = df['L1'].apply(lambda x: x == "English").astype(int)
    df['CURRENT_FIX_INTEREST_AREA_ID'] = df['CURRENT_FIX_INTEREST_AREA_ID'].drop(
        df[df['CURRENT_FIX_INTEREST_AREA_ID'] == '.'].index)
    df = df.dropna(subset=['CURRENT_FIX_DURATION', 'CURRENT_FIX_INTEREST_AREA_ID', 'CURRENT_FIX_X'])
    df['CURRENT_FIX_INTEREST_AREA_ID'] = df['CURRENT_FIX_INTEREST_AREA_ID'].astype(int)

    # Normalize the data with z-score normalization
    df['CURRENT_FIX_DURATION'] = (df['CURRENT_FIX_DURATION'] - df['CURRENT_FIX_DURATION'].mean()) / df[
        'CURRENT_FIX_DURATION'].std()
    df['CURRENT_FIX_INTEREST_AREA_ID'] = (df['CURRENT_FIX_INTEREST_AREA_ID'] - df[
        'CURRENT_FIX_INTEREST_AREA_ID'].mean()) / df['CURRENT_FIX_INTEREST_AREA_ID'].std()
    df['CURRENT_FIX_X'] = (df['CURRENT_FIX_X'] - df['CURRENT_FIX_X'].mean()) / df['CURRENT_FIX_X'].std()

    sentences_gpby = df.groupby(['list', 'sentenceid'])
    sentences, ys = [], []

    for _, group in sentences_gpby:
        sentences.append(torch.tensor(group[['CURRENT_FIX_DURATION', 'CURRENT_FIX_INTEREST_AREA_ID', 'CURRENT_FIX_X']].values).float().squeeze(1))
        ys.append(torch.tensor(group['label'].iloc[0]).long())
    return sentences, ys

In [7]:
df = open_pickle(f'{GDRIVE_DIR}/data/features_base.pkl')

In [8]:
data_dir= f'{GDRIVE_DIR}/data/'
filename_s = data_dir+'sentences.pkl'
filename_y = data_dir+'ys.pkl'

# Check if the file exists
if os.path.exists(filename_s):
    # If the file exists, load the data
    with open(filename_s, 'rb') as f:
        sentences = pickle.load(f)
    with open(filename_y, 'rb') as f:
        ys = pickle.load(f)

else:
    # If the file doesn't exist, run your data preprocessing function
    sentences, ys = data_preprocessing(df)
    
    # And then save the data to a file for future use
    with open(filename_s, 'wb') as f:
        pickle.dump(sentences, f)
    with open(filename_y, 'wb') as f:
        pickle.dump(ys, f)

COLAB_NUM_OF_EXAMPLES = 10000
if is_colab:
    sentences, ys = sentences[:COLAB_NUM_OF_EXAMPLES], ys[:COLAB_NUM_OF_EXAMPLES]

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def collate_fn_padd(batch):

    ## get sequence lengths
    X, y = zip(*batch)
    X_lengths = torch.tensor([x.shape[0] for x in X]).to(device)

    ## padd
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True).to(device)
    # y (labels) need to be (N,C) shape, stated here: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#crossentropyloss
    y = torch.stack(y).to(device)
    return X.float(), y.float(), X_lengths.long()

Using device: cuda:0


In [10]:
from torch.utils.data import WeightedRandomSampler

# weights = [pres_of_1,1-pres_of_1]
# class_weights = torch.FloatTensor(weights).to(device)
# criterion = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)

In [11]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# Create a dataset from tensors directly
dataset = list(zip(sentences, torch.tensor(ys).unsqueeze(dim=1)))

# Define the split sizes (e.g., 70% train, 15% valid, 15% test)
train_size = int(0.7 * len(dataset))
valid_size = (len(dataset) - train_size) // 2
test_size = len(dataset) - train_size - valid_size

# Split dataset
train_data, valid_data, test_data = random_split(dataset, [train_size, valid_size, test_size])

# Create dataloaders
train_dataloader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn_padd)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, collate_fn=collate_fn_padd)
test_dataloader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn_padd)

In [None]:
next(iter(train_dataloader))

In [23]:
import torch
import torch.nn as nn

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.fc_1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(p=0.2)
        self.activation = nn.ReLU()

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(device)
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :]  # we are taking the last output of the sequence
        out = self.dropout(out)
        out = self.fc_2(self.activation(self.fc_1(out)))
        out = torch.sigmoid(out)
        return out

Using device: cuda:0


In [37]:
model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim,
                       output_dim=output_dim, num_layers=num_layers).to(device)

criterion = torch.nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
model.parameters

<bound method Module.parameters of LSTMClassifier(
  (lstm): LSTM(3, 128, num_layers=2, batch_first=True)
  (fc_1): Linear(in_features=128, out_features=128, bias=True)
  (fc_2): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (activation): ReLU()
)>

In [35]:
def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0.0
        for i, (inputs, labels, lengths) in enumerate(train_dataloader):
            # Forward pass
            predictions = model(inputs)
            predicted = (predictions > 0.5).float()
            # if predicted.sum(): print(f"iteration: {i}; predictions: {predicted.sum()}, labels: {labels.sum()}")
            # print(f"predictions.shape: {predictions.shape}")
            # print(f"inputs.shape: {inputs.shape}")
            loss = criterion(predictions, labels)
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()  # Accumulate the loss

        print(f"Epoch {epoch + 1}/{num_epochs}, Training Loss: {round(total_loss/(len(train_dataloader)), 3)}")

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels, lengths in val_dataloader:
                predictions = model(inputs)
                loss = criterion(predictions, labels)

                val_loss += loss.item()

                # Adjusting this line to correctly threshold predictions
                predicted = (predictions > 0.5).float()
                # print(predicted)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(val_dataloader)
        val_accuracy = correct / total * 100.0

        print(f"Epoch {epoch + 1}/{num_epochs}, Validation Loss: {round(val_loss, 3)}, Validation Accuracy: {round(val_accuracy, 3)}%")

In [38]:
train_model(model, train_dataloader, valid_dataloader, criterion, optimizer, num_epochs=50)

Epoch 1/50, Training Loss: 0.489
Epoch 1/50, Validation Loss: 0.485, Validation Accuracy: 81.216%
Epoch 2/50, Training Loss: 0.487
Epoch 2/50, Validation Loss: 0.484, Validation Accuracy: 81.216%
Epoch 3/50, Training Loss: 0.485
Epoch 3/50, Validation Loss: 0.485, Validation Accuracy: 81.216%
Epoch 4/50, Training Loss: 0.484
Epoch 4/50, Validation Loss: 0.483, Validation Accuracy: 81.216%
Epoch 5/50, Training Loss: 0.467
Epoch 5/50, Validation Loss: 0.399, Validation Accuracy: 79.389%
Epoch 6/50, Training Loss: 0.359
Epoch 6/50, Validation Loss: 0.345, Validation Accuracy: 84.12%
Epoch 7/50, Training Loss: 0.336
Epoch 7/50, Validation Loss: 0.339, Validation Accuracy: 84.214%
Epoch 8/50, Training Loss: 0.327
Epoch 8/50, Validation Loss: 0.327, Validation Accuracy: 85.127%
Epoch 9/50, Training Loss: 0.321
Epoch 9/50, Validation Loss: 0.326, Validation Accuracy: 85.338%
Epoch 10/50, Training Loss: 0.314
Epoch 10/50, Validation Loss: 0.326, Validation Accuracy: 85.455%
Epoch 11/50, Traini