In [None]:
!pip install sentencepiece
!pip install transformers[torch]



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer

In [None]:
from google.colab import files
uploaded = files.upload()

Saving prompts_test.csv to prompts_test (1).csv
Saving prompts_train.csv to prompts_train (1).csv
Saving sample_submission.csv to sample_submission (1).csv
Saving summaries_test.csv to summaries_test (1).csv
Saving summaries_train.csv to summaries_train (1).csv


In [None]:
# load the required files
train_prompts = pd.read_csv('prompts_train.csv')
train_summaries = pd.read_csv('summaries_train.csv')

# merging the two dataframes to get every data for each text
df = pd.merge(train_prompts, train_summaries, on="prompt_id")

# cleaning and removing unnecessary strings in prompt text
new_prompt_text = []

df

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.594710
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886
...,...,...,...,...,...,...,...,...
7160,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff37545b2805,"In paragraph two, they would use pickle meat a...",1.520355,-0.292990
7161,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff4ed38ef099,"in the first paragraph it says ""either can it...",-1.204574,-1.169784
7162,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff53b94f7ce0,They would have piles of filthy meat on the fl...,0.328739,-1.053294
7163,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",ff7c7e70df07,They used all sorts of chemical concoctions to...,0.205683,0.380538


The code here is to create a dataset class for our raw data.

#Classes

In [1]:
class DataSet(Dataset):

    def __init__(self, text, targets, tokenizer):
        self.text = text
        self.targets = targets
        self.tokenizer = tokenizer
        self.sep_token = self.tokenizer.sep_token if self.tokenizer.sep_token else '[SEP]'

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]
        target = self.targets[idx]

        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=512)

        inputs = {
            'input_ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.float)
        }

        return inputs




NameError: name 'Dataset' is not defined

In [None]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [None]:
class Model(nn.Module):
    def __init__(self, model_name, num_layers_to_freeze):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)

        for i in range(num_layers_to_freeze):
            for param in self.model.encoder.layer[i].parameters():
                param.requires_grad = False
        self.pooler = MeanPooling()
        self.dropout = nn.Dropout(p = 0.2)
        self.reduce_dim = nn.Linear(self.model.config.hidden_size, 64)
        self.linear = nn.Linear(64, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids, attention_mask, return_dict=True)
        output = self.pooler(outputs['last_hidden_state'], attention_mask)
        outputs = self.dropout(output)
        outputs = self.reduce_dim(output)
        logits = self.linear(outputs)
        return logits


In [None]:
# define the metrics
def MCRMSE(targets, outputs):
    colwise_mse = torch.mean((targets - outputs) ** 2, axis=0)
    return torch.mean(torch.sqrt(colwise_mse))

In [None]:
# Train val split

from sklearn.model_selection import train_test_split

text_train, text_val, y_train, y_val = train_test_split(
  df['text'], df[['content', 'wording']],
  test_size=0.2,
  random_state=42
)
text_train = text_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
text_val = text_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)


text_train

0       The upper class consisted of the royal family,...
1       Egyptian society  was structured like a primid...
2       It happened so fastly because the orig nail ki...
3       As stated in the article the factory would use...
4       The experiment was ended because he thought th...
                              ...                        
5727    Differant than social classes were involved in...
5728    They would "rub it up with soda to take away t...
5729    The factory would mix the spoiled meat into th...
5730    They would mix it between thousands of other m...
5731    Aristotle believes that a tragedy should conta...
Name: text, Length: 5732, dtype: object

In [None]:
text_val

0       A n ideal tragedy should have actions that inc...
1           3 elements of an ideal tragedy, as describ...
2       In order to cover up spoiled meat, they would ...
3       The factory would use various methods to hide ...
4       In the novle "the jungle" by Upton Sinclair he...
                              ...                        
1428    The social stucture was built like a pyramid  ...
1429    the experiment developed so fast because the s...
1430    Three elements of an ideal tragedy is that it ...
1431    Egypt's system of government had a social stru...
1432    The structure of the ancient Egyption govermen...
Name: text, Length: 1433, dtype: object

# Optional: Download train/validation set

In [None]:
# Save to CSV
text_train.to_csv('text_train.csv')
y_train.to_csv('y_train.csv')
text_val.to_csv('text_val.csv')
y_val.to_csv('y_val.csv')

In [None]:
# Load Series
text_train = pd.read_csv('text_train.csv', index_col=0).squeeze()
y_train = pd.read_csv('y_train.csv', index_col=0).squeeze()
text_val = pd.read_csv('text_val.csv', index_col=0).squeeze()
y_val = pd.read_csv('y_val.csv', index_col=0).squeeze()

text_train

0       The upper class consisted of the royal family,...
1       Egyptian society  was structured like a primid...
2       It happened so fastly because the orig nail ki...
3       As stated in the article the factory would use...
4       The experiment was ended because he thought th...
                              ...                        
5727    Differant than social classes were involved in...
5728    They would "rub it up with soda to take away t...
5729    The factory would mix the spoiled meat into th...
5730    They would mix it between thousands of other m...
5731    Aristotle believes that a tragedy should conta...
Name: text, Length: 5732, dtype: object

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

class Trainer:
    def __init__(self, model, training_epochs, model_name, criterion, device, text_train, y_train, text_val, y_val, patience=2, test_size=0.2, batch_size=4):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        train_dataset = DataSet(text_train, y_train.values, tokenizer)
        self.train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_dataset = DataSet(text_val, y_val.values, tokenizer)
        self.val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        self.model = model.to(device)
        self.training_epochs = training_epochs
        self.optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay = 0.01) #test
        self.loss_function = criterion
        self.device = device
        self.patience = patience
        self.best_val_loss = float('inf')
        self.early_stopping_counter = 0

    def train(self):
        self.model.to(self.device)
        for epoch in range(self.training_epochs):
            # Training
            self.model.train()
            train_loss = 0
            pbar = tqdm(self.train_loader, desc=f'Epoch {epoch}')
            for batch in pbar:
                self.optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                target = batch['targets'].to(device)
                outputs = self.model(input_ids, attention_mask)
                loss = self.loss_function(outputs, target)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
                pbar.set_postfix({'Training Loss': train_loss/len(self.train_loader)})

            # Validation
            self.model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in self.val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    target = batch['targets'].to(device)
                    outputs = self.model(input_ids, attention_mask)
                    loss = self.loss_function(outputs, target)
                    val_loss += loss.item()

            val_loss = val_loss/len(self.val_loader)

            print(f'Epoch: {epoch}, Train Loss: {train_loss/len(self.train_loader)}, Val Loss: {val_loss}')

            # Early stopping
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.early_stopping_counter = 0
                torch.save(self.model.state_dict(), 'best_model.pt')
            else:
                self.early_stopping_counter += 1
                if self.early_stopping_counter >= self.patience:
                    print('Early Stopping')
                    break
        self.model.load_state_dict(torch.load('best_model.pt'))
        return self.model

# CV train (prototype)

In [None]:
def cv_train(model_name, text_data, target_data, k=4):
    kf = KFold(n_splits=k, random_state = 42, shuffle = True)
    val_losses = []
    all_state_dicts = []

    for fold, (train_index, val_index) in enumerate(kf.split(text_data, target_data)):
        print(f"Training fold {fold}")

        model = Model(model_name, 12)

        text_train_fold = text_data.iloc[train_index].reset_index(drop=True)
        y_train_fold = target_data.iloc[train_index].reset_index(drop=True)
        text_val_fold = text_data.iloc[val_index].reset_index(drop=True)
        y_val_fold = target_data.iloc[val_index].reset_index(drop=True)

        trainer = Trainer(model, training_epochs=5, model_name= model_name, criterion=MCRMSE, device=device, text_train=text_train_fold, y_train=y_train_fold, text_val=text_val_fold, y_val=y_val_fold)
        trainer.train()

        val_losses.append(trainer.best_val_loss)

        all_state_dicts.append(copy.deepcopy(trainer.model.state_dict()))

    averaged_state_dict = {}
    for key in all_state_dicts[0]:
        averaged_state_dict[key] = torch.stack([state_dict[key] for state_dict in all_state_dicts], dim=0).mean(dim=0)

    model.load_state_dict(averaged_state_dict)

    avg_val_loss = sum(val_losses) / k
    print(f"Average CV Loss: {avg_val_loss}")

    return model


# Training basic model(s)



In [None]:
# Models to train
#"microsoft/deberta-v3-large", "microsoft/deberta-v3-base", "roberta-base", "microsoft/deberta-base", "bert-base-uncased"
# Define your model names here
#"t5-base", "albert-base-v2"
model_names = ["microsoft/deberta-v3-large"]

In [None]:
from tqdm.auto import tqdm
from transformers import AdamW
import torch
from sklearn.model_selection import KFold
import copy


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = MCRMSE
num_epochs = 10

trained_models = []
# Loop through the model names and train each one
#trained_models = []
i = 0

for model_name in model_names:
    # Instantiate and train the model
    i += 1
    # trainer = Trainer(model, num_epochs, model_name, MCRMSE, device, text_train, y_train, text_val, y_val)
    model = cv_train(model_name, df['text'], df[['content', 'wording']])
    torch.save(model.state_dict(), f'model_{i}.pt')
    # files.download(f'model_{i}.pt')
    trained_models.append(model)
    torch.cuda.empty_cache()


Training fold 0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 0:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.5045117975511987, Val Loss: 0.44810311113750295


Epoch 1:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.4147108717976759, Val Loss: 0.48508637292044504


Epoch 2:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 0.3612511715064535, Val Loss: 0.47293350918750676
Early Stopping
[0.44810311113750295]
Training fold 1


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 0:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.5078388832043856, Val Loss: 0.43851880895506057


Epoch 1:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.41931737031388494, Val Loss: 0.45760641027508037


Epoch 2:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 0.363716356600413, Val Loss: 0.4312704344213541


Epoch 3:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 3, Train Loss: 0.30796192325319033, Val Loss: 0.4404627265342112


Epoch 4:   0%|          | 0/1344 [00:00<?, ?it/s]

Epoch: 4, Train Loss: 0.26879469938908834, Val Loss: 0.47018408801938805
Early Stopping
[0.44810311113750295, 0.4312704344213541]
Training fold 2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Epoch 0:   0%|          | 0/1344 [00:00<?, ?it/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
save_path = "/content/drive/My Drive/"  # change "your_folder_name" to your desired folder

for i, model in enumerate(trained_models):
    torch.save(model.state_dict(), f"{save_path}model_{i}.pth")


# Optional: Save pretrained model and tokenizer


In [None]:
# Save the models and the tokenizers
save_path = "/content/drive/My Drive/pretrained_models/"
for model_name in model_names:
    model = AutoModel.from_pretrained(model_name)
    model_path_on_drive = f"{save_path}{model_name.replace('/', '_')}_model"

    model.save_pretrained(model_path_on_drive)

In [None]:
trained_models[4]

# Ensemble - Weighted average


In [None]:
pip install optuna

In [None]:
def generate_predictions(models, model_names, text_val, y_val, device):
    predictions = []
    for model, model_name in zip(models, model_names):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        val_dataset = DataSet(text_val, y_val.values, tokenizer)
        val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

        preds = []
        model.to(device)
        model.eval()
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                output = model(input_ids, attention_mask)
                preds.append(output.cpu())
        predictions.append(torch.cat(preds).numpy())

    return predictions

In [None]:
import optuna

def objective(trial):
    content_weights = [trial.suggest_float(f'content_weight{i}', -0.5, 2) for i in range(len(trained_models))]
    wording_weights = [trial.suggest_float(f'wording_weight{i}', -0.5, 2) for i in range(len(trained_models))]

    pred_content = np.mean([predictions[i][:, 0] * content_weights[i] for i in range(len(trained_models))], axis=0)
    pred_wording = np.mean([predictions[i][:, 1] * wording_weights[i] for i in range(len(trained_models))], axis=0)

    combined_preds = np.vstack((pred_content, pred_wording)).T
    combined_preds = torch.tensor(combined_preds).to(device)

    score = MCRMSE(torch.tensor(y_val.values).to(device), combined_preds)
    return score.item()


In [None]:
predictions = generate_predictions(trained_models, model_names, text_val, y_val, device)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('Value: ', trial.value)
print('Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Optional: Save the ensemble

In [None]:
best_weights_content = [trial.params[f'content_weight{i}'] for i in range(len(trained_models))]
best_weights_wording = [trial.params[f'wording_weight{i}'] for i in range(len(trained_models))]


np.save('best_weights_content.npy', best_weights_content)
np.save('best_weights_wording.npy', best_weights_wording)


#Ensemble - stacking

- Still working, ignore the cell below

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, model_names, meta_model, models_directory):
        self.model_names = model_names
        self.meta_model = meta_model
        self.models_directory = models_directory
        self.kf = KFold(n_splits=4, random_state=42, shuffle=True)

    def fit(self, X, y):
        self.saved_models = []
        oof_train_wording = np.zeros((X.shape[0], len(self.model_names)))
        oof_train_content = np.zeros((X.shape[0], len(self.model_names)))

        for i, model_name in enumerate(self.model_names):
            print(f"Training model: {model_name}")
            model_list = []
            cv_scores = []
            for train_index, val_index in self.kf.split(X,y):
                print(train_index, val_index)
                model = Model(model_name).to(device)
                model_list.append(model)
                # Prepare the data
                text_train, text_val = X['text'].iloc[train_index].reset_index(drop=True), X['text'].iloc[val_index].reset_index(drop=True)
                prompt_train, prompt_val = X['prompt_question'].iloc[train_index].reset_index(drop=True), X['prompt_question'].iloc[val_index].reset_index(drop=True)
                y_train, y_val = y.iloc[train_index].reset_index(drop=True), y.iloc[val_index].reset_index(drop=True)
                # Create DataLoaders
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                train_dataset = DataSet(text_train, prompt_train, y_train.values, tokenizer)
                train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
                val_dataset = DataSet(text_val, prompt_val, y_val.values, tokenizer)
                val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
                optimizer = AdamW(model.parameters(), lr=1e-5)
                criterion = MCRMSE
                num_epochs = 8
                # Train the model
                trained_model = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device)
                model_list[-1] = trained_model
                predictions = trained_model.predict(val_loader)
                oof_train_wording[val_index, i] = predictions[:, 0]
                oof_train_content[val_index, i] = predictions[:, 1]
                cv_scores.append(criterion(predictions, y_val).item())
            self.saved_models.append(model_list)

        oof_train = np.hstack((oof_train_wording, oof_train_content))
        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        final_predictions_wording = []
        final_predictions_content = []

        for i, model_list in enumerate(self.saved_models):
            model_name = self.model_names[i]
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            test_dataset = DataSet(X['text'], X['prompt_question'], np.zeros(len(X)), tokenizer)
            test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
            model_predictions = np.column_stack([model.predict(test_loader).numpy() for model in model_list])
            final_predictions_wording.append(model_predictions[:, 0])
            final_predictions_content.append(model_predictions[:, 1])

        print(final_predictions_wording, final_predictions_content)

        meta_features = np.hstack((np.column_stack(final_predictions_wording), np.column_stack(final_predictions_content)))
        return self.meta_model.predict(meta_features)
