In [1]:
import os
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Directory containing your CSV files
data_dir = "/mnt/c/Users/user/DD_Pipeline/Training_SMILES"

# Ensure the log directory exists
log_dir = './training_logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the eos_token as padding token

# Load the model
model = GPT2LMHeadModel.from_pretrained("gpt2")

def load_and_tokenize_chunk(file_path):
    dataset = load_dataset('csv', data_files=file_path)

    def tokenize_function(examples):
        tokenized = tokenizer(examples['SMILES'], truncation=True, padding='max_length', max_length=300)
        tokenized['labels'] = tokenized['input_ids'].copy()  # Set labels for the LM
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.15)
    return split_dataset

def log_losses(log_file_path, train_loss, val_loss=None):
    with open(log_file_path, 'a') as log_file:
        log_file.write(f"Training loss: {train_loss}\n")
        if val_loss:
            log_file.write(f"Validation loss: {val_loss}\n")

csv_files = sorted([os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')])

for i, file_path in enumerate(csv_files):
    print(f"Processing file: {file_path}")

    split_dataset = load_and_tokenize_chunk(file_path)

    training_args = TrainingArguments(
        output_dir="./smiles_gpt2",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        save_total_limit=2,
        resume_from_checkpoint=True if i > 0 else None,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_dataset['train'],
        eval_dataset=split_dataset['test'],
    )

    train_output = trainer.train(resume_from_checkpoint=True if i > 0 else None)
    checkpoint_dir = f'./smiles_gpt2/checkpoint-{i+1}'
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

    train_loss = train_output.training_loss
    log_file_path = f'training_logs/chunk_{i+1}_training_log.txt'
    log_losses(log_file_path, train_loss)

    print(f"Training completed on file {file_path}. Checkpoint saved at {checkpoint_dir}.")
    print("Waiting for 5 minutes before training on the next chunk...")
    time.sleep(300)  # 5 minutes

print("Training completed on all files!")

  from .autonotebook import tqdm as notebook_tqdm
2024-08-30 12:52:14.726685: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-30 12:52:16.921965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-30 12:52:17.716347: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-30 12:52:17.931484: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-30 12:52:19.4

Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv




Epoch,Training Loss,Validation Loss
1,0.1021,0.092291
2,0.0935,0.081216
3,0.0867,0.077186


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv. Checkpoint saved at ./smiles_gpt2/checkpoint-1.
Waiting for 5 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk10.csv


Generating train split: 62616 examples [00:00, 457687.22 examples/s]
Map: 100%|██████████| 62616/62616 [00:05<00:00, 11737.23 examples/s]
There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk10.csv. Checkpoint saved at ./smiles_gpt2/checkpoint-2.
Waiting for 5 minutes before training on the next chunk...


KeyboardInterrupt: 

In [1]:
import os
import time
import warnings
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Suppress all warnings
warnings.filterwarnings('ignore')

def get_latest_checkpoint(base_dir):
    """ Retrieve the most recent checkpoint from the base directory. """
    try:
        checkpoints = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if d.startswith('checkpoint-')]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=os.path.getmtime)
            print(f"Latest checkpoint found: {latest_checkpoint}")
            return latest_checkpoint
        else:
            print("No checkpoints found, will start from base model.")
            return None
    except Exception as e:
        print(f"Error finding latest checkpoint: {e}")
        return None

# Directory containing your CSV files
data_dir = "/mnt/c/Users/user/DD_Pipeline/Training_SMILES"
# Base directory where checkpoints are saved
base_checkpoint_dir = "/mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2"

# Ensure the log directory exists
log_dir = './training_logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the eos_token as padding token

def load_and_tokenize_chunk(file_path):
    try:
        dataset = load_dataset('csv', data_files=file_path)
        def tokenize_function(examples):
            tokenized = tokenizer(examples['SMILES'], truncation=True, padding='max_length', max_length=500)
            tokenized['labels'] = tokenized['input_ids'].copy()
            return tokenized
        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.15)
        return split_dataset
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

def log_losses(log_file_path, train_loss, val_loss=None):
    with open(log_file_path, 'a') as log_file:
        log_file.write(f"Training loss: {train_loss}\n")
        if val_loss:
            log_file.write(f"Validation loss: {val_loss}\n")

# Get all CSV files and sort them in the correct numerical order of chunks
try:
    csv_files = sorted([os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')],
                       key=lambda x: int(x.split('chunk')[-1].split('.')[0]))
except Exception as e:
    print(f"Error sorting CSV files: {e}")
    csv_files = []

# Start training from chunk1.csv
for i, file_path in enumerate(csv_files, start=1):
    print(f"Processing file: {file_path}")
    split_dataset = load_and_tokenize_chunk(file_path)
    if split_dataset is None:
        continue

    # Dynamically load the latest checkpoint for each new training session
    latest_checkpoint_path = get_latest_checkpoint(base_checkpoint_dir)
    if latest_checkpoint_path:
        model = GPT2LMHeadModel.from_pretrained(latest_checkpoint_path)
    else:
        model = GPT2LMHeadModel.from_pretrained("gpt2")

    training_args = TrainingArguments(
        output_dir=base_checkpoint_dir,
        eval_strategy="epoch",
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        save_total_limit=2,
        resume_from_checkpoint=latest_checkpoint_path,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_dataset['train'],
        eval_dataset=split_dataset['test'],
    )

    try:
        train_output = trainer.train()
        new_checkpoint_dir = os.path.join(base_checkpoint_dir, f"checkpoint-{i}")
        model.save_pretrained(new_checkpoint_dir)
        train_loss = train_output.training_loss
        log_file_path = os.path.join(log_dir, f"chunk_{i}_training_log.txt")
        log_losses(log_file_path, train_loss)
        print(f"Training completed on file {file_path}. Checkpoint saved at {new_checkpoint_dir}.")
    except Exception as e:
        print(f"Training failed for {file_path} due to {e}")
    finally:
        print("Waiting for 30 minutes before training on the next chunk...")
        time.sleep(1800)  # 10 minutes pause

print("Training completed on all files!")

  from .autonotebook import tqdm as notebook_tqdm
2024-09-07 16:56:19.503825: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-07 16:56:22.078055: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-07 16:56:22.921311: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-07 16:56:23.143921: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-07 16:56:24.8

Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv
Error finding latest checkpoint: [Errno 2] No such file or directory: '/mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2'


Epoch,Training Loss,Validation Loss
1,0.0583,0.056311
2,0.0564,0.050634


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-1.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk2.csv


Map: 100%|██████████| 82334/82334 [00:08<00:00, 9986.70 examples/s] 


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-1


Epoch,Training Loss,Validation Loss
1,0.0501,0.046646
2,0.0475,0.043779


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk2.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-2.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk3.csv


Map: 100%|██████████| 81814/81814 [00:08<00:00, 10176.26 examples/s]


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-2


Epoch,Training Loss,Validation Loss
1,0.0465,0.043369
2,0.0451,0.041348


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk3.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-3.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk4.csv


Map: 100%|██████████| 81600/81600 [00:08<00:00, 10053.19 examples/s]


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-3


Epoch,Training Loss,Validation Loss
1,0.0432,0.040992
2,0.0444,0.039241


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk4.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-4.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk5.csv


Map: 100%|██████████| 76002/76002 [00:07<00:00, 10097.37 examples/s]


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-4


Epoch,Training Loss,Validation Loss
1,0.047,0.043046
2,0.043,0.041002


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk5.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-5.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk6.csv


Map: 100%|██████████| 68798/68798 [00:06<00:00, 9971.07 examples/s] 


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-5


Epoch,Training Loss,Validation Loss
1,0.0452,0.044249
2,0.0422,0.041948


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk6.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-6.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk7.csv


Map: 100%|██████████| 73072/73072 [00:07<00:00, 10071.21 examples/s]


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-6


Epoch,Training Loss,Validation Loss
1,0.0418,0.041603
2,0.0406,0.039574


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk7.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-7.
Waiting for 30 minutes before training on the next chunk...
Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk8.csv


Map: 100%|██████████| 65903/65903 [00:06<00:00, 9852.18 examples/s] 


Latest checkpoint found: /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-7


In [1]:
import os
import time
import warnings
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Suppress all warnings
warnings.filterwarnings('ignore')

def get_latest_checkpoint(base_dir):
    """ Retrieve the most recent checkpoint from the base directory. """
    try:
        checkpoints = [os.path.join(base_dir, d) for d in os.listdir(base_dir) if d.startswith('checkpoint-')]
        if checkpoints:
            latest_checkpoint = max(checkpoints, key=os.path.getmtime)
            print(f"Latest checkpoint found: {latest_checkpoint}")
            return latest_checkpoint
        else:
            print("No checkpoints found, will start from base model.")
            return None
    except Exception as e:
        print(f"Error finding latest checkpoint: {e}")
        return None

# Directory containing your CSV files
data_dir = "/mnt/c/Users/user/DD_Pipeline/Training_SMILES"
# Base directory where checkpoints are saved
base_checkpoint_dir = "/mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2"

# Ensure the log directory exists
log_dir = './training_logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the eos_token as padding token

def load_and_tokenize_chunk(file_path):
    try:
        dataset = load_dataset('csv', data_files=file_path)
        def tokenize_function(examples):
            tokenized = tokenizer(examples['SMILES'], truncation=True, padding='max_length', max_length=500)
            tokenized['labels'] = tokenized['input_ids'].copy()
            return tokenized
        tokenized_dataset = dataset.map(tokenize_function, batched=True)
        split_dataset = tokenized_dataset['train'].train_test_split(test_size=0.15)
        return split_dataset
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

def log_losses(log_file_path, train_loss, val_loss=None):
    with open(log_file_path, 'a') as log_file:
        log_file.write(f"Training loss: {train_loss}\n")
        if val_loss:
            log_file.write(f"Validation loss: {val_loss}\n")

# Get all CSV files and sort them in the correct numerical order of chunks
try:
    csv_files = sorted([os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')],
                       key=lambda x: int(x.split('chunk')[-1].split('.')[0]))
except Exception as e:
    print(f"Error sorting CSV files: {e}")
    csv_files = []

# Start training from chunk1.csv
for i, file_path in enumerate(csv_files, start=1):
    print(f"Processing file: {file_path}")
    split_dataset = load_and_tokenize_chunk(file_path)
    if split_dataset is None:
        continue

    # Dynamically load the latest checkpoint for each new training session
    latest_checkpoint_path = get_latest_checkpoint(base_checkpoint_dir)
    if latest_checkpoint_path:
        model = GPT2LMHeadModel.from_pretrained(latest_checkpoint_path)
    else:
        model = GPT2LMHeadModel.from_pretrained("gpt2")

    training_args = TrainingArguments(
        output_dir=base_checkpoint_dir,
        eval_strategy="epoch",
        learning_rate=5e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        save_total_limit=2,
        resume_from_checkpoint=latest_checkpoint_path,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_dataset['train'],
        eval_dataset=split_dataset['test'],
    )

    try:
        train_output = trainer.train()
        new_checkpoint_dir = os.path.join(base_checkpoint_dir, f"checkpoint-{i}")
        model.save_pretrained(new_checkpoint_dir)
        train_loss = train_output.training_loss
        log_file_path = os.path.join(log_dir, f"chunk_{i}_training_log.txt")
        log_losses(log_file_path, train_loss)
        print(f"Training completed on file {file_path}. Checkpoint saved at {new_checkpoint_dir}.")
    except Exception as e:
        print(f"Training failed for {file_path} due to {e}")
    finally:
        print("Waiting for 30 minutes before training on the next chunk...")
        time.sleep(1800)  # 10 minutes pause

print("Training completed on all files!")

  from .autonotebook import tqdm as notebook_tqdm
2024-09-10 16:47:18.936512: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-10 16:47:19.916446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-10 16:47:20.266810: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-10 16:47:20.363238: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-10 16:47:21.0

Processing file: /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv
No checkpoints found, will start from base model.


Epoch,Training Loss,Validation Loss
1,0.0627,0.055595
2,0.0516,0.050487


Training completed on file /mnt/c/Users/user/DD_Pipeline/Training_SMILES/chunk1.csv. Checkpoint saved at /mnt/c/Users/user/DD_Pipeline/model_training/smiles_gpt2/checkpoint-1.
Waiting for 30 minutes before training on the next chunk...
