In [2]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
!pip install evaluate
!pip install accelerate -U
import evaluate  # Assuming this is a custom evaluation script
df=pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
df_test=pd.read_csv('/kaggle/input/imdb-dataset/test.csv')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Function to process each row of the DataFrame
def process_data(row):
    text = str(row['review']).strip()  # Remove leading/trailing spaces
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    label = 1 if row['sentiment'] == 'positive' else 0  # Encode label
    encodings['labels'] = label
    return encodings

# Process train and test data
train_data = []
test_data = []

# Assuming df and df_test are already defined DataFrames
for i in range(len(df)):
    train_data.append(process_data(df.iloc[i]))

for i in range(len(df_test)):
    test_data.append(process_data(df_test.iloc[i]))

# Convert processed data to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)  # Assuming binary classification

# Define evaluation metric
metric = evaluate.load("accuracy")  # Assuming this is a custom evaluation function

# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=7,  # Example number of epochs
    per_device_train_batch_size=8,  # Example batch size
    per_device_eval_batch_size=8,  # Example batch size
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    save_total_limit=2,  # Example number of checkpoints to save
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3572,0.344028,0.8742
2,0.209,0.353266,0.88065
3,0.1109,0.57689,0.87935
4,0.0546,0.649204,0.88215
5,0.0269,0.689624,0.8826
6,0.0147,0.819518,0.88335
7,0.0066,0.887125,0.88395


Checkpoint destination directory test_trainer/checkpoint-1875 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=13125, training_loss=0.1114234864007859, metrics={'train_runtime': 2426.0995, 'train_samples_per_second': 86.559, 'train_steps_per_second': 5.41, 'total_flos': 6954538429440000.0, 'train_loss': 0.1114234864007859, 'epoch': 7.0})

In [3]:
trainer.evaluate()



{'eval_loss': 0.34402841329574585,
 'eval_accuracy': 0.8742,
 'eval_runtime': 65.8892,
 'eval_samples_per_second': 303.54,
 'eval_steps_per_second': 18.971,
 'epoch': 7.0}

In [14]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
!pip install evaluate
import evaluate  # Assuming this is a custom evaluation script
df=pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
df_test=pd.read_csv('/kaggle/input/imdb-dataset/test.csv')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Function to process each row of the DataFrame
def process_data(row):
    text = str(row['review']).strip()  # Remove leading/trailing spaces
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    label = 1 if row['sentiment'] == 'positive' else 0  # Encode label
    encodings['labels'] = label
    return encodings

# Process train and test data
train_data = []
test_data = []

# Assuming df and df_test are already defined DataFrames
for i in range(len(df)):
    train_data.append(process_data(df.iloc[i]))

for i in range(len(df_test)):
    test_data.append(process_data(df_test.iloc[i]))

# Convert processed data to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # Assuming binary classification

# Define evaluation metric
metric = evaluate.load("accuracy")  # Assuming this is a custom evaluation function

# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=7,  # Example number of epochs
    per_device_train_batch_size=8,  # Example batch size
    per_device_eval_batch_size=8,  # Example batch size
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    save_total_limit=2,  # Example number of checkpoints to save
    load_best_model_at_end=True,  
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3728,0.424648,0.87835
2,0.2734,0.375863,0.88845
3,0.2073,0.350377,0.89645
4,0.1529,0.432253,0.8969
5,0.1028,0.469919,0.89905
6,0.0693,0.515739,0.9002
7,0.0442,0.531427,0.90095


Checkpoint destination directory test_trainer/checkpoint-1875 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory test_trainer/checkpoint-5625 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=13125, training_loss=0.17468687976655506, metrics={'train_runtime': 4815.1653, 'train_samples_per_second': 43.612, 'train_steps_per_second': 2.726, 'total_flos': 1.38133304064e+16, 'train_loss': 0.17468687976655506, 'epoch': 7.0})

In [15]:
trainer.evaluate()



{'eval_loss': 0.35037747025489807,
 'eval_accuracy': 0.89645,
 'eval_runtime': 119.6847,
 'eval_samples_per_second': 167.106,
 'eval_steps_per_second': 10.444,
 'epoch': 7.0}

In [16]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import numpy as np
!pip install evaluate
!pip install accelerate -U
import evaluate  # Assuming this is a custom evaluation script
df=pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
df_test=pd.read_csv('/kaggle/input/imdb-dataset/test.csv')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Function to process each row of the DataFrame
def process_data(row):
    text = str(row['review']).strip()  # Remove leading/trailing spaces
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    label = 1 if row['sentiment'] == 'positive' else 0  # Encode label
    encodings['labels'] = label
    return encodings

# Process train and test data
train_data = []
test_data = []

# Assuming df and df_test are already defined DataFrames
for i in range(len(df)):
    train_data.append(process_data(df.iloc[i]))

for i in range(len(df_test)):
    test_data.append(process_data(df_test.iloc[i]))

# Convert processed data to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)  # Assuming binary classification

# Define evaluation metric
metric = evaluate.load("accuracy")  # Assuming this is a custom evaluation function

# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=7,  # Example number of epochs
    per_device_train_batch_size=8,  # Example batch size
    per_device_eval_batch_size=8,  # Example batch size
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    save_total_limit=2,  # Example number of checkpoints to save
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.359,0.386807,0.8721
2,0.2092,0.39675,0.87775
3,0.113,0.567308,0.8777
4,0.054,0.667604,0.8782
5,0.0272,0.694936,0.8801
6,0.0146,0.951222,0.8803
7,0.0074,1.011604,0.8828




TrainOutput(global_step=13125, training_loss=0.11206466689336868, metrics={'train_runtime': 2411.6786, 'train_samples_per_second': 87.076, 'train_steps_per_second': 5.442, 'total_flos': 6954538429440000.0, 'train_loss': 0.11206466689336868, 'epoch': 7.0})

In [17]:
trainer.evaluate()



{'eval_loss': 0.38680654764175415,
 'eval_accuracy': 0.8721,
 'eval_runtime': 65.6261,
 'eval_samples_per_second': 304.757,
 'eval_steps_per_second': 19.047,
 'epoch': 7.0}

In [6]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AdamW

from datasets import Dataset
import numpy as np
!pip install evaluate
!pip install accelerate -U
import evaluate  # Assuming this is a custom evaluation script
df=pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
df_test=pd.read_csv('/kaggle/input/imdb-dataset/test.csv')
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
optimizer = AdamW(model.parameters(), lr=0.01)

# Function to process each row of the DataFrame
def process_data(row):
    text = str(row['review']).strip()  # Remove leading/trailing spaces
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    label = 1 if row['sentiment'] == 'positive' else 0  # Encode label
    encodings['labels'] = label
    return encodings

# Process train and test data
train_data = []
test_data = []

# Assuming df and df_test are already defined DataFrames
for i in range(len(df)):
    train_data.append(process_data(df.iloc[i]))

for i in range(len(df_test)):
    test_data.append(process_data(df_test.iloc[i]))

# Convert processed data to DataFrames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)  # Assuming binary classification

# Define evaluation metric
metric = evaluate.load("accuracy")  # Assuming this is a custom evaluation function

# Function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=7,  # Example number of epochs
    per_device_train_batch_size=8,  # Example batch size
    per_device_eval_batch_size=8,  # Example batch size
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,  # Log every 500 steps
    save_total_limit=2,  # Example number of checkpoints to save
    load_best_model_at_end=True,  # Load the best model at the end of training
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

# Start training
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6962,0.696603,0.49675
2,0.6962,0.696603,0.49675
3,0.6963,0.696603,0.49675
4,0.6961,0.696603,0.49675
5,0.696,0.696603,0.49675
6,0.696,0.696603,0.49675
7,0.6958,0.696603,0.49675


Checkpoint destination directory test_trainer/checkpoint-1875 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=13125, training_loss=0.6960745442708334, metrics={'train_runtime': 2160.1705, 'train_samples_per_second': 97.215, 'train_steps_per_second': 6.076, 'total_flos': 6954538429440000.0, 'train_loss': 0.6960745442708334, 'epoch': 7.0})

In [4]:
import torch
from transformers import GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import time
import pandas as pd
!pip install evaluate
!pip install accelerate -U
train_df = pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/imdb-dataset/test.csv')
train_df.head()

model_config = GPT2Config.from_pretrained("gpt2", num_labels=2) 
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_tokenizer.padding_side = "left"
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  
gpt2_model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=model_config) 
gpt2_model.resize_token_embeddings(len(gpt2_tokenizer))

gpt2_model.config.pad_token_id = gpt2_model.config.eos_token_id
import numpy as np
import evaluate

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

train_encodings = gpt2_tokenizer(train_df['review'].tolist(), truncation=True, padding=True)
test_encodings = gpt2_tokenizer(test_df['review'].tolist(), truncation=True, padding=True)
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Convert 'positive' to 1, 'negative' to 0
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # Convert 'positive' to 1, 'negative' to 0
})
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',       
    num_train_epochs=1,              
    per_device_train_batch_size=5,  
    per_device_eval_batch_size=5,    
    logging_dir='./logs',           
    logging_steps=500,               
    evaluation_strategy="epoch",      
    save_steps=500,                  
    overwrite_output_dir=True        
)
from transformers import Trainer

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.26.1
    Uninstalling accelerate-0.26.1:
      Successfully uninstalled accelerate-0.26.1
Successfully installed accelerate-0.27.2


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [5]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Accuracy
1,0.2426,0.222344,0.9271




TrainOutput(global_step=3000, training_loss=0.31857914733886716, metrics={'train_runtime': 5293.6219, 'train_samples_per_second': 5.667, 'train_steps_per_second': 0.567, 'total_flos': 1.567780503552e+16, 'train_loss': 0.31857914733886716, 'epoch': 1.0})

In [6]:
trainer.evaluate()



{'eval_loss': 0.22234448790550232,
 'eval_accuracy': 0.9271,
 'eval_runtime': 1073.0577,
 'eval_samples_per_second': 18.638,
 'eval_steps_per_second': 1.864,
 'epoch': 1.0}