In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
file_path = '/content/unfdata_modified1 (1).csv'
df = pd.read_csv(file_path)

# Data Preprocessing
# Keep only the two columns used for classification. Rows with a missing value are
# dropped and the column dtypes are pinned, because the tokenizer only accepts str
# inputs and the labels are later turned into integer class ids.
df = df[['utterance', 'label']].dropna().astype({'utterance': str, 'label': int})

# Train-Test Split (80% train / 20% validation, fixed seed for repeatability)
train_texts, val_texts, train_labels, val_labels = train_test_split(df['utterance'], df['label'], test_size=0.2, random_state=42)

# Load Pre-trained Tokenizer and Model
# Note: Using gpt2 here, but OpenAI's GPT-4 would require API access via the openai library
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse EOS as the padding token when the tokenizer has none. The previous code first
# registered a new '[PAD]' token (growing the vocabulary without resizing the model's
# embeddings) and then immediately overrode it with EOS, leaving a stray unused token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and tell it which token id is padding
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)  # Binary classification
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenize Data with padding (sequences are truncated to at most 128 tokens)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Dataset wrapper that turns tokenizer output and labels into per-sample tensors
class ASDClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of encoded fields and a label sequence.

    `encodings` is any object with `.items()` yielding (field name, indexable
    rows); `labels` is an indexable sequence whose length defines the dataset
    size.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label sequence determines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoded field for the requested row
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels (as a plain list) in a dataset object
train_dataset, val_dataset = (
    ASDClassificationDataset(encodings, labels.tolist())
    for encodings, labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score predictions against reference labels.

    `pred` must expose `label_ids` and `predictions`; the predicted class is
    the argmax over the last axis of `predictions`. Precision, recall and F1
    are computed with ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Hyper-parameters for the fine-tuning run; validation runs once per epoch
training_args = TrainingArguments(
    report_to="none",  # no external experiment tracking (e.g. W&B)
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # named 'evaluation_strategy' in older releases
)

# The Trainer evaluates on val_dataset and reports compute_metrics' output
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune, then run a final evaluation pass and show its metrics
trainer.train()
results = trainer.evaluate()
print(results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1351,0.100476,0.984117,0.978459,0.972198,0.984801
2,0.0786,0.107491,0.983789,0.977867,0.978086,0.977649
3,0.0792,0.104936,0.983953,0.978086,0.978523,0.977649


{'eval_loss': 0.10493627190589905, 'eval_accuracy': 0.9839528410021287, 'eval_f1': 0.9780858676207513, 'eval_precision': 0.978523489932886, 'eval_recall': 0.9776486365668305, 'eval_runtime': 14.9976, 'eval_samples_per_second': 407.197, 'eval_steps_per_second': 50.941, 'epoch': 3.0}


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
file_path = '/content/unfdata_modified1 (1).csv'
df = pd.read_csv(file_path)

# Data Preprocessing: keep only the relevant columns. Rows with a missing value are
# dropped and the column dtypes are pinned, because the tokenizer only accepts str
# inputs and the labels are later turned into integer class ids.
df = df[['utterance', 'label']].dropna().astype({'utterance': str, 'label': int})

# Split the dataset into training (70%), validation (15%), and test (15%) sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['utterance'], df['label'], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Load Pre-trained Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse EOS as the padding token when the tokenizer has none. The previous code first
# registered a new '[PAD]' token (growing the vocabulary without resizing the model's
# embeddings) and then immediately overrode it with EOS, leaving a stray unused token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and tell it which token id is padding
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper shared by all splits
def encode_texts(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    The texts are materialised into a list, truncated to at most 128 tokens,
    padded, and returned as PyTorch tensors.

    Returns the pair (input_ids, attention_mask) taken from the tokenizer output.
    """
    batch = tokenizer(
        list(texts),
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    ids, mask = batch['input_ids'], batch['attention_mask']
    return ids, mask

# Encode the three splits in train / validation / test order
(train_inputs, train_masks), (val_inputs, val_masks), (test_inputs, test_masks) = (
    encode_texts(split) for split in (train_texts, val_texts, test_texts)
)

# Replace each label Series with a tensor built from its underlying values
train_labels, val_labels, test_labels = (
    torch.tensor(series.values) for series in (train_labels, val_labels, test_labels)
)

# Dataset wrapper around already-encoded inputs, masks and labels
class ASDClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel, indexable inputs, masks and labels.

    Indexing returns a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each holding the element at that index of the matching container.
    """

    def __init__(self, inputs, masks, labels):
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        # The labels container determines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(input_ids=self.inputs[idx], attention_mask=self.masks[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Build one dataset per split from its (inputs, masks, labels) triple
train_dataset, val_dataset, test_dataset = (
    ASDClassificationDataset(ids, mask, target)
    for ids, mask, target in (
        (train_inputs, train_masks, train_labels),
        (val_inputs, val_masks, val_labels),
        (test_inputs, test_masks, test_labels),
    )
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score predictions against reference labels.

    `pred` must expose `label_ids` and `predictions`; the predicted class is
    the argmax over the last axis of `predictions`. Precision, recall and F1
    are computed with ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none"  # Disable W&B logging
)

# Initialize Trainer; val_dataset is the default evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate on the Validation Set (the eval_dataset given to the Trainer)
val_results = trainer.evaluate()
print("Validation Results:", val_results)

# Evaluate on the Test Set.
# Trainer.evaluate() has no `test_dataset` parameter (passing it raises a TypeError);
# a different evaluation set is supplied through `eval_dataset`.
# metric_key_prefix="test" makes the returned keys read test_* instead of eval_*.
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Test Results:", test_results)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.097,0.09368,0.984716,0.978839,0.977067,0.980618


KeyboardInterrupt: 

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
file_path = '/content/DL_data.csv'
df = pd.read_csv(file_path)

# Data Preprocessing: keep only the relevant columns. Rows with a missing value are
# dropped and the column dtypes are pinned: the tokenizer rejects non-string entries
# (e.g. NaN read from empty CSV cells) with
# "TextEncodeInput must be Union[TextInputSequence, ...]", and the labels are later
# turned into integer class ids.
df = df[['utterance', 'label']].dropna().astype({'utterance': str, 'label': int})

# Split the dataset into training (70%), validation (15%), and test (15%) sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['utterance'], df['label'], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Load Pre-trained Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse EOS as the padding token when the tokenizer has none. The previous code first
# registered a new '[PAD]' token (growing the vocabulary without resizing the model's
# embeddings) and then immediately overrode it with EOS, leaving a stray unused token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and tell it which token id is padding
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper shared by all splits
def encode_texts(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    The texts are materialised into a list, truncated to at most 128 tokens,
    padded, and returned as PyTorch tensors.

    Returns the pair (input_ids, attention_mask) taken from the tokenizer output.
    """
    batch = tokenizer(
        list(texts),
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    ids, mask = batch['input_ids'], batch['attention_mask']
    return ids, mask

# Encode the three splits in train / validation / test order
(train_inputs, train_masks), (val_inputs, val_masks), (test_inputs, test_masks) = (
    encode_texts(split) for split in (train_texts, val_texts, test_texts)
)

# Replace each label Series with a tensor built from its underlying values
train_labels, val_labels, test_labels = (
    torch.tensor(series.values) for series in (train_labels, val_labels, test_labels)
)

# Dataset wrapper around already-encoded inputs, masks and labels
class ASDClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel, indexable inputs, masks and labels.

    Indexing returns a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each holding the element at that index of the matching container.
    """

    def __init__(self, inputs, masks, labels):
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        # The labels container determines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(input_ids=self.inputs[idx], attention_mask=self.masks[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Build one dataset per split from its (inputs, masks, labels) triple
train_dataset, val_dataset, test_dataset = (
    ASDClassificationDataset(ids, mask, target)
    for ids, mask, target in (
        (train_inputs, train_masks, train_labels),
        (val_inputs, val_masks, val_labels),
        (test_inputs, test_masks, test_labels),
    )
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score predictions against reference labels.

    `pred` must expose `label_ids` and `predictions`; the predicted class is
    the argmax over the last axis of `predictions`. Precision, recall and F1
    are computed with ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"  # Disable W&B logging
)

# Initialize Trainer; val_dataset is the default evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate on the Validation Set (the eval_dataset given to the Trainer)
val_results = trainer.evaluate()
print("Validation Results:", val_results)

# Evaluate on the Test Set.
# Trainer.evaluate() has no `test_dataset` parameter (passing it raises a TypeError);
# a different evaluation set is supplied through `eval_dataset`.
# metric_key_prefix="test" makes the returned keys read test_* instead of eval_*.
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Test Results:", test_results)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
file_path = '/content/findata.csv'
df = pd.read_csv(file_path)

# Data Preprocessing: keep only the relevant columns. Rows with a missing value are
# dropped and the column dtypes are pinned, because the tokenizer only accepts str
# inputs and the labels are later turned into integer class ids.
df = df[['utterance', 'label']].dropna().astype({'utterance': str, 'label': int})

# Split the dataset into training (70%), validation (15%), and test (15%) sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['utterance'], df['label'], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Load Pre-trained Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse EOS as the padding token when the tokenizer has none. The previous code first
# registered a new '[PAD]' token (growing the vocabulary without resizing the model's
# embeddings) and then immediately overrode it with EOS, leaving a stray unused token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and tell it which token id is padding
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper shared by all splits
def encode_texts(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    The texts are materialised into a list, truncated to at most 128 tokens,
    padded, and returned as PyTorch tensors.

    Returns the pair (input_ids, attention_mask) taken from the tokenizer output.
    """
    batch = tokenizer(
        list(texts),
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    ids, mask = batch['input_ids'], batch['attention_mask']
    return ids, mask

# Encode the three splits in train / validation / test order
(train_inputs, train_masks), (val_inputs, val_masks), (test_inputs, test_masks) = (
    encode_texts(split) for split in (train_texts, val_texts, test_texts)
)

# Replace each label Series with a tensor built from its underlying values
train_labels, val_labels, test_labels = (
    torch.tensor(series.values) for series in (train_labels, val_labels, test_labels)
)

# Dataset wrapper around already-encoded inputs, masks and labels
class ASDClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel, indexable inputs, masks and labels.

    Indexing returns a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each holding the element at that index of the matching container.
    """

    def __init__(self, inputs, masks, labels):
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        # The labels container determines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(input_ids=self.inputs[idx], attention_mask=self.masks[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Build one dataset per split from its (inputs, masks, labels) triple
train_dataset, val_dataset, test_dataset = (
    ASDClassificationDataset(ids, mask, target)
    for ids, mask, target in (
        (train_inputs, train_masks, train_labels),
        (val_inputs, val_masks, val_labels),
        (test_inputs, test_masks, test_labels),
    )
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score predictions against reference labels.

    `pred` must expose `label_ids` and `predictions`; the predicted class is
    the argmax over the last axis of `predictions`. Precision, recall and F1
    are computed with ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none"  # Disable W&B logging
)

# Initialize Trainer; val_dataset is the default evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the Model
trainer.train()

# Evaluate on the Validation Set (the eval_dataset given to the Trainer)
val_results = trainer.evaluate()
print("Validation Results:", val_results)

# Evaluate on the Test Set.
# Trainer.evaluate() has no `test_dataset` parameter; passing it raises
# "TypeError: Trainer.evaluate() got an unexpected keyword argument 'test_dataset'".
# A different evaluation set is supplied through `eval_dataset`.
# metric_key_prefix="test" makes the returned keys read test_* instead of eval_*.
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Test Results:", test_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4277,0.420561,0.8168,0.747059,0.738086,0.756252
2,0.38,0.420218,0.820138,0.749778,0.746313,0.753275


Validation Results: {'eval_loss': 0.42021775245666504, 'eval_accuracy': 0.8201377547397571, 'eval_f1': 0.7497777338733577, 'eval_precision': 0.7463126843657817, 'eval_recall': 0.7532751091703057, 'eval_runtime': 25.7217, 'eval_samples_per_second': 547.514, 'eval_steps_per_second': 68.464, 'epoch': 2.0}


TypeError: Trainer.evaluate() got an unexpected keyword argument 'test_dataset'

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

file_path = '/content/findata.csv'
df = pd.read_csv(file_path)

# Keep only the relevant columns. Rows with a missing value are dropped and the
# column dtypes are pinned, because the tokenizer only accepts str inputs and the
# labels are later turned into integer class ids.
df = df[['utterance', 'label']].dropna().astype({'utterance': str, 'label': int})

# Split dataset into training (70%), validation (15%), and test (15%) sets
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['utterance'], df['label'], test_size=0.3, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

# Pre-trained Tokenizer and Model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse EOS as the padding token when the tokenizer has none. The previous code first
# registered a new '[PAD]' token and then immediately overrode it with EOS, so the
# extra vocabulary entry (and the embedding resize it required) served no purpose.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model and set the padding token. No token is added to the vocabulary,
# so the embedding matrix does not need to be resized.
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

# Tokenization helper shared by all splits
def encode_texts(texts):
    """Tokenize `texts` with the module-level `tokenizer`.

    The texts are materialised into a list, truncated to at most 128 tokens,
    padded, and returned as PyTorch tensors.

    Returns the pair (input_ids, attention_mask) taken from the tokenizer output.
    """
    batch = tokenizer(
        list(texts),
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    ids, mask = batch['input_ids'], batch['attention_mask']
    return ids, mask

# Encode the three splits in train / validation / test order
(train_inputs, train_masks), (val_inputs, val_masks), (test_inputs, test_masks) = (
    encode_texts(split) for split in (train_texts, val_texts, test_texts)
)

# Replace each label Series with a tensor built from its underlying values
train_labels, val_labels, test_labels = (
    torch.tensor(series.values) for series in (train_labels, val_labels, test_labels)
)

# Dataset wrapper around already-encoded inputs, masks and labels
class ASDClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset over parallel, indexable inputs, masks and labels.

    Indexing returns a dict with the keys 'input_ids', 'attention_mask' and
    'labels', each holding the element at that index of the matching container.
    """

    def __init__(self, inputs, masks, labels):
        self.inputs, self.masks, self.labels = inputs, masks, labels

    def __len__(self):
        # The labels container determines how many samples exist
        return len(self.labels)

    def __getitem__(self, idx):
        sample = dict(input_ids=self.inputs[idx], attention_mask=self.masks[idx])
        sample['labels'] = self.labels[idx]
        return sample


# Build one dataset per split from its (inputs, masks, labels) triple
train_dataset, val_dataset, test_dataset = (
    ASDClassificationDataset(ids, mask, target)
    for ids, mask, target in (
        (train_inputs, train_masks, train_labels),
        (val_inputs, val_masks, val_labels),
        (test_inputs, test_masks, test_labels),
    )
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score predictions against reference labels.

    `pred` must expose `label_ids` and `predictions`; the predicted class is
    the argmax over the last axis of `predictions`. Precision, recall and F1
    are computed with ``average='binary'``.

    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to="none"  # no external experiment tracking
)

# Initialize Trainer; val_dataset is the default evaluation set
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

# Validation metrics (computed on the eval_dataset given to the Trainer)
val_results = trainer.evaluate()
print("Validation Results:", val_results)

# Test metrics.
# Trainer.evaluate() has no `test_dataset` parameter (passing it raises a TypeError);
# a different evaluation set is supplied through `eval_dataset`.
# metric_key_prefix="test" makes the returned keys read test_* instead of eval_*.
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print("Test Results:", test_results)

# Function to predict the label of a single sentence
def predict_single_sentence(sentence):
    """Classify one sentence with the module-level `model` and `tokenizer`.

    The sentence is tokenized (truncated to at most 128 tokens), moved to the
    model's device, and run through the model without gradient tracking. The
    predicted class is the argmax over the last axis of the logits.

    Side effect: the model is switched to evaluation mode.

    Returns:
        "Label 1" if the predicted class is 1, otherwise "Label 0".
    """
    # Make sure dropout is disabled: if this is called while the model is still in
    # training mode, predictions would otherwise vary between calls.
    model.eval()

    # Tokenize and encode the sentence
    inputs = tokenizer(sentence, truncation=True, padding=True, max_length=128, return_tensors="pt")
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Inference only, so no gradients are needed
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = logits.argmax(-1).item()

    return "Label 1" if predicted_class == 1 else "Label 0"

# Try the classifier on one hand-written utterance
example_sentence = "shes not fit"
predicted_label = predict_single_sentence(example_sentence)
print(f"The predicted label for the example sentence is: {predicted_label}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4316,0.424399,0.815593,0.749349,0.729288,0.770544
