In [10]:
import pandas as pd
import torch
import pickle
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import concurrent.futures

# Load the dataset, drop rows without review text, and keep a reproducible
# one-sixth sample of the remaining rows.
df = pd.read_csv('dataset.csv').dropna(subset=['review_text'])
df = df.sample(frac=1/6, random_state=42)
df['review_text'] = df['review_text'].astype(str)

# Split the dataset 80/20, stratified on the label column. The split is seeded
# (same seed as the sampling step) so that re-running the notebook yields the
# same train/validation partition; without random_state it changes every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review_text'],
    df['review_score'],
    stratify=df['review_score'],
    test_size=0.2,
    random_state=42,
)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize a batch of texts
def batch_tokenize(texts):
    """Tokenize a batch of texts with the module-level ``tokenizer``.

    Every text is truncated and padded to exactly 128 tokens. A fixed length
    is used on purpose: ``padding=True`` only pads to the longest text of the
    current call, so the chunks merged by ``parallel_tokenize`` could end up
    with different sequence lengths, which breaks batching samples drawn from
    different chunks into one tensor.

    Args:
        texts: List of strings to tokenize.

    Returns:
        The tokenizer's output for the batch (a mapping from field name to
        one entry per input text).
    """
    max_length = 128
    return tokenizer(texts, truncation=True, padding='max_length', max_length=max_length)

# Function to process texts in parallel
def parallel_tokenize(texts, chunk_size=100, tokenize_fn=None):
    """Tokenize ``texts`` chunk by chunk on a thread pool and merge the results.

    Args:
        texts: Sliceable sequence of texts (e.g. a list of strings).
        chunk_size: Maximum number of texts handed to one tokenizer call.
            Must be at least 1.
        tokenize_fn: Optional callable that maps a chunk of texts to a mapping
            of ``key -> list`` holding one entry per text. When omitted, the
            module-level ``batch_tokenize`` is used.

    Returns:
        dict: For every key of the first chunk's result, the entries of all
        chunks concatenated in the original text order. An empty dict when
        there is nothing to tokenize.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    # Split texts into chunks
    chunks = [texts[i:i + chunk_size] for i in range(0, len(texts), chunk_size)]
    if not chunks:
        # No input: return early instead of indexing results[0] on an empty list.
        return {}

    if tokenize_fn is None:
        tokenize_fn = batch_tokenize

    # Process each chunk in parallel; executor.map yields results in chunk order.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(executor.map(tokenize_fn, chunks))

    # Merge the per-chunk outputs key by key.
    combined_results = {key: [] for key in results[0]}
    for result in results:
        for key in result:
            combined_results[key].extend(result[key])

    return combined_results



: 

In [None]:
# Encode both splits with the chunked, thread-pooled tokenizer helper.
# Each pandas Series of texts is unpacked into a plain Python list first.
train_encodings = parallel_tokenize([*train_texts])
val_encodings = parallel_tokenize([*val_texts])




In [None]:
# Convert encodings and labels into a PyTorch dataset
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence and
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample ...
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # ... plus the label as a 64-bit integer tensor under 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels (converted to plain lists) in a dataset.
# NOTE(review): labels are passed through unchanged here; a later cell maps -1
# to 0 and rebuilds both datasets from the corrected labels.
train_dataset = ReviewDataset(encodings=train_encodings, labels=train_labels.tolist())
val_dataset = ReviewDataset(encodings=val_encodings, labels=val_labels.tolist())



In [None]:


# Persist tokenizer encodings on disk
def save_encodings(encodings, file_path):
    """Pickle ``encodings`` into ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, mode='wb') as sink:
        pickle.dump(encodings, sink)

# Write the encodings of both splits to their pickle files
save_encodings(encodings=train_encodings, file_path='train_encodings_random.pkl')
save_encodings(encodings=val_encodings, file_path='val_encodings_random.pkl')

def save_labels(labels, file_path):
    """Pickle ``labels`` into ``file_path``, overwriting any existing file."""
    with open(file_path, 'wb') as sink:
        # Pickler(file).dump(obj) is the documented equivalent of pickle.dump.
        pickle.Pickler(sink).dump(labels)

# Write the labels of both splits (as plain lists) to their pickle files
save_labels(labels=train_labels.tolist(), file_path='train_labels_random.pkl')
save_labels(labels=val_labels.tolist(), file_path='val_labels_random.pkl')

In [None]:
import pandas as pd
import torch
import pickle
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import concurrent.futures

# Read pickled tokenizer encodings back from disk
def load_encodings(file_path):
    """Return the object unpickled from ``file_path``.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    with open(file_path, mode='rb') as source:
        return pickle.load(source)

# Restore the encodings of both splits from their pickle files
train_encodings = load_encodings(file_path='train_encodings_random.pkl')
val_encodings = load_encodings(file_path='val_encodings_random.pkl')

# Read pickled labels back from disk
def load_labels(file_path):
    """Return the labels unpickled from ``file_path``.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    with open(file_path, 'rb') as source:
        # Unpickler(file).load() is the documented equivalent of pickle.load.
        return pickle.Unpickler(source).load()

# Restore the labels of both splits from their pickle files
train_labels = load_labels(file_path='train_labels_random.pkl')
val_labels = load_labels(file_path='val_labels_random.pkl')

In [None]:

class ReviewDataset(torch.utils.data.Dataset):
    """Dataset yielding one dict of tensors per sample.

    Each item holds a tensor for every field in ``encodings`` plus the
    sample's label under the key 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensor for every encoding field at position idx.
        item = dict(
            (key, torch.tensor(values[idx]))
            for key, values in self.encodings.items()
        )
        # The label is stored last, as a torch.long tensor.
        item.update(labels=torch.tensor(self.labels[idx], dtype=torch.long))
        return item
def replace_negative_ones(labels):
    """Return a new list where every label equal to -1 is replaced by 0.

    All other labels are kept as they are, in their original order.
    """
    remapped = []
    for label in labels:
        remapped.append(0 if label == -1 else label)
    return remapped

# For each split: map the -1 labels to 0, then build the dataset from the
# encodings and the corrected labels.
train_labels_corrected = replace_negative_ones(labels=train_labels)
train_dataset = ReviewDataset(encodings=train_encodings, labels=train_labels_corrected)

val_labels_corrected = replace_negative_ones(labels=val_labels)
val_dataset = ReviewDataset(encodings=val_encodings, labels=val_labels_corrected)

In [None]:
import shutil
import os

# Query the usage figures (in bytes) for the root directory.
# Pass a different path to shutil.disk_usage to inspect another location.
usage = shutil.disk_usage("/")
total, used, free = usage
bytes_per_gb = 2**30  # the figures below are bytes divided by 2**30

print("Disk space statistics:")
print(f"Total: {total / bytes_per_gb:.2f} GB")
print(f"Used: {used / bytes_per_gb:.2f} GB")
print(f"Free: {free / bytes_per_gb:.2f} GB")

Disk space statistics:
Total: 1863.00 GB
Used: 473.01 GB
Free: 1389.99 GB


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
from matplotlib import pyplot as plt
# Use the GPU when CUDA is available, otherwise fall back to the CPU
# (torch itself is imported in an earlier cell)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Reload the CSV without the rows that lack review text; further down this
# frame is only used to count the distinct review_score values.
df = pd.read_csv('dataset.csv').dropna(subset=['review_text'])
# Assuming the previous steps for preparing the dataset and model are the same...
def find_last_checkpoint(output_dir):
    """
    Finds the most recently modified checkpoint directory in ``output_dir``.

    Only sub-directories whose name starts with 'checkpoint' (for example
    'checkpoint-xxxx') are considered; plain files with a matching prefix are
    ignored.

    Args:
        output_dir: Directory that holds the checkpoint sub-directories.

    Returns:
        The path (``output_dir`` joined with the directory name) of the
        candidate with the newest modification time, or None when
        ``output_dir`` does not exist or holds no checkpoint directory.
    """
    # A fresh run has no output directory yet: report "no checkpoint" instead
    # of letting os.listdir raise FileNotFoundError.
    if not os.path.isdir(output_dir):
        return None

    candidates = (os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint"))
    checkpoints = [path for path in candidates if os.path.isdir(path)]
    if not checkpoints:
        return None
    return max(checkpoints, key=os.path.getmtime)

last_checkpoint = find_last_checkpoint('./results')

# x records whether a checkpoint was found; it is passed to trainer.train()
# further down as the resume flag.
x = bool(last_checkpoint)
if x:
    print(f"Loading model from last checkpoint: {last_checkpoint}")
    weights_source = last_checkpoint
else:
    print("No checkpoints found, initializing from base model.")
    weights_source = 'bert-base-uncased'

# One output label per distinct review_score value; the model is then moved
# to the selected device.
model = BertForSequenceClassification.from_pretrained(
    weights_source,
    num_labels=len(df['review_score'].unique()),
)
model = model.to(device)
# Training configuration (10 epochs), collected in a dict and then unpacked
# into TrainingArguments
training_options = {
    'output_dir': './results',             # Where checkpoints are written
    'num_train_epochs': 10,                # Number of training epochs
    'per_device_train_batch_size': 128,    # Training batch size per device
    'per_device_eval_batch_size': 128,     # Evaluation batch size per device
    'warmup_steps': 500,                   # Number of warmup steps
    'weight_decay': 0.01,                  # Strength of weight decay
    'logging_dir': './logs',               # Where logs are stored
    'logging_strategy': "epoch",           # Log loss values once per epoch
    'evaluation_strategy': "epoch",        # Evaluate once per epoch
    'save_strategy': "epoch",              # Save a checkpoint once per epoch
    'load_best_model_at_end': True,        # Load the best model when training ends
}
training_args = TrainingArguments(**training_options)

# Build the Trainer from the model, the training arguments and both datasets
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Run training; x is True when a checkpoint was found above, in which case
# the Trainer is asked to resume instead of starting from scratch
trainer.train(resume_from_checkpoint=x)

# Evaluate the trained model on the validation dataset and show the metrics
evaluation_results = trainer.evaluate()
print(evaluation_results)

# Training loss: take the first 'loss' entry logged for each whole-numbered
# epoch, skipping epochs that were already recorded.
train_loss, seen = [], []
for elem in trainer.state.log_history:
    if 'loss' not in elem:
        continue
    epoch = elem['epoch']
    if not epoch.is_integer() or epoch in seen:
        continue
    train_loss.append(elem['loss'])
    seen.append(epoch)

# Validation loss: every 'eval_loss' entry in the log history, in log order.
val_loss = [entry['eval_loss'] for entry in trainer.state.log_history if 'eval_loss' in entry]


# Draw both loss curves on one set of axes using the explicit figure/axes API
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_loss, label='Training Loss')
ax.plot(val_loss, color="red", label='Validation Loss')
ax.set_title('Training and Validation Loss over Epochs')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

MemoryError: 

In [None]:
from transformers import pipeline
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Load the trained model and tokenizer into a pipeline
# Make sure to adjust the model path if you've saved your model elsewhere
# Forward slashes are used because they work on both Windows and POSIX; a
# backslash separator is Windows-only and '\c' is an invalid escape sequence
# in a regular string literal.
model_path = './results/checkpoint-534150'  # This path should point to the directory containing your saved model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Create a text classification pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=tokenizer)

# Example text to classify
texts = ["Bad graphics bad gameplay awful sound design"]

# Make predictions (one result dict with 'label' and 'score' per input text)
predictions = classifier(texts)

# Process and print predictions
for text, prediction in zip(texts, predictions):
    print(f"Text: {text}\nPredicted label: {prediction['label']} with score: {prediction['score']:.4f}\n")

Text: Bad graphics bad gameplay awful sound design
Predicted label: LABEL_0 with score: 0.9585

