In [1]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report

2024-02-15 20:45:18.606463: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_data(folder_path, queries=None):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):  # Ensuring to read only text files
            try:
                with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                    content = file.read()
                    if queries:
                        for query in queries:
                            if query.lower() in content.lower():
                                data.append(content)
                                break  # Breaks to avoid duplicate entries for a file matching multiple queries
                    else:
                        data.append(content)
            except UnicodeDecodeError as e:
                print(f"Error decoding file {filename}: {str(e)}")
            except Exception as e:
                print(f"Error reading file {filename}: {str(e)}")
    return data

In [3]:
# Paths to your data folders
# current_dir = os.getcwd()  # Changed to getcwd for compatibility with notebooks
current_dir = os.getcwd()
dataset_folder = os.path.join(current_dir, '')
train_pos_path = os.path.join(dataset_folder, 'train', 'pos')
train_neg_path = os.path.join(dataset_folder, 'train', 'neg')
test_pos_path = os.path.join(dataset_folder, 'test', 'pos')
test_neg_path = os.path.join(dataset_folder, 'test', 'neg')

In [4]:
queries_to_search = ["great", "disappointing", "awesome"]
train_pos = load_data(train_pos_path, queries_to_search)
train_neg = load_data(train_neg_path, queries_to_search)
test_pos = load_data(test_pos_path, queries_to_search)
test_neg = load_data(test_neg_path, queries_to_search)

In [5]:
# New model name
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [6]:
# Assuming train_pos, train_neg, test_pos, test_neg are loaded as in the previous steps

# Assign labels: 1 for positive, 0 for negative
train_labels = [1] * len(train_pos) + [0] * len(train_neg)
test_labels = [1] * len(test_pos) + [0] * len(test_neg)

# Concatenate positive and negative reviews for training and testing
train_texts = train_pos + train_neg
test_texts = test_pos + test_neg

# Tokenize the texts using the DistilBERT tokenizer
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


In [7]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [9]:
training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset  # evaluation dataset
)

trainer.train()

  return torch._C._cuda_getDeviceCount() > 0
Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,3.8537
20,3.2314
30,2.0053
40,1.092
50,0.7382
60,0.5023
70,0.3514
80,0.2021
90,0.1504
100,0.1428


TrainOutput(global_step=174, training_loss=0.7464527376424307, metrics={'train_runtime': 888.1466, 'train_samples_per_second': 3.128, 'train_steps_per_second': 0.196, 'total_flos': 730942199764992.0, 'train_loss': 0.7464527376424307, 'epoch': 3.0})

In [10]:
# Evaluate the model on the test set
result = trainer.evaluate()
print(result)

# Get predictions for the test set
predictions = trainer.predict(test_dataset=test_dataset)
predicted_classes = predictions.predictions.argmax(axis=1)
true_labels = test_labels

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_classes)
print(f"Accuracy: {accuracy}")

# Generate classification report
class_names = ['Negative', 'Positive']
print(classification_report(true_labels, predicted_classes, target_names=class_names))

{'eval_loss': 0.1899251788854599, 'eval_runtime': 54.2644, 'eval_samples_per_second': 17.636, 'eval_steps_per_second': 1.106, 'epoch': 3.0}
Accuracy: 0.9508881922675027
              precision    recall  f1-score   support

    Negative       0.99      0.90      0.94       411
    Positive       0.93      0.99      0.96       546

    accuracy                           0.95       957
   macro avg       0.96      0.94      0.95       957
weighted avg       0.95      0.95      0.95       957

