# Step 1 Import packages and data sets

In [1]:
# Process data
import pandas as pd
import re

# Train model
from transformers import BertTokenizer
import torch

# Read the training reviews and the test reviews from their CSV files
movie_review, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in ('./movie_reviews/movie_reviews.csv', "./test_data.csv")
)

  from .autonotebook import tqdm as notebook_tqdm


# Step 2 Describe and clean data

In [2]:
# 1. Preview the first rows of the training and test frames
train_preview = movie_review.head()
test_preview = test_dataset.head()
print(f'train dataset:\n{train_preview}\n')
print(f'test dataset:\n{test_preview}')

train dataset:
                                                text  label
0  If you havent seen this movie than you need to...      1
1  but Cinderella gets my vote not only for the w...      0
2  This movie is pretty cheesy but I do give it c...      1
3  I have not seen a Van Damme flick for a while ...      1
4  This is a sleeper It defines Nicholas Cage The...      1

test dataset:
   Id                                               text
0   0  What can possibly said about this movie other ...
1   1  I dont care how many bad reviews purple rain g...
2   2  Ken Russell directed this weird  Not very  ero...
3   3  This is a great movie from the lost age of rea...
4   4  I have a problem with the movie snobs who cons...


In [3]:
# 2. Clean the train dataset and test dataset

# Function to clean the text data
def clean_text(text):
    """Normalize one review before tokenization.

    Lowercases the text, collapses every run of whitespace into a single
    space, and trims leading/trailing whitespace.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty review; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned review as a ``str`` (``""`` for missing input).
    """
    # A missing cell arrives as float NaN, which has no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    # \s+ also matches tabs and newlines, so line breaks become single spaces
    return re.sub(r'\s+', ' ', text.lower()).strip()

# Add a normalized 'text_cleaned' column to both frames
for frame in (movie_review, test_dataset):
    frame['text_cleaned'] = frame['text'].apply(clean_text)

# The training frame is also referred to as `train_dataset` from here on
train_dataset = movie_review

# Step 3 Process data with DistilBertTokenizer

In [4]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# I first tried standard approaches to train the model, but the results were not good enough.
# I then tried BERT and BiLSTM models, but both overfit, so I switched to the DistilBERT model.

# Instantiate the fast DistilBERT tokenizer from the uncased base checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
)

# Function to encode the text data into tokens
def encode_reviews(reviews, labels, max_length):
    """Tokenize reviews into fixed-length PyTorch tensors.

    Args:
        reviews: The review texts to tokenize.
        labels: Not used by this function.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The result of calling the module-level ``tokenizer`` with these options.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': "pt",
    }
    return tokenizer(reviews, **tokenizer_options)

# Encode the full training set
# NOTE(review): `encoded_train_data` is not read anywhere else in the visible notebook.
max_length = 128 # Maximum number of tokens kept per review
encoded_train_data = encode_reviews(movie_review['text_cleaned'].tolist(), movie_review['label'].tolist(), max_length)

# Split the training frame into train and validation sets.
# `movie_review` is read directly (not `train_dataset`) because `train_dataset` is
# rebound to a torch Dataset further down; reading the frame keeps this cell re-runnable.
# A fixed seed makes the split reproducible, and stratifying on the label keeps the
# class ratio the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    movie_review['text_cleaned'],
    movie_review['label'],
    test_size=0.2,
    random_state=42,
    stratify=movie_review['label'],
)

# Tokenize each split; padding=True pads to the longest sequence within that split
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

# Create a Dataset object
class MovieReviewsDataset(Dataset):
    """Map-style dataset that serves tokenizer output one example at a time.

    Args:
        encodings: Mapping from feature name to a per-example sequence;
            ``encodings[key][idx]`` must be convertible by ``torch.tensor``.
        labels: Optional per-example labels aligned with ``encodings``.
            When omitted (e.g. for an unlabeled test set) the returned items
            carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus ``'labels'`` when available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, size comes from the first feature; empty encodings mean zero examples.
        return len(next(iter(self.encodings.values()), ()))

# Wrap each split's encodings and labels in a MovieReviewsDataset
train_label_list = train_labels.tolist()
val_label_list = val_labels.tolist()
train_dataset = MovieReviewsDataset(train_encodings, train_label_list)
val_dataset = MovieReviewsDataset(val_encodings, val_label_list)

2023-11-08 14:47:10.360226: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-08 14:47:10.391754: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-08 14:47:10.537015: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-08 14:47:10.537035: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-08 14:47:10.537924: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

# Step 4 Train a DistilBert model

In [5]:
from transformers import DistilBertForSequenceClassification

# Start from the pretrained DistilBERT checkpoint with a two-label classification head
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Fine-tuning hyperparameters
training_options = {
    'output_dir': './results',           # output directory
    'num_train_epochs': 3,               # total number of training epochs
    'per_device_train_batch_size': 16,   # batch size per device during training
    'per_device_eval_batch_size': 64,    # batch size for evaluation
    'warmup_steps': 500,                 # number of warmup steps for learning rate scheduler
    'weight_decay': 0.01,                # strength of weight decay
}
training_args = TrainingArguments(**training_options)

# Fine-tune on the training split
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.4735
1000,0.3473
1500,0.3235
2000,0.3206
2500,0.2105
3000,0.2088
3500,0.204
4000,0.1846
4500,0.0953
5000,0.0922


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

# Step 5 Evaluate the model

In [6]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Define the evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        precision/recall/F1 are computed with ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result order is (precision, recall, f1, support); support is not reported
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Build a Trainer around the current model that also reports compute_metrics
eval_trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**eval_trainer_kwargs)

# Score the validation set and show the resulting metrics
evaluation_result = trainer.evaluate()
print(evaluation_result)


{'eval_loss': 0.46378642320632935, 'eval_accuracy': 0.8935, 'eval_f1': 0.8953574060427413, 'eval_precision': 0.8883743602242262, 'eval_recall': 0.9024511017578608, 'eval_runtime': 3.1184, 'eval_samples_per_second': 2565.432, 'eval_steps_per_second': 40.085}


# Step 6 Predict the test data

In [7]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

model_path = 'saved_model'

# Use the GPU when available and fall back to the CPU otherwise, so this cell
# also runs on machines without CUDA (same rule as the training cell).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and model that were saved after fine-tuning
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

test_data = test_dataset['text_cleaned'].tolist()

# Number of reviews sent through the model per forward pass
batch_size = 100

# Predicted class and its probability for every test review, in input order
predicted_labels = []
predicted_scores = []

# Process the test data in batches
for i in range(0, len(test_data), batch_size):
    batch = test_data[i:i + batch_size]
    # Tokenize with the same truncation length used for training (`max_length`,
    # defined in the tokenization cell) and move the tensors to the model's device
    inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Convert logits to probabilities
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Predicted class per review and the probability assigned to that class
    predictions = torch.argmax(probabilities, dim=1)
    scores = probabilities[torch.arange(probabilities.size(0)), predictions]

    predicted_labels.extend(predictions.tolist())
    predicted_scores.extend(scores.tolist())


# Attach the predicted labels and write only the non-text columns to disk.
# The text columns are dropped on a copy, so `test_dataset` keeps them and this cell
# can be re-run. `predicted_scores` is kept in memory only; it is not written to the CSV.
test_dataset['predicted_label'] = predicted_labels
test_dataset.drop(columns=['text_cleaned', 'text']).to_csv('test_data_with_predictions.csv', index=False)
