In [1]:
# Setup note: if the `transformers` package is missing, run `%pip install transformers` once in this environment.

In [2]:
import transformers

In [3]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Block 1: Load and inspect the data
# Read the cleaned destination records from the CSV in the working directory
new_data_path = 'cleaned_destination_data.csv'
new_data = pd.read_csv(new_data_path)

# Preview the first rows to confirm the expected columns are present
print("Dataset Structure:", new_data.head(), sep="\n")

Dataset Structure:
                             name       lat        lng  \
0                Arugam Bay Beach  6.840408  81.836848   
1                   Mirissa Beach  5.944703  80.459161   
2  Weligama Beach (surf and stay)  5.972486  80.435714   
3                        Ahangama  5.973975  80.362159   
4                 Hikkaduwa Beach  6.137727  80.099060   

             formatted_address  rating  user_ratings_total  \
0  Arugam Bay Beach, Sri Lanka     4.8              1591.0   
1           Mirissa, Sri Lanka     4.6              1748.0   
2          Weligama, Sri Lanka     4.4               325.0   
3          Ahangama, Sri Lanka     NaN                 NaN   
4   Hikkaduwa Beach, Sri Lanka     4.7              1438.0   

                                      latest_reviews  \
0  ['Arugam Bay Beach is a surfer's paradise! I s...   
1  ['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...   
2  ['Weligama Beach is a fantastic spot for both ...   
3  ['Ahangama was a bit disappointi

In [4]:
# Block 2: Deriving Sentiment Labels
# Weak-label heuristic: a review counts as 'positive' only when it contains the
# keyword 'excellent' (case-insensitive); every other review is 'negative'.
def label_sentiment(review):
    """Return the keyword-based sentiment label for one review.

    Args:
        review: The cleaned review text. Non-string values (for example the
            float NaN pandas uses for a missing cell) are accepted.

    Returns:
        'positive' if `review` is a string containing 'excellent' in any
        letter case, otherwise 'negative'.
    """
    # Guard on the type first: calling .lower() on a NaN float would raise
    # AttributeError and abort labelling of the whole column.
    if isinstance(review, str) and 'excellent' in review.lower():
        return 'positive'
    return 'negative'


new_data['sentiment'] = new_data['cleaned_reviews'].apply(label_sentiment)

# Check if sentiment labels were added correctly
print("Sentiment Labels:")
print(new_data[['cleaned_reviews', 'sentiment']].head())


Sentiment Labels:
                                     cleaned_reviews sentiment
0  Arugam Bay Beach is a surfers paradise I spent...  positive
1  Mirissa Beach is truly a gem on Sri Lanka s so...  negative
2  Weligama Beach is a fantastic spot for both be...  negative
3  Ahangama was a bit disappointing for me as a s...  negative
4  Hikkaduwa Beach is a delightful escape for sol...  negative


In [5]:
# Block 3: Tokenization Using DistilBERT
# Load the pretrained uncased DistilBERT tokenizer (input text is lower-cased)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Encode every cleaned review in a single call: sequences are truncated to at
# most 400 tokens, padded to a common length, and returned as PyTorch tensors
review_texts = new_data['cleaned_reviews'].tolist()
tokenizer_options = dict(truncation=True, padding=True, max_length=400, return_tensors='pt')
encodings = tokenizer(review_texts, **tokenizer_options)

# Show the token ids of the first five reviews as a sanity check
print("Sample Encodings:", encodings['input_ids'][:5], sep="\n")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Sample Encodings:
tensor([[  101, 12098, 16377,  ...,     0,     0,     0],
        [  101, 14719, 21205,  ...,     0,     0,     0],
        [  101,  2057, 14715,  ...,     0,     0,     0],
        [  101,  6289, 18222,  ...,     0,     0,     0],
        [  101,  7632, 15714,  ...,     0,     0,     0]])


In [6]:
# Block 4: Mapping Sentiment Labels to Integers
# Encode the two string classes ('positive', 'negative') as integer class ids
label_mapping = dict(positive=1, negative=0)
sentiment_ids = new_data['sentiment'].map(label_mapping)
new_data['sentiment_mapped'] = sentiment_ids

# Show each string label next to its integer id for the first rows
print("Mapped Sentiment Labels:", new_data[['sentiment', 'sentiment_mapped']].head(), sep="\n")


Mapped Sentiment Labels:
  sentiment  sentiment_mapped
0  positive                 1
1  negative                 0
2  negative                 0
3  negative                 0
4  negative                 0


In [7]:
# Block 5: Creating TensorDataset and Splitting Data
# Bundle token ids, attention masks and integer labels so that indexing the
# dataset yields one (input_ids, attention_mask, label) triple per review
dataset = TensorDataset(
    encodings['input_ids'],
    encodings['attention_mask'],
    torch.tensor(new_data['sentiment_mapped'].values)
)

# Split the dataset into training and testing sets (80/20 split)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split explicitly: without a fixed generator every run draws a
# different train/test partition, so the reported loss and accuracy could not
# be reproduced after a kernel restart.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Verify dataset sizes
print(f"Training Set Size: {train_size}")
print(f"Testing Set Size: {test_size}")


Training Set Size: 318
Testing Set Size: 80


In [8]:
# Block 6: Creating DataLoader for Batching
# Training batches are reshuffled every epoch so the model does not see the
# reviews in a fixed order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# Evaluation only aggregates loss and accuracy over the whole test set, so
# shuffling adds nothing; a fixed order keeps per-batch results comparable
# between runs.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Print number of batches
print(f"Number of batches in train loader: {len(train_loader)}")
print(f"Number of batches in test loader: {len(test_loader)}")


Number of batches in train loader: 40
Number of batches in test loader: 10


In [9]:
# Block 7: Model Setup (DistilBERT)
# Load the DistilBERT model for sequence classification with a two-class head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set up the optimizer and scheduler.
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# and no longer shipped by recent transformers releases. eps and weight_decay
# are passed explicitly to keep the defaults of the old transformers optimizer
# (PyTorch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# The linear schedule decays the learning rate to zero over the total number of
# optimizer steps, so this must equal the number of epochs actually trained
# (the `epochs` value used by the training loop).
num_epochs = 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader) * num_epochs
)

# Print model setup
print("Model and Optimizer Setup Complete.")


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and Optimizer Setup Complete.


In [10]:
# Block 8: Training Loop
# Put the model in training mode before any optimisation step
model.train()

# Number of full passes over the training set
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    total_loss = 0

    for input_ids, attention_mask, labels in train_loader:
        # Reset gradients left over from the previous step
        model.zero_grad()

        # Forward pass; supplying labels makes the model return the loss too
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate, then cap the gradient norm at 1.0 to guard against
        # exploding gradients
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Apply the parameter update and advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Accumulate the scalar batch loss for the epoch average
        total_loss += loss.item()

    # Mean of the per-batch losses seen in this epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f'Average Training Loss: {avg_train_loss:.3f}')


Epoch 1/3
Average Training Loss: 0.147
Epoch 2/3
Average Training Loss: 0.134
Epoch 3/3
Average Training Loss: 0.130


In [11]:
# Block 9: Evaluation
# Switch model to evaluation mode
model.eval()

# Tracking variables: summed per-sample loss and number of correct predictions
test_loss = 0
correct_predictions = 0

# No need to track gradients during evaluation
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch

        # Forward pass: supplying labels makes the model return the loss
        # alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # The returned loss is averaged over the batch, so scale it back by the
        # batch size; otherwise a smaller final batch would carry the same
        # weight as a full one in the overall average.
        test_loss += loss.item() * labels.size(0)

        # Predicted class = index of the largest logit for each review
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()

# Calculate per-sample average loss and accuracy over the whole test set
avg_test_loss = test_loss / len(test_dataset)
accuracy = correct_predictions / len(test_dataset)

print(f"Test Loss: {avg_test_loss:.3f}")
print(f"Accuracy: {accuracy:.3f}")


Test Loss: 0.188
Accuracy: 0.963


In [13]:
# Persist only the learned parameters (the state dict), not the model object
trained_weights = model.state_dict()
torch.save(trained_weights, 'model.pth')

In [None]:
# Load the model
# Rebuild the same architecture, then overwrite its parameters with the
# weights stored in 'model.pth'
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
# map_location='cpu' lets the checkpoint load on a machine without the device
# it was saved from; weights_only=True restricts unpickling to tensors and
# plain containers instead of arbitrary Python objects.
state_dict = torch.load('model.pth', map_location='cpu', weights_only=True)
model.load_state_dict(state_dict)