In [16]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load your dataset into a DataFrame (replace 'your_dataset.csv' with your actual dataset file)
df = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')

# Split the dataset into train and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)
num_epochs = 5

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i, row in train_df.iterrows():
        review = row['Review']
        label = row['Liked']

        encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=torch.tensor([label]))
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_df)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_loss:.4f}')

# Evaluate the model on the validation set
model.eval()
val_labels = []
val_predictions = []

for i, row in val_df.iterrows():
    review = row['Review']
    label = row['Liked']

    encoding = tokenizer(review, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1)

    val_labels.append(label)
    val_predictions.append(predictions.item())

# Calculate evaluation metrics
accuracy = accuracy_score(val_labels, val_predictions)
classification_report_str = classification_report(val_labels, val_predictions)

print(f'Validation Accuracy: {accuracy:.4f}')
print(classification_report_str)

# Define a function to predict sentiment
def predict_sentiment(input_text, model, tokenizer):
    # Tokenize the input text
    encoding = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # Pass the input through the model
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Get the predicted class (0 for negative, 1 for positive)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    # Determine the sentiment based on the predicted class
    if predicted_class == 0:
        return "Negative"
    else:
        return "Positive"

# Example usage:
new_input_text = "I really enjoyed the food at that restaurant."
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Train Loss: 0.3379
Epoch 2/5, Train Loss: 0.1293
Epoch 3/5, Train Loss: 0.0800
Epoch 4/5, Train Loss: 0.0250
Epoch 5/5, Train Loss: 0.0838
Validation Accuracy: 0.9150
              precision    recall  f1-score   support

           0       0.90      0.93      0.91        96
           1       0.93      0.90      0.92       104

    accuracy                           0.92       200
   macro avg       0.91      0.92      0.91       200
weighted avg       0.92      0.92      0.92       200

Predicted Sentiment: Positive


In [17]:

# Example usage:
new_input_text = "Worst restaurant ever"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [18]:
# Example usage:
new_input_text = "Parking is very congested"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive


In [19]:
# Example usage:
new_input_text = "no parking place. We should park on the road itself"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [20]:
# Example usage:
new_input_text = "Food is avearge and ambience is very dirty"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive


In [21]:
# Example usage:
new_input_text = "ambience is not good"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [22]:
# Example usage:
new_input_text = "ambience is very bad"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [23]:
# Example usage:
new_input_text = "space is very less to eat"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [30]:
# Example usage:
new_input_text = "food is not bad"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [31]:
new_input_text = "food is ok"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive


In [32]:
new_input_text = "food is too salty"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive


In [41]:
new_input_text = "did not loved the food very much"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [25]:
# Example usage:
new_input_text = "not bad"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [26]:
# Example usage:
new_input_text = "disgusting"
sentiment = predict_sentiment(new_input_text, model, tokenizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Negative


In [27]:
pip install transformers




In [28]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

df = pd.read_csv('/kaggle/input/reviews/Restaurant_Reviews.tsv', delimiter='\t')
print(df.columns)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/reviews/Restaurant_Reviews.tsv'

In [None]:
df.columns


In [None]:
df.info()

In [None]:
df['Liked'].unique()

In [None]:
df['Liked'].value_counts()

In [None]:
# Split the dataset into train and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Define a custom dataset class for BERT
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Initialize the BERT tokenizer and dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SentimentDataset(train_df['Review'], train_df['Liked'], tokenizer)
val_dataset = SentimentDataset(val_df['Review'], val_df['Liked'], tokenizer)
train_dataset

In [None]:
val_dataset

In [None]:
# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_loader

In [None]:
for x,y in train_loader:
    optimizer.zerograd()
    
  

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)
num_epochs = 3

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {average_loss:.4f}')
