In [4]:
# !pip install transformers

In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
data = pd.read_csv("E:/Tweets.csv")
df = data[["text","airline_sentiment"]]
df['text'] = df['text'].map(lambda x: x.lstrip('@VirginAmerica@UnitedAir@Southwestairline@DeltaAir@USAirways@American').rstrip('@'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].map(lambda x: x.lstrip('@VirginAmerica@UnitedAir@Southwestairline@DeltaAir@USAirways@American').rstrip('@'))


In [7]:
df

Unnamed: 0,text,airline_sentiment
0,What @dhepburn said.,neutral
1,plus you've added commercials to the experien...,positive
2,I didn't today... Must mean I need to take an...,neutral
3,"it's really aggressive to blast obnoxious ""en...",negative
4,and it's a really big bad thing about it,negative
...,...,...
14635,thank you we got on a different flight to Chi...,positive
14636,leaving over 20 minutes Late Flight. No warni...,negative
14637,Please bring American Airlines to #BlackBerry10,neutral
14638,"you have my money, you change my flight, and ...",negative


In [8]:
text = data["text"]
labels = data["airline_sentiment"]
train_texts, test_texts, train_labels, test_labels = train_test_split(text, labels, test_size=0.2, random_state=42)

In [9]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: positive, neutral, negative


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128, return_tensors='pt')


In [11]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

#  Fit the encoder to your string labels and transform them into numerical labels
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Create the TensorDataset with labels as torch.int64 data type
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels_encoded, dtype=torch.int64)  # Use the encoded labels here
)

# Create the test dataset using TensorDataset
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(test_labels_encoded, dtype=torch.int64)  # Use the encoded test labels here
)


In [12]:
# Create data loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set up GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [16]:
# Define training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()


# # Convert train_labels and test_labels to Long data type
# train_labels = torch.tensor(train_labels_encoded, dtype=torch.long).to(device)
# test_labels = torch.tensor(test_labels_encoded, dtype=torch.long).to(device)




# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    predicted = torch.argmax(outputs.logits, dim=1)
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")


Epoch 1/3, Average Loss: 0.2945
Epoch 2/3, Average Loss: 0.1784
Epoch 3/3, Average Loss: 0.1174


In [19]:
# Evaluation on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.8415


In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc


# Calculate and print accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")

# Generate the confusion matrix
confusion_mtx = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:")
print(confusion_mtx)

# Classification Report
class_names = ['negative', 'neutral', 'positive']  
classification_rep = classification_report(true_labels, predictions, target_names=class_names)
print("Classification Report:")
print(classification_rep)




Accuracy: 0.8415
Confusion Matrix:
[[1770   85   34]
 [ 187  333   60]
 [  57   41  361]]
Classification Report:
              precision    recall  f1-score   support

    negative       0.88      0.94      0.91      1889
     neutral       0.73      0.57      0.64       580
    positive       0.79      0.79      0.79       459

    accuracy                           0.84      2928
   macro avg       0.80      0.77      0.78      2928
weighted avg       0.84      0.84      0.84      2928



In [21]:
# 1. User input
user_input = "Meetings: Air crew is so bad. "

# 2. Tokenize and preprocess the user input
user_encodings = tokenizer(user_input, truncation=True, padding=True, max_length=128, return_tensors='pt').to(device)

# 3. Inference
model.eval()
with torch.no_grad():
    user_logits = model(user_encodings['input_ids'], attention_mask=user_encodings['attention_mask'])
    _, user_predicted = torch.max(user_logits.logits, 1)
    
# 4. Display the predicted sentiment
predicted_sentiment = label_encoder.inverse_transform([user_predicted.item()])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")


Predicted Sentiment: negative
