# Twitter US Airline Sentiment
Analyze how travelers in February 2015 expressed their feelings on Twitter: A sentiment analysis job about the problems of each major U.S. airline. Twitter data was scraped from February 2015 and contributors were asked to first classify **positive**, **negative**, and **neutral** tweets, followed by categorizing negative reasons (such as "late flight" or "rude service").

In [41]:
base_model = "rnn"
bidirectional = False
bool_rewrite = 0
pre_processing = 0
cuda_NO = 'cuda:0'
rand_seed = 18

# 部分超参数
num_epoch = 5
lr = 1e-3
dropout = 0
n_layers = 5
vocab_size = 20000

if bool_rewrite == 0:
    model_save_path = 'model/rnn.pth'
else:
    model_save_path = 'model/rnn-rewrite.pth'

!nvidia-smi
import torch
print(torch.cuda.is_available())
device = f"{cuda_NO}" if torch.cuda.is_available() else "cpu"
print(device)

Fri Oct 11 15:50:33 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.90                 Driver Version: 565.90         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4050 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   51C    P8              3W /   92W |     832MiB /   6141MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Set Random Seed

In [42]:
import numpy as np
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(rand_seed)

# Tokenization & Processing 

In [43]:
import re

def tokenizer(text):
    vectorizer = CountVectorizer(max_features=vocab_size) # initialize class CountVectorizer
    return vectorizer.fit_transform(text).toarray()

def preprocess_text(text):
    if not isinstance(text, str): return "" # Check if text is a string
    text = text.lower() # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'\@\w+|\#','', text) # Remove user @ references and '#' from hashtags
    text = re.sub(r'\W', ' ', text) # Remove special characters, numbers, and punctuations
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text) # Remove single characters
    text = re.sub(r'\s+', ' ', text) # Remove multiple spaces
    return text.strip()

# Read Data

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras_preprocessing.sequence import pad_sequences

train_data = pd.read_csv('DS/Twitter-US-Airline-Sentiment/Tweets_rewrite.csv', sep=',', header=0)
if pre_processing:
    train_data['Clean-Text'] = train_data['Text'].apply(preprocess_text)
    train_data['Clean-Text-Rewrite'] = train_data['Text-Rewrite'].apply(preprocess_text)
else:
    train_data['Clean-Text'] = train_data['Text']
    train_data['Clean-Text-Rewrite'] = train_data['Text-Rewrite']
text_column = 'Clean-Text-Rewrite' if bool_rewrite else 'Clean-Text'

X = tokenizer(train_data[text_column])
y = train_data['Original_Label']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=rand_seed)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=rand_seed)

# post-padding
X_train = pad_sequences(X_train, maxlen=50, padding='post')
X_val = pad_sequences(X_val, maxlen=50, padding='post')
X_test = pad_sequences(X_test, maxlen=50, padding='post')

# map labels to integer
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Dataset and Dataloader

In [45]:
from torch.utils.data import DataLoader, Dataset

class Airline_Dataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'text': torch.tensor(self.texts[idx], dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = Airline_Dataset(X_train, y_train)
val_dataset = Airline_Dataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model

In [46]:
import torch.nn as nn

class Sentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout=dropout):
        super(Sentiment, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        
        if bidirectional:
            self.fc = nn.Linear(hidden_dim*2, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
        
        if base_model == "rnn":
            self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif base_model == "lstm":
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        elif base_model == "gru":
            self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        else:
            raise ValueError("Invalid base model")
        
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        _, hidden = self.rnn(embedded)

        if isinstance(hidden, tuple):  # recognize LSTM
            hidden = hidden[0]
        
        if bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]
            
        output = self.fc(hidden)
        return output

# Training

In [47]:
import torch, gc
from tqdm import tqdm
import torch.optim as optim
gc.collect()
torch.cuda.empty_cache()

# Hyper
embedding_dim = 128
hidden_dim = 128
output_dim = len(label_encoder.classes_)
criterion = nn.CrossEntropyLoss()

model = Sentiment(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

total_steps = len(train_loader) * num_epoch
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epoch=num_epoch):
    model.to(device)
    best_accuracy = 0.0
    
    for epoch in range(num_epoch):
        model.train()
        running_loss = 0.0
        for batch in tqdm(train_loader):
            texts = batch['text'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
        
        # Evaluation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in tqdm(val_loader):
                texts = batch['text'].to(device)
                labels = batch['label'].to(device)

                outputs = model(texts)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                _, pred = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (pred == labels).sum().item()
        val_accuracy = correct / total
        print(f'Epoch {epoch+1}, Train Loss: {running_loss/len(train_loader)}, Val Loss: {val_loss/len(val_loader)}')

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), model_save_path)
            print(f'Saved best model with accuracy: {best_accuracy}')
            
train_model(model, train_loader, val_loader, criterion, optimizer, num_epoch)

100%|██████████| 366/366 [00:01<00:00, 242.36it/s]
100%|██████████| 46/46 [00:00<00:00, 1081.73it/s]


Epoch 1, Train Loss: 0.9842878734804893, Val Loss: 0.9331051979375922
Saved best model with accuracy: 0.6352459016393442


100%|██████████| 366/366 [00:01<00:00, 310.08it/s]
100%|██████████| 46/46 [00:00<00:00, 1012.25it/s]


Epoch 2, Train Loss: 0.9540028897790961, Val Loss: 0.9207716843356257


100%|██████████| 366/366 [00:01<00:00, 340.02it/s]
100%|██████████| 46/46 [00:00<00:00, 1078.65it/s]


Epoch 3, Train Loss: 0.9446497005843074, Val Loss: 0.9054968512576559


100%|██████████| 366/366 [00:01<00:00, 329.27it/s]
100%|██████████| 46/46 [00:00<00:00, 1021.31it/s]


Epoch 4, Train Loss: 0.9275026155299828, Val Loss: 0.9024962990180306


100%|██████████| 366/366 [00:01<00:00, 321.19it/s]
100%|██████████| 46/46 [00:00<00:00, 1012.32it/s]

Epoch 5, Train Loss: 0.9183930997314349, Val Loss: 0.9021276831626892





# Prediction

In [48]:
from sklearn.metrics import accuracy_score

model = Sentiment(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model.load_state_dict(torch.load(model_save_path, weights_only=True))

def test_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            texts = batch['text']
            labels = batch['label']

            outputs = model(texts)
            loss = criterion(outputs, labels)
            test_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Test Loss: {test_loss / len(test_loader)}')
    print(f'Test Accuracy: {accuracy * 100:.9f}%')
    
test_dataset = Airline_Dataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32)

test_model(model, test_loader, criterion)

Test Loss: 0.962230411560639
Test Accuracy: 61.843003413%
