In [1]:
!pip install transformers torch nltk textblob vaderSentiment scikit-learn pandas matplotlib





In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from transformers import BertTokenizer

nltk.download('punkt')

# Load dataset
data = pd.read_csv('sentiment140.csv', encoding='ISO-8859-1', header=None)
data.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']

# Map sentiment labels (0: Negative, 2: Neutral, 4: Positive)
data['sentiment'] = data['sentiment'].map({0: 0, 2: 1, 4: 2})

# Preprocessing (remove unnecessary columns, clean text)
data = data[['sentiment', 'text']].dropna()
data['text'] = data['text'].str.lower()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nives\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
analyzer = SentimentIntensityAnalyzer()

def extract_lexicon_features(text):
    vader_scores = analyzer.polarity_scores(text)
    blob_score = TextBlob(text).sentiment.polarity
    return vader_scores['pos'], vader_scores['neg'], vader_scores['neu'], vader_scores['compound'], blob_score

# Apply feature extraction
lexicon_features = data['text'].apply(lambda x: pd.Series(extract_lexicon_features(x)))
lexicon_features.columns = ['pos', 'neg', 'neu', 'compound', 'blob']

# Combine lexicon features with the main dataset
data = pd.concat([data, lexicon_features], axis=1)

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts, tokenizer):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42, stratify=data['sentiment']
)

train_encodings = tokenize_data(train_texts, tokenizer)
test_encodings = tokenize_data(test_texts, tokenizer)

In [5]:
import torch

class HybridSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, lexicon_features, labels):
        self.encodings = encodings
        self.lexicon_features = lexicon_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['lexicon_features'] = torch.tensor(self.lexicon_features.iloc[idx].values, dtype=torch.float32)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = HybridSentimentDataset(
    train_encodings,
    data.iloc[train_texts.index][['pos', 'neg', 'neu', 'compound', 'blob']],
    train_labels.tolist()
)

test_dataset = HybridSentimentDataset(
    test_encodings,
    data.iloc[test_texts.index][['pos', 'neg', 'neu', 'compound', 'blob']],
    test_labels.tolist()
)

In [6]:
from transformers import BertModel
import torch.nn as nn

class HybridBERTClassifier(nn.Module):
    def __init__(self, num_labels=3):
        super(HybridBERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lexicon_fc = nn.Linear(5, 16)  # Process lexicon features
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size + 16, num_labels)

    def forward(self, input_ids, attention_mask, lexicon_features):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output.pooler_output
        lexicon_output = nn.ReLU()(self.lexicon_fc(lexicon_features))
        combined = torch.cat((pooled_output, lexicon_output), dim=1)
        dropout_output = self.dropout(combined)
        logits = self.classifier(dropout_output)
        return logits



In [7]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridBERTClassifier(num_labels=3).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

epochs = 3
for epoch in range(epochs):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        lexicon_features = batch['lexicon_features'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, lexicon_features=lexicon_features)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0:   2%|▏         | 1571/80000 [05:50<4:51:57,  4.48it/s, loss=0.484] 


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

model.eval()
predictions, true_labels = [], []

for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    lexicon_features = batch['lexicon_features'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, lexicon_features=lexicon_features)
    preds = torch.argmax(outputs, dim=-1).cpu().numpy()
    labels = labels.cpu().numpy()
    predictions.extend(preds)
    true_labels.extend(labels)

print("Accuracy:", accuracy_score(true_labels, predictions))
print("Precision:", precision_score(true_labels, predictions, average='weighted'))
print("Recall:", recall_score(true_labels, predictions, average='weighted'))
print("F1 Score:", f1_score(true_labels, predictions, average='weighted'))
print(classification_report(true_labels, predictions))


In [None]:
model.save_pretrained('./hybrid_sentiment_model')
tokenizer.save_pretrained('./hybrid_sentiment_model')
