In [44]:
import sqlite3

import numpy as np
import pandas as pd
import sklearn
import torch
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
# Location of the airline-tweets SQLite database.
# NOTE(review): absolute, machine-specific path - point this at your own copy.
database_path = r"C:\Users\formy\Downloads\database.sqlite\database.sqlite"

# Column names applied, in order, to the rows returned by `SELECT *`.
columns = ['id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold',
           'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']

# Read the whole Tweets table, then release the connection even if the query fails.
conn = sqlite3.connect(database_path)
try:
    c = conn.cursor()
    c.execute('SELECT * FROM Tweets')
    data = c.fetchall()
finally:
    conn.close()

df = pd.DataFrame(data, columns=columns)
X = df['text'].values.tolist()
y = df['airline_sentiment'].values.tolist()

# Create a dictionary that maps each class name to its corresponding integer value
label_map = {'negative': 0, 'positive': 2, 'neutral': 1}

# Convert the labels to their integer representations
# (raises KeyError if the table contains a sentiment not listed in label_map)
y = [label_map[i] for i in y]

# Fixed seed so the split is identical after Restart & Run All; stratify keeps
# the class proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. Every token that starts with ``@`` and
    is longer than one character becomes ``@user``; every token that starts
    with ``http`` becomes ``http``. All other tokens are kept unchanged and
    the tokens are joined back with single spaces.
    """
    def _placeholder(token):
        # A bare "@" is left alone; only real mentions are masked.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

# Load the tokenizer, config and sequence-classification model from the
# locally stored pretrained checkpoint directory.
save_directory = "pretrained_models/cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(save_directory)
config = AutoConfig.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory, config=config)

#### Text to tensor

In [45]:
# List the names of the model's direct sub-modules
for child_name, _child_module in model.named_children():
    print(child_name)

roberta
classifier


In [46]:
# Dump the attention / intermediate / output sub-modules of every encoder layer
for layer_idx, encoder_block in enumerate(model.roberta.encoder.layer):
    print(
        f"Layer {layer_idx}:",
        f"  Attention: {encoder_block.attention}",
        f"  Intermediate: {encoder_block.intermediate}",
        f"  Output: {encoder_block.output}",
        sep="\n",
    )

Layer 0:
  Attention: RobertaAttention(
  (self): RobertaSelfAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (output): RobertaSelfOutput(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
  Intermediate: RobertaIntermediate(
  (dense): Linear(in_features=768, out_features=3072, bias=True)
  (intermediate_act_fn): GELUActivation()
)
  Output: RobertaOutput(
  (dense): Linear(in_features=3072, out_features=768, bias=True)
  (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Layer 1:
  Attention: RobertaAttention(
  (self): RobertaSelfAttention(
    (query): Linear(in_features=768, out_f

In [47]:
# Read the label count and hidden width off the existing head's output projection
num_labels, hidden_size = model.classifier.out_proj.weight.size()
print(f'num_label:{num_labels} \nhidden_size:{hidden_size} ')

num_label:3 
hidden_size:768 


In [48]:
# Score one example sentence with the pretrained classifier
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')

# Inference only: no_grad avoids building an autograd graph for the forward pass
with torch.no_grad():
    output = model(**encoded_input)

# output[0] holds the logits; take the first (only) row and turn it into probabilities
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# Print labels and scores, highest score first
ranking = np.argsort(scores)[::-1]
for position, class_id in enumerate(ranking, start=1):
    class_name = config.id2label[int(class_id)]
    class_score = scores[class_id]
    print(f"{position}) {class_name} {np.round(float(class_score), 4)}")

1) negative 0.7236
2) neutral 0.2287
3) positive 0.0477


In [49]:
# Keep the model on the CPU: Predict() below passes the tokenizer's tensors
# straight to the model without moving them to another device.
model = model.to('cpu')
def preprocess(text):
    """Mask user mentions and links in ``text`` with ``@user`` / ``http``.

    Tokens are obtained by splitting on single spaces. A token starting with
    ``@`` (and longer than one character) is replaced by ``@user``; a token
    starting with ``http`` is replaced by ``http``; anything else is kept.

    NOTE(review): an identical ``preprocess`` is already defined in an earlier
    cell of this notebook; this definition shadows it.
    """
    masked_tokens = []
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)
    return " ".join(masked_tokens)

def Predict(text):
    """Return the sentiment label the module-level ``model`` assigns to ``text``.

    The text is normalised with ``preprocess``, tokenised with the module-level
    ``tokenizer`` and scored in a single forward pass.

    Args:
        text: Raw sentence / tweet to classify.

    Returns:
        The ``config.id2label`` entry of the highest-scoring class.
    """
    # preprocess the text
    text = preprocess(text)

    # encode the text
    encoded_input = tokenizer(text, return_tensors='pt')

    # Put the inputs on whatever device the model currently lives on, so the
    # function keeps working after the model has been moved (e.g. to a GPU).
    model_device = next(model.parameters()).device
    encoded_input = {name: tensor.to(model_device) for name, tensor in encoded_input.items()}

    # perform sentiment classification; no_grad because this is inference only
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; row 0 is the single input sentence.
    # Softmax is monotonic, so the arg-max of the logits is the arg-max of the
    # probabilities and no normalisation is needed to pick the label.
    logits = output[0][0]
    best_class = int(logits.argmax())

    return config.id2label[best_class]


# Example usage: classify one sentence and print the predicted label
result = Predict('American Airline is great')
print(result) # expected output: positive

positive


#### Transfer Learning

In [50]:
 class CustomClassificationHead(nn.Module):
     """Classification head: mean-pool the hidden states, then one linear layer.

     Args:
         hidden_size: Size of the last dimension of the incoming hidden states.
         num_labels: Number of output classes (size of the returned logits).
     """

     def __init__(self, hidden_size, num_labels):
         super().__init__()
         self.dense = nn.Linear(hidden_size, num_labels)

     def forward(self, hidden_states, attention_mask=None):
         """Pool ``hidden_states`` over dim 1 and project to class logits.

         Args:
             hidden_states: Tensor pooled over dim 1; assumed to be
                 (batch, seq_len, hidden_size) - TODO confirm against caller.
             attention_mask: Optional (batch, seq_len) tensor, non-zero for
                 positions to include in the mean. When omitted, every
                 position - padding included - contributes equally, which is
                 the original behaviour.

         Returns:
             Logits with ``num_labels`` entries per batch row.
         """
         if attention_mask is None:
             # Plain mean over the sequence dimension
             pooled_output = hidden_states.mean(dim=1)
         else:
             # Masked mean: zero out excluded positions, divide by the number kept
             mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
             summed = (hidden_states * mask).sum(dim=1)
             # clamp guards against division by zero for an all-masked row
             kept = mask.sum(dim=1).clamp(min=1)
             pooled_output = summed / kept
         # Apply a linear transformation to the pooled hidden states
         logits = self.dense(pooled_output)
         return logits

# Replace the classifier with the custom classification head, sized from the
# model's own config instead of hard-coded numbers
model.classifier = CustomClassificationHead(
    hidden_size=model.config.hidden_size,
    num_labels=model.config.num_labels,
)


In [51]:
# Freeze every parameter first, then re-enable gradients for the classifier head only
for frozen_param in model.parameters():
    frozen_param.requires_grad_(False)
for head_param in model.classifier.parameters():
    head_param.requires_grad_(True)

In [52]:
# Define a dataset
class AirlineSentimentDataset(Dataset):
    """Dataset of preprocessed, tokenised texts paired with integer labels.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of integer class ids, aligned index-by-index with ``texts``.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length each example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        text = preprocess(self.texts[idx])
        label = self.labels[idx]
        # Call the tokenizer directly; `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        # so the DataLoader can stack examples into a batch itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


    # Initialize your dataset and dataloader
dataset = AirlineSentimentDataset(X_train, y_train, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)  # You can change the batch_size depending on your GPU memory

# Initialize optimizer over the parameters that can actually receive gradients
# (frozen parameters never get a gradient, so they do not need optimizer slots)
trainable_params = [param for param in model.parameters() if param.requires_grad]
# NOTE(review): 1e-5 is a small step size when only a freshly initialised head
# is being trained - consider tuning it.
optimizer = AdamW(trainable_params, lr=1e-5)

# Use the GPU when one is present, and put the model on the same device as the
# batches: a model left on the CPU cannot consume CUDA tensors.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Training loop
num_epochs = 1  # You can change the number of epochs
for epoch in range(num_epochs):
    print(f'Starting epoch {epoch+1}')
    for i, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Reports the loss of the last batch of the epoch
    print(f'Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}')

Starting epoch 1
Starting batch 1
End batch 1
Starting batch 2
End batch 2
Starting batch 3
End batch 3
Starting batch 4
End batch 4
Starting batch 5
End batch 5
Starting batch 6
End batch 6
Starting batch 7
End batch 7
Starting batch 8
End batch 8
Starting batch 9
End batch 9
Starting batch 10
End batch 10
Starting batch 11
End batch 11
Starting batch 12
End batch 12
Starting batch 13
End batch 13
Starting batch 14
End batch 14
Starting batch 15
End batch 15
Starting batch 16
End batch 16
Starting batch 17
End batch 17
Starting batch 18
End batch 18
Starting batch 19
End batch 19
Starting batch 20
End batch 20
Starting batch 21
End batch 21
Starting batch 22
End batch 22
Starting batch 23
End batch 23
Starting batch 24
End batch 24
Starting batch 25
End batch 25
Starting batch 26
End batch 26
Starting batch 27
End batch 27
Starting batch 28
End batch 28
Starting batch 29
End batch 29
Starting batch 30
End batch 30
Starting batch 31
End batch 31
Starting batch 32
End batch 32
Starting 

End batch 255
Starting batch 256
End batch 256
Starting batch 257
End batch 257
Starting batch 258
End batch 258
Starting batch 259
End batch 259
Starting batch 260
End batch 260
Starting batch 261
End batch 261
Starting batch 262
End batch 262
Starting batch 263
End batch 263
Starting batch 264
End batch 264
Starting batch 265
End batch 265
Starting batch 266
End batch 266
Starting batch 267
End batch 267
Starting batch 268
End batch 268
Starting batch 269
End batch 269
Starting batch 270
End batch 270
Starting batch 271
End batch 271
Starting batch 272
End batch 272
Starting batch 273
End batch 273
Starting batch 274
End batch 274
Starting batch 275
End batch 275
Starting batch 276
End batch 276
Starting batch 277
End batch 277
Starting batch 278
End batch 278
Starting batch 279
End batch 279
Starting batch 280
End batch 280
Starting batch 281
End batch 281
Starting batch 282
End batch 282
Starting batch 283
End batch 283
Starting batch 284
End batch 284
Starting batch 285
End batch 

In [None]:
# Pull a single batch out of the loader and unpack its three tensors
dataiter = iter(dataloader)
batch = next(dataiter)
inputs, attention_mask, labels = (
    batch[field] for field in ('input_ids', 'attention_mask', 'labels')
)

In [None]:
# Show the shape of each tensor in the batch, one per line
print(inputs.shape, attention_mask.shape, labels.shape, sep="\n")

In [66]:
# Score one sentence with the model after training, on the CPU
model.to('cpu')
text = preprocess('American Airlines is super great')
# encode the text
encoded_input = tokenizer(text, return_tensors='pt')
# perform sentiment classification; no_grad because this is inference only
with torch.no_grad():
    output = model(**encoded_input)
# output[0] holds the logits; row 0 is the single input sentence
scores = output[0][0].detach().cpu().numpy()
scores = softmax(scores)
# get the predicted sentiment label
ranking = np.argsort(scores)[::-1]
label = config.id2label[int(ranking[0])]

In [68]:
# Display the softmax scores from the most recent inference run
scores

array([0.2980683 , 0.40511227, 0.29681948], dtype=float32)