<a href="https://colab.research.google.com/github/SilviaDragan/2D-ShooterGame/blob/master/ssXLNETsa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Semi-Supervised Sentiment Analysis with XLNet

### Setup and Data Preparation



In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json

# Load the dataset.
# The file is opened explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding (tweets routinely contain non-ASCII text).
file_path = '/content/updated_twitter_sa_1400_cleaned.json'
with open(file_path, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

# Split the data into labeled and unlabeled parts: entries whose 'sentiment'
# is the string 'unknown' are treated as unlabeled, everything else as labeled.
# NOTE(review): assumes every entry is a mapping with 'text' and 'sentiment'
# keys - confirm against the JSON file.
labeled_data = [item for item in dataset if item['sentiment'] != 'unknown']
unlabeled_data = [item for item in dataset if item['sentiment'] == 'unknown']

# Extract texts and sentiments from the labeled data (index-aligned lists).
labeled_texts = [item['text'] for item in labeled_data]
labeled_sentiments = [item['sentiment'] for item in labeled_data]

# Initialize the tokenizer for XLNet
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

class EmotionDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, optionally labeled.

    Args:
        texts: Sized, indexable collection of texts. Each entry is converted
            with ``str()`` before being tokenized.
        emotions: Optional collection of emotion names aligned with ``texts``.
            When ``None``, items carry no ``'labels'`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
            Its result must provide ``'input_ids'`` and ``'attention_mask'``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
        label_map: Optional mapping from emotion name to integer class id.
            When ``None``, the module-level ``emotion_to_label`` is looked up
            at item-access time.

    Raises:
        ValueError: If ``emotions`` is given and its length differs from
            that of ``texts``.
    """

    def __init__(self, texts, emotions=None, tokenizer=None, max_length=256,
                 label_map=None):
        if emotions is not None and len(emotions) != len(texts):
            raise ValueError(
                f"texts and emotions must have the same length "
                f"(got {len(texts)} and {len(emotions)})"
            )
        self.texts = texts
        self.emotions = emotions
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = label_map

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` as a dict of tensors.

        The dict holds ``'input_ids'`` and ``'attention_mask'`` and, when
        emotions were supplied, ``'labels'`` (a scalar ``torch.long`` tensor).

        Raises:
            ValueError: If the emotion at ``idx`` is not in the label mapping.
        """
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Flatten to 1-D so that batching stacks items along a new first axis.
        inputs = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        if self.emotions is not None:
            label = self._label_for(self.emotions[idx])
            inputs['labels'] = torch.tensor(label, dtype=torch.long)
        return inputs

    def _label_for(self, emotion):
        """Translate an emotion name to its class id, rejecting unknown names."""
        mapping = self.label_map if self.label_map is not None else emotion_to_label
        try:
            return mapping[emotion]
        except KeyError:
            # Fail here with a readable message instead of emitting -1, which
            # is not a valid class index and only fails later inside the loss.
            raise ValueError(
                f"Unknown emotion {emotion!r}; expected one of {list(mapping)}"
            ) from None

# Integer class id for every emotion name the classifier distinguishes.
emotion_to_label = {
    'sadness': 0,
    'anger': 1,
    'amusement': 2,
    'surprise': 3,
    'fear': 4,
    'happiness': 5,
    'other': 6,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained XLNet with a classification head sized to the label set,
# moved onto the selected device.
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=len(emotion_to_label),
).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training on Labeled Data

In [None]:
# Hold out 10% of the labeled data, then cut that hold-out in half:
# the result is a 90% train / 5% validation / 5% test split.
texts_train, texts_temp, emotions_train, emotions_temp = train_test_split(
    labeled_texts,
    labeled_sentiments,
    test_size=0.1,
    random_state=42,
)
texts_val, texts_test, emotions_val, emotions_test = train_test_split(
    texts_temp,
    emotions_temp,
    test_size=0.5,
    random_state=42,
)

# One dataset per split ...
train_dataset = EmotionDataset(texts_train, emotions_train, tokenizer)
val_dataset = EmotionDataset(texts_val, emotions_val, tokenizer)
test_dataset = EmotionDataset(texts_test, emotions_test, tokenizer)

# ... and one loader per dataset; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Fresh XLNet sequence classifier, placed on the selected device.
model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased",
    num_labels=len(emotion_to_label),
).to(device)

# Optimizer and loss function.
# Note: loss_fn is not referenced by the loop below, which reads the loss
# from the model output (outputs.loss) instead.
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tune on the labeled training split.
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    progress = tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}/{num_epochs}')
    for batch in progress:
        # Move every tensor of the batch (including 'labels') to the device.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 315/315 [01:25<00:00,  3.67it/s]
Training Epoch 2/3: 100%|██████████| 315/315 [01:27<00:00,  3.60it/s]
Training Epoch 3/3: 100%|██████████| 315/315 [01:27<00:00,  3.60it/s]


### Model Evaluation

In [None]:
# Measure accuracy on the validation split.
model.eval()
correct_predictions, total_samples = 0, 0

with torch.no_grad():  # inference only, no gradients needed
    for val_batch in tqdm(val_dataloader, desc='Validation'):
        val_inputs = {name: tensor.to(device) for name, tensor in val_batch.items()}
        val_labels = val_inputs['labels']

        # The predicted class is the index of the largest logit per sample.
        val_outputs = model(**val_inputs)
        val_logits = val_outputs.logits
        val_predictions = val_logits.argmax(dim=1)

        matches = val_predictions.eq(val_labels)
        correct_predictions += matches.sum().item()
        total_samples += val_labels.shape[0]

accuracy = correct_predictions / total_samples
print(f'Validation Accuracy: {accuracy}')


Validation: 100%|██████████| 18/18 [00:01<00:00,  9.32it/s]

Validation Accuracy: 0.6571428571428571





### Pseudo-label Generation and Retraining on Combined Dataset

In [None]:
# Minimum softmax confidence a prediction needs in order to be kept as a
# pseudo-label. 0.0 keeps every unlabeled sample; raise it (e.g. to 0.9) to
# leave low-confidence, likely-noisy pseudo-labels out of retraining.
PSEUDO_LABEL_MIN_CONFIDENCE = 0.0

unlabeled_texts = [item['text'] for item in unlabeled_data]

# No emotions are passed, so these items carry no 'labels' entry.
unlabeled_dataset = EmotionDataset(unlabeled_texts, tokenizer=tokenizer)
unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=4, shuffle=False)

# Generating pseudo-labels (and their confidences) for the unlabeled data.
# shuffle=False above keeps predictions index-aligned with unlabeled_texts.
pseudo_labels = []
pseudo_confidences = []
model.eval()
with torch.no_grad():
    for batch in tqdm(unlabeled_dataloader, desc='Generating Pseudo-Labels'):
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        pseudo_label_batch = torch.argmax(outputs.logits, dim=1)
        confidence_batch = torch.softmax(outputs.logits, dim=1).max(dim=1).values
        # .tolist() yields plain Python ints/floats rather than tensor scalars.
        pseudo_labels.extend(pseudo_label_batch.tolist())
        pseudo_confidences.extend(confidence_batch.tolist())

# Keep only the pseudo-labeled samples that reach the confidence threshold.
label_to_emotion = {v: k for k, v in emotion_to_label.items()}
confident_samples = [
    (text, label_to_emotion[label])
    for text, label, confidence in zip(unlabeled_texts, pseudo_labels, pseudo_confidences)
    if confidence >= PSEUDO_LABEL_MIN_CONFIDENCE
]
pseudo_texts = [text for text, _ in confident_samples]
pseudo_emotions = [emotion for _, emotion in confident_samples]

# Combining labeled training data and pseudo-labeled data.
# EmotionDataset expects emotion names, hence the id -> name conversion above.
combined_texts = texts_train + pseudo_texts
combined_emotions = emotions_train + pseudo_emotions

# Creating new train dataset with combined data
combined_train_dataset = EmotionDataset(combined_texts, combined_emotions, tokenizer)
combined_train_dataloader = DataLoader(combined_train_dataset, batch_size=4, shuffle=True)

# Continue training the same model (and optimizer) on the combined dataset.
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(combined_train_dataloader, desc=f'Retraining Epoch {epoch + 1}/{num_epochs}'):
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


Generating Pseudo-Labels: 100%|██████████| 1150/1150 [02:02<00:00,  9.36it/s]
Retraining Epoch 1/3: 100%|██████████| 1465/1465 [06:46<00:00,  3.60it/s]
Retraining Epoch 2/3: 100%|██████████| 1465/1465 [06:46<00:00,  3.60it/s]
Retraining Epoch 3/3: 100%|██████████| 1465/1465 [06:46<00:00,  3.60it/s]


### Evaluation on Test Set

In [None]:
# Measure accuracy on the held-out test split.
model.eval()
correct_predictions_test, total_samples_test = 0, 0

with torch.no_grad():  # inference only, no gradients needed
    for test_batch in tqdm(test_dataloader, desc='Testing'):
        test_inputs = {name: tensor.to(device) for name, tensor in test_batch.items()}
        test_labels = test_inputs['labels']

        # The predicted class is the index of the largest logit per sample.
        test_outputs = model(**test_inputs)
        test_logits = test_outputs.logits
        test_predictions = test_logits.argmax(dim=1)

        test_matches = test_predictions.eq(test_labels)
        correct_predictions_test += test_matches.sum().item()
        total_samples_test += test_labels.shape[0]

test_accuracy = correct_predictions_test / total_samples_test
print(f'Test Accuracy: {test_accuracy}')


Testing: 100%|██████████| 18/18 [00:01<00:00, 10.04it/s]

Test Accuracy: 0.6285714285714286



