In [14]:
pip install transformers



In [15]:
pip install sentencepiece --user



In [16]:
import pandas as pd
import torch
from transformers import XLNetForSequenceClassification, XLNetTokenizer
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset

# Load the test dataset
test_data = pd.read_csv('/content/test.csv')

# Create the XLNet model and tokenizer
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Clean the data
def clean_data(data):
    data['tweet'] = data['tweet'].str.replace('@', '')
    data['tweet'] = data['tweet'].str.replace('#', '')
    data['tweet'] = data['tweet'].str.replace('RT', '')
    data['tweet'] = data['tweet'].str.lower()
    return data

test_data = clean_data(test_data)

# Tokenize the data
test_encodings = tokenizer(list(test_data['tweet']), truncation=True, padding=True, return_tensors='pt')

# Create the DataLoader
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
batch_size = 8  # You can adjust this value according to your memory constraints
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Predict the sentiment
model.eval()
predicted_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_batch_labels = torch.argmax(outputs.logits, dim=1)
        predicted_labels.extend(predicted_batch_labels.cpu().numpy())

# Add the predicted labels to the test data
test_data['label'] = predicted_labels

# Save the results to a file
test_data.to_csv('/content/results.csv', index=False)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the results with predicted labels
results_data = pd.read_csv('/content/results.csv')

# Load the actual labels from someone else's results
actual_data = pd.read_csv('/content/actual.csv')

# Extract actual and predicted labels
actual_labels = actual_data['label']
predicted_labels = results_data['label']

# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)

print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.65
