In [1]:
#install necessary libraries
!pip install transformers datasets pandas scikit-learn

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import zipfile
from io import BytesIO
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import DataCollatorWithPadding

In [7]:

# Path to the ZIP file
zip_file_path = r"/content/twitter_training.csv (1).zip"

# Open the ZIP file and read the CSV file inside
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    csv_file_name = zip_ref.namelist()[0]
    with zip_ref.open(csv_file_name) as file:
        df = pd.read_csv(file, header = None, names = ['tweet_id', 'entity', 'sentiment', 'tweet_content'])



# Display the first few rows of the dataset
print(df.head())

   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                       tweet_content  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [8]:
df['tweet_content'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['tweet_content'].fillna('', inplace=True)


In [9]:
# Inspecting label encoder, by counting how many unique values there are in each column
print(df['sentiment'].value_counts())

# Remove rows where sentiment is 'irrelevant'
df = df[df['sentiment'] != 'Irrelevant']

#Check to see if it worked
print(df['sentiment'].value_counts())

sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64
sentiment
Negative    22542
Positive    20832
Neutral     18318
Name: count, dtype: int64


In [10]:
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:

# Preprocess text data
def preprocess_data(text):
    return tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Apply preprocessing to the 'text' column
inputs = df['tweet_content'].apply(preprocess_data)

# Extract the labels
labels = df['sentiment_encoded'].values

In [12]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)

class SentimentAnalysisDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
       # Get the sample from the pandas series at the index idx
        sample = self.inputs.iloc[idx]

        # Create the dictionary for the data
        item = {}
        item['input_ids'] = sample['input_ids'].squeeze(0) # Remove the extra dimension
        item['attention_mask'] = sample['attention_mask'].squeeze(0) # Remove the extra dimension
        item['labels'] = torch.tensor(self.labels[idx]) # Create the tensor for the labels
        return item


In [13]:
# Create the dataset and DataLoader
train_dataset = SentimentAnalysisDataset(X_train, y_train)
test_dataset = SentimentAnalysisDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [15]:
# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [16]:
# Training setup
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,               # Number of training epochs
    per_device_train_batch_size=16,   # Batch size per device during training
    per_device_eval_batch_size=64,    # Batch size per device during evaluation
    warmup_steps=500,                 # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # Strength of weight decay
    logging_dir='./logs',             # Directory for storing logs
    logging_steps=10,                 # Log every X updates
)
# Create the data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,                          # The pre-trained BERT model
    args=training_args,                   # Training arguments
    train_dataset=train_dataset,          # Training dataset
    eval_dataset=test_dataset,
    data_collator=data_collator # Evaluation dataset
)



In [17]:
# Training the model
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhiranyaindrakanti[0m ([33mhiranyaindrakanti-scsvmv[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.0976
20,1.1087
30,1.105
40,1.0774
50,1.0787
60,1.0767
70,1.0846
80,1.0729
90,1.0685
100,1.0399


TrainOutput(global_step=9255, training_loss=0.29827572249202133, metrics={'train_runtime': 2326.3258, 'train_samples_per_second': 63.645, 'train_steps_per_second': 3.978, 'total_flos': 5709574525925844.0, 'train_loss': 0.29827572249202133, 'epoch': 3.0})

In [18]:
# Evaluation
predictions, true_labels, _ = trainer.predict(test_dataset)


In [20]:
import numpy as np
# Convert logits to predicted labels
pred_labels = np.argmax(predictions, axis=1)


In [22]:
# Evaluate model performance
accuracy = accuracy_score(true_labels, pred_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Accuracy: 0.9323
Precision: 0.9329
Recall: 0.9323
F1-score: 0.9324


In [23]:
# Save the model
model.save_pretrained('./sentiment_analysis_model')
tokenizer.save_pretrained('./sentiment_analysis_model')


('./sentiment_analysis_model/tokenizer_config.json',
 './sentiment_analysis_model/special_tokens_map.json',
 './sentiment_analysis_model/vocab.txt',
 './sentiment_analysis_model/added_tokens.json')

In [25]:
# Inference example
from transformers import pipeline
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
result = nlp("I love this product! It's amazing!")
print(result)

Device set to use cuda:0


[{'label': 'LABEL_2', 'score': 0.9994962215423584}]
