In [None]:
!pip install transformers datasets torch scikit-learn streamlit boto3 contractions emoji

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting boto3
  Downloading boto3-1.36.16-py3-none-any.whl.metadata (6.7 kB)
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torc

In [None]:
import pandas as pd

# Fetch the raw Twitter sentiment CSV. The file is read with header=None, so the
# four columns are named explicitly right after loading.
# NOTE(review): ISO-8859-1 decodes any byte sequence without raising; if the file
# is really UTF-8, emoji and curly quotes would arrive garbled — confirm the encoding.
column_names = ['Tweet_ID', 'Entity', 'label', 'text']
url = "https://raw.githubusercontent.com/GuviMentor88/Training-Datasets/refs/heads/main/twitter_training.csv"

df = pd.read_csv(url, header=None, encoding='ISO-8859-1')
df.columns = column_names
print(df.head())

   Tweet_ID       Entity     label  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [None]:
# Treat 'Irrelevant' tweets as 'Neutral' before the labels are turned into integers.
df['label'] = df['label'].replace('Irrelevant', 'Neutral')

In [None]:
# Sentiment name -> integer class id.
label_mapping = dict(Positive=2, Neutral=1, Negative=0)

# Values that are not keys of the mapping come out as NaN. That includes the
# integer ids themselves, so running this cell a second time wipes the column.
df['label'] = df['label'].map(label_mapping)

In [None]:
# Keep the first row for each distinct raw tweet text, then discard rows that
# lack either the text or a label, and finish with a fresh 0..n-1 index.
df = (
    df.drop_duplicates(subset=['text'], keep='first')
      .dropna(subset=['text', 'label'])
      .reset_index(drop=True)
)

In [None]:
import re
import emoji
from contractions import fix

def clean_text(text):
  """Normalise a raw tweet before it is tokenized.

  Steps, in order: lowercase, expand contractions, replace emoji with their
  text names, strip URLs, collapse runs of whitespace.

  Args:
    text: Raw tweet. Anything that is not a ``str`` (e.g. NaN) is treated as empty.

  Returns:
    The cleaned string, or ``""`` for non-string input.
  """
  if not isinstance(text, str):
    return ""

  text = text.lower()  # the model used downstream is uncased
  text = fix(text)  # expand contractions, e.g. "can't" -> "cannot"
  # Space delimiters instead of the default colons: "😊" becomes
  # " smiling_face_with_smiling_eyes " rather than ":smiling_face_with_smiling_eyes:".
  text = emoji.demojize(text, delimiters=(" ", " "))
  # "www" must start a token and be followed by a dot. The previous bare `www\S+`
  # also fired inside ordinary words ("awwww" was reduced to "a").
  text = re.sub(r"http\S+|\bwww\.\S+", '', text)
  text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace and newlines
  return text

# Store the cleaned text next to the raw text so both stay inspectable.
cleaned_tweets = df['text'].apply(clean_text)
df['Cleaned_Tweet_Content'] = cleaned_tweets

# Side-by-side preview of raw vs. cleaned text.
preview_columns = ['text', 'Cleaned_Tweet_Content']
print("\nSample Cleaned Tweets:")
print(df[preview_columns].head())


Sample Cleaned Tweets:
                                                text  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

                               Cleaned_Tweet_Content  
0  i am getting on borderlands and i will murder ...  
1  i am coming to the borders and i will kill you...  
2  i am getting on borderlands and i will kill yo...  
3  i am coming on borderlands and i will murder y...  
4  i am getting on borderlands 2 and i will murde...  


In [None]:
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(example):
    """Tokenize the cleaned tweet text of ``example``.

    Reads ``example['Cleaned_Tweet_Content']`` and returns the tokenizer output,
    truncated and padded to a fixed length of 128 tokens.
    """
    cleaned_text = example['Cleaned_Tweet_Content']
    return tokenizer(
        cleaned_text,
        max_length=128,
        truncation=True,
        padding='max_length',
    )

from datasets import Dataset
# Only the cleaned text and the integer label are carried into the HF dataset.
dataset = Dataset.from_pandas(df[['Cleaned_Tweet_Content', 'label']])
tokenized_data = dataset.map(tokenize_data, batched=True)

# 80/20 split with a fixed seed. Without one, every run draws a different split
# and the reported metrics cannot be reproduced.
# NOTE(review): the df.head() preview shows several rows sharing one Tweet_ID with
# near-identical wording; a row-level random split can put such variants on both
# sides and inflate the evaluation scores — consider splitting by Tweet_ID.
tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=42)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/69491 [00:00<?, ? examples/s]

In [None]:
from transformers import BertForSequenceClassification

# Class id -> name, matching the integer encoding applied to df['label'].
# Stored in the model config so the saved checkpoint carries real label names
# instead of the generic LABEL_0 / LABEL_1 / LABEL_2.
id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Fine-tuning configuration.
# NOTE(review): the recorded run stopped to ask for a W&B API key; set `report_to`
# explicitly if that interactive prompt is not wanted.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; the two strategies are kept equal
    # because the best checkpoint is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    """Turn an evaluation batch of ``(logits, labels)`` into summary metrics.

    The predicted class is the arg-max over the last axis of ``logits``.
    Precision, recall and F1 use ``average='weighted'``.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    # Returned tuple is (precision, recall, f1, support); support is not reported.
    scores = precision_recall_fscore_support(labels, predicted_ids, average='weighted')
    accuracy = accuracy_score(labels, predicted_ids)

    return {
        "accuracy": accuracy,
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }



In [None]:
import torch

# Pick the compute device once and report which one was chosen.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    print("GPU is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU: Tesla T4


In [None]:
# Wire the model, both data splits and the metric function into a Trainer,
# then run the fine-tuning loop.
train_split = tokenized_data['train']
eval_split = tokenized_data['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,  # scored at the end of every epoch
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpraveen22[0m ([33mpraveen22-guvi-geek-networks[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4396,0.37605,0.872797,0.872792,0.873991,0.872797
2,0.1354,0.349221,0.925318,0.92516,0.925796,0.925318
3,0.1412,0.344986,0.941722,0.941642,0.942216,0.941722
4,0.0876,0.308893,0.950788,0.950761,0.950811,0.950788
5,0.0001,0.334934,0.954313,0.954284,0.954328,0.954313


TrainOutput(global_step=34745, training_loss=0.21056119640739707, metrics={'train_runtime': 7494.6124, 'train_samples_per_second': 37.088, 'train_steps_per_second': 4.636, 'total_flos': 1.828375139791872e+16, 'train_loss': 0.21056119640739707, 'epoch': 5.0})

In [None]:
# Score the trained model on the 'test' split. This is the same split the Trainer
# used as eval_dataset during training (and therefore for picking the checkpoint
# that is reloaded at the end), so it is not an untouched hold-out set.
metrics = trainer.evaluate(eval_dataset=tokenized_data['test'])

# Loss, accuracy, F1, precision, recall plus runtime statistics.
print(metrics)


{'eval_loss': 0.3088926374912262, 'eval_accuracy': 0.9507878264623354, 'eval_f1': 0.950761245761352, 'eval_precision': 0.9508109646380692, 'eval_recall': 0.9507878264623354, 'eval_runtime': 99.7382, 'eval_samples_per_second': 139.355, 'eval_steps_per_second': 8.713, 'epoch': 5.0}


In [None]:
from google.colab import drive
# Destination folder on Google Drive for the fine-tuned weights and tokenizer files.
save_directory = "/content/drive/My Drive/BERT_Finetuned_model"

# Drive has to be mounted before anything can be written below /content/drive.
drive.mount('/content/drive')

# Model first, tokenizer second; both go to the same folder so it can be
# reloaded with from_pretrained(save_directory).
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Mounted at /content/drive


('/content/drive/My Drive/BERT_Finetuned_model/tokenizer_config.json',
 '/content/drive/My Drive/BERT_Finetuned_model/special_tokens_map.json',
 '/content/drive/My Drive/BERT_Finetuned_model/vocab.txt',
 '/content/drive/My Drive/BERT_Finetuned_model/added_tokens.json')

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the fine-tuned tokenizer and classifier from the Drive folder written earlier.
model_name = "/content/drive/My Drive/BERT_Finetuned_model"
tokenizer = BertTokenizer.from_pretrained(model_name)

# .eval() returns the module itself, so loading and switching to evaluation mode
# can be written as one expression.
model = BertForSequenceClassification.from_pretrained(model_name).eval()

# Prediction function
def predict(text):
    """Classify a single tweet with the fine-tuned model.

    Args:
        text: One raw tweet as a string.

    Returns:
        Tuple ``(label, formatted_probabilities)``: the predicted class name and
        a string such as ``"Negative: 0.0100, Neutral: 0.0900, Positive: 0.9000"``.
    """
    # Class id -> name; mirrors the integer encoding used for training labels.
    class_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

    # The model was trained on clean_text() output (contractions expanded, emoji
    # converted to words, URLs removed). Feeding raw text here would present it
    # with inputs it never saw during training.
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single input gives one row of logits; index it explicitly instead of
    # squeeze(), which would also collapse other size-1 dimensions.
    probabilities = torch.softmax(logits, dim=1)[0]
    predicted_class = int(torch.argmax(probabilities))

    formatted_probabilities = ", ".join(
        f"{class_mapping[i]}: {prob:.4f}" for i, prob in enumerate(probabilities.tolist())
    )

    return class_mapping[predicted_class], formatted_probabilities

# Hand-written probes; the trailing comment is the sentiment a reader would expect.
test_sentences = [
    "I just got promoted at work! Feeling so grateful today! 🙌",  # Positive
    "Worst customer service ever. I’m never shopping here again. 👎",  # Negative
    "Had a cup of coffee this morning. It was okay, nothing special. ☕",  # Neutral
    "The concert was amazing, but the ticket prices were ridiculous!",  # Mixed
]

# Print the tweet, the predicted label and the class probabilities, then a rule.
for sentence in test_sentences:
    predicted_label, probabilities = predict(sentence)
    report_lines = (
        f"Tweet: {sentence}",
        f"Predicted Sentiment: {predicted_label}",
        f"Probabilities: {probabilities}",
        "-" * 50,  # Separator for readability
    )
    for report_line in report_lines:
        print(report_line)


Tweet: I just got promoted at work! Feeling so grateful today! 🙌
Predicted Sentiment: Positive
Probabilities: Negative: 0.0000, Neutral: 0.0010, Positive: 0.9990
--------------------------------------------------
Tweet: Worst customer service ever. I’m never shopping here again. 👎
Predicted Sentiment: Positive
Probabilities: Negative: 0.0001, Neutral: 0.0001, Positive: 0.9998
--------------------------------------------------
Tweet: Had a cup of coffee this morning. It was okay, nothing special. ☕
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0003, Neutral: 0.9946, Positive: 0.0051
--------------------------------------------------
Tweet: The concert was amazing, but the ticket prices were ridiculous!
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0008, Neutral: 0.9990, Positive: 0.0002
--------------------------------------------------


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Second batch of probes: everyday, sentiment-free statements.
# The model, tokenizer and predict() from the previous cell are reused here; that
# cell already loads them, so reloading the checkpoint and redefining predict()
# would only duplicate code and let the two copies drift apart.
test_sentences = [
    "Just finished reading an interesting article on AI trends.",
    "Had a cup of coffee this morning. It was okay, nothing special. ☕",
    "Attending a tech conference today. Let's see how it goes.",
    "The store was busy, but I managed to find what I needed.",
    "Watched a documentary last night. Pretty informative.",
]

# Loop over test sentences and predict
for sentence in test_sentences:
    predicted_label, probabilities = predict(sentence)
    print(f"Tweet: {sentence}")
    print(f"Predicted Sentiment: {predicted_label}")
    print(f"Probabilities: {probabilities}")
    print("-" * 50)  # Separator for readability


Tweet: Just finished reading an interesting article on AI trends.
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0000, Neutral: 0.9999, Positive: 0.0000
--------------------------------------------------
Tweet: Had a cup of coffee this morning. It was okay, nothing special. ☕
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0003, Neutral: 0.9946, Positive: 0.0051
--------------------------------------------------
Tweet: Attending a tech conference today. Let's see how it goes.
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0001, Neutral: 0.9999, Positive: 0.0001
--------------------------------------------------
Tweet: The store was busy, but I managed to find what I needed.
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0084, Neutral: 0.9848, Positive: 0.0068
--------------------------------------------------
Tweet: Watched a documentary last night. Pretty informative.
Predicted Sentiment: Neutral
Probabilities: Negative: 0.0002, Neutral: 0.9995, Posi