In [None]:
pip install transformers datasets torch scikit-learn streamlit boto3

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting boto3
  Downloading boto3-1.36.7-py3-none-any.whl.metadata (6.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.0 MB/s[0m eta 

In [None]:
import pandas as pd

# Read the Twitter CSV with header=None, so its first row is treated as data,
# and attach the four column names in the same expression.
url = "https://raw.githubusercontent.com/GuviMentor88/Training-Datasets/refs/heads/main/twitter_training.csv"
df = pd.read_csv(url, encoding='ISO-8859-1', header=None).set_axis(
    ['Tweet_ID', 'Entity', 'label', 'text'], axis='columns'
)
print(df.head())

   Tweet_ID       Entity     label  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [None]:
# List the distinct values currently stored in the label column
pd.unique(df['label'])

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [None]:
# Fold the 'Irrelevant' class into 'Neutral' so three sentiment classes remain
df['label'] = df['label'].replace('Irrelevant', 'Neutral')

In [None]:
# Re-check the distinct label values after the merge
pd.unique(df['label'])

array(['Positive', 'Neutral', 'Negative'], dtype=object)

In [None]:
# Integer ids used as training targets for each sentiment class.
label_mapping = {"Positive": 2, "Neutral": 1, "Negative": 0}

# Encode only while the column still holds the string labels. Mapping an
# already-encoded column would turn every value into NaN (no key matches),
# so this guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_mapping)

In [None]:
# Keep the first occurrence of each distinct tweet text, then discard rows
# that lack either a text or a label, and renumber the index once at the end.
df = (
    df.drop_duplicates(subset=['text'], keep='first')
      .dropna(subset=['text', 'label'])
      .reset_index(drop=True)
)


In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources needed by the stop-word list, the tokenizer and
# the lemmatizer ('punkt_tab' is downloaded alongside 'punkt').
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Shared helpers used by clean_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    The text is lower-cased, stripped of URLs, @mentions, '#' signs and
    punctuation (apostrophes are kept), tokenized, filtered against the
    English stop-word set and lemmatized.

    Args:
        text: The raw tweet. Any non-string value (e.g. NaN) is accepted.

    Returns:
        str: The cleaned text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Applied in order: URLs, then @mentions and bare '#' characters (the
    # hashtag word itself stays), then anything that is not a word character,
    # whitespace or apostrophe. Digits count as word characters and are kept.
    for pattern in (r"http\S+|www\S+|https\S+", r'@\w+|#', r"[^\w\s']"):
        lowered = re.sub(pattern, '', lowered)

    kept = []
    for token in word_tokenize(lowered):
        if token not in stop_words:
            kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Derive the cleaned-text column by running clean_text over every raw tweet
df['Cleaned_Tweet_Content'] = df['text'].apply(clean_text)

# Preview raw and cleaned text side by side
preview_columns = ['text', 'Cleaned_Tweet_Content']
print("\nSample Cleaned Tweets:")
print(df[preview_columns].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Sample Cleaned Tweets:
                                                text  \
0  im getting on borderlands and i will murder yo...   
1  I am coming to the borders and I will kill you...   
2  im getting on borderlands and i will kill you ...   
3  im coming on borderlands and i will murder you...   
4  im getting on borderlands 2 and i will murder ...   

            Cleaned_Tweet_Content  
0    im getting borderland murder  
1              coming border kill  
2      im getting borderland kill  
3     im coming borderland murder  
4  im getting borderland 2 murder  


In [None]:
from transformers import BertTokenizer

# Tokenizer for the same 'bert-base-uncased' checkpoint the classifier starts from
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

def tokenize_data(example):
    """Tokenize the cleaned tweet text of one example (or one batch).

    Args:
        example: Mapping with a 'Cleaned_Tweet_Content' entry holding the
            text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, with
        sequences truncated and padded to a fixed length of 128 tokens.
    """
    cleaned_text = example['Cleaned_Tweet_Content']
    encoding_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(cleaned_text, **encoding_options)

from datasets import Dataset

# Wrap the cleaned text and integer labels in a Hugging Face Dataset and
# tokenize it in batches.
dataset = Dataset.from_pandas(df[['Cleaned_Tweet_Content', 'label']])
tokenized_data = dataset.map(tokenize_data, batched=True)

# Hold out 20% for evaluation. The seed pins the shuffle so the train/test
# membership, and therefore the reported metrics, are the same on every run.
tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=42)


Map:   0%|          | 0/69491 [00:00<?, ? examples/s]

In [None]:
from transformers import BertForSequenceClassification

# Record the class names in the model config so the saved checkpoint is
# self-describing instead of only exposing generic label names.
id2label = {index: name for name, index in label_mapping.items()}
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=dict(label_mapping),
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters: 3 epochs, evaluation at the end of each epoch,
# checkpoints under ./results and logs under ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into summary classification metrics.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict: ``accuracy`` plus class-weighted ``f1``, ``precision`` and
        ``recall``.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    # The fourth element (support) is not reported.
    weighted_scores = precision_recall_fscore_support(labels, predicted, average='weighted')
    precision, recall, f1 = weighted_scores[:3]

    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }




In [None]:
# Pick the compute device: CUDA when a GPU is visible, CPU otherwise.
import torch

gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

if gpu_present:
    print("GPU is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU: Tesla T4


In [None]:
# Wire the model, hyperparameters, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer in recent transformers releases.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4635,0.458276,0.824232,0.824078,0.825245,0.824232
2,0.1679,0.36329,0.880999,0.881664,0.88524,0.880999
3,0.1364,0.375858,0.903806,0.903856,0.903953,0.903806


TrainOutput(global_step=10425, training_loss=0.3447264878755565, metrics={'train_runtime': 4149.5403, 'train_samples_per_second': 40.191, 'train_steps_per_second': 2.512, 'total_flos': 1.0970250838751232e+16, 'train_loss': 0.3447264878755565, 'epoch': 3.0})

In [None]:
import torch
from google.colab import drive
from transformers import BertTokenizer, BertForSequenceClassification

# The checkpoint lives on Google Drive, so the drive must be mounted before
# the path below can be read.
drive.mount('/content/drive')

model_name = "/content/drive/My Drive/BERT_Finetuned_model"

# When training ran in this session, write the freshly fine-tuned weights out
# first. Otherwise, on a top-to-bottom run, the reload below would either fail
# (nothing saved yet) or silently replace `model` with an older checkpoint.
if 'trainer' in globals() and trainer.state.global_step > 0:
    trainer.model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

# Load the fine-tuned model and tokenizer back from Drive
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Switch to evaluation mode for inference
model.eval()

# Prediction using the model and tokenizer
def predict(text):
    """Classify the sentiment of a single piece of text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to classify.

    Returns:
        tuple[str, str]: The predicted class name and a formatted string of
        all class probabilities, e.g.
        ``"Negative: 0.0009, Neutral: 0.0002, Positive: 0.9988"``.
    """
    # NOTE(review): the training cells tokenize the cleaned text column, while
    # this function tokenizes the text as given — confirm whether inputs should
    # be cleaned the same way before prediction.
    class_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Put the input tensors on whatever device the model lives on, so the call
    # also works when the model has been moved to a GPU.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**encoded).logits

    # squeeze(0) drops only the batch axis of the single-text batch
    probabilities_tensor = torch.softmax(logits, dim=1).squeeze(0)
    predicted_class = torch.argmax(probabilities_tensor).item()

    formatted_probabilities = ", ".join(
        f"{class_mapping[index]}: {prob:.4f}"
        for index, prob in enumerate(probabilities_tensor.tolist())
    )

    return class_mapping[predicted_class], formatted_probabilities

# Smoke-test the predictor on one example sentence and show both results
sample_text = "I absolutely love this product! Highly recommend it."
predicted_label, probabilities = predict(sample_text)

print(
    f"Predicted Sentiment: {predicted_label}",
    f"Probabilities: {probabilities}",
    sep="\n",
)


Predicted Sentiment: Positive
Probabilities: Negative: 0.0009, Neutral: 0.0002, Positive: 0.9988


In [None]:
# Mount Google Drive so paths under /content/drive are available
from google.colab import drive
drive.mount('/content/drive')

# Write the model and then the tokenizer into one Drive folder. The tokenizer
# call is the last expression, so its returned file paths are what the cell
# displays.
# NOTE(review): the same folder is loaded by an earlier cell in this notebook;
# confirm the intended execution order of the save and load steps.
save_directory = "/content/drive/My Drive/BERT_Finetuned_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

Mounted at /content/drive


('/content/drive/My Drive/BERT_Finetuned_model/tokenizer_config.json',
 '/content/drive/My Drive/BERT_Finetuned_model/special_tokens_map.json',
 '/content/drive/My Drive/BERT_Finetuned_model/vocab.txt',
 '/content/drive/My Drive/BERT_Finetuned_model/added_tokens.json')