In [None]:
# 1. Install and Import Dependencies
!pip install torch transformers pandas numpy scikit-learn

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2. Load and Prepare a Subset of the Dataset
df = pd.read_csv('train.csv')

# Check for class balance
print(df['sentiment'].value_counts())  # To check if the dataset is balanced

# Encode the string labels explicitly: 1 for positive, 0 for negative.
# An explicit mapping (rather than "anything that is not 'pos' becomes 0")
# makes missing or unexpected label values fail loudly instead of being
# silently trained on as negatives.
label_ids = df['sentiment'].map({'pos': 1, 'neg': 0})
if label_ids.isna().any():
    unexpected_labels = df.loc[label_ids.isna(), 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels in train.csv: {unexpected_labels}")
df['sentiment'] = label_ids.astype(int)

# Use only 20% of the dataset for faster training (fixed seed for reproducibility)
df_subset = df.sample(frac=0.2, random_state=42)

# Hold out 20% of the subset for validation, again with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_subset['text'].tolist(), df_subset['sentiment'].tolist(), test_size=0.2, random_state=42
)

# 3. Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    """Tokenize `texts` with the notebook-level BERT tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        texts: Passed straight to the tokenizer; this notebook calls it with
            lists of strings.

    Returns:
        Whatever the tokenizer call returns for `texts`.
    """
    return tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding='max_length',
    )

# Encode the training and validation texts with identical settings.
train_encodings, val_encodings = map(tokenize_function, (train_texts, val_texts))

# Convert to Torch Datasets
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection of
    per-sample values; `labels` is an indexable collection of per-sample
    labels. Each item is a dict of tensors with one entry per encoding field
    plus a 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# 4. Model Initialization
# Two output labels: 0 = negative, 1 = positive.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 5. Training Configuration
# The per-epoch evaluation option is spelled `evaluation_strategy` in older
# transformers releases and `eval_strategy` in newer ones. The pip install at
# the top of the notebook is unpinned, so pick whichever keyword the installed
# version accepts instead of failing with a TypeError.
import inspect

_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_kwarg: 'epoch'},  # evaluate on the validation set after every epoch
)

# 6. Metrics Calculation
def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    `p` unpacks into `(predictions, labels)`. The predicted class is the
    argmax over the last axis of `predictions`. Precision, recall and F1 are
    computed with scikit-learn's `average='binary'` setting.

    Returns:
        A dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    scores, y_true = p
    y_pred = scores.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    accuracy = accuracy_score(y_true, y_pred)
    return dict(accuracy=accuracy, f1=f_score, precision=prec, recall=rec)

# Trainer object: ties together the model, the training configuration, both
# datasets and the metric function defined above.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# 7. Fine-Tuning
trainer.train()

# 8. Save the Model
# Model and tokenizer files are written to the same directory so both can be
# reloaded from a single path.
model.save_pretrained('./finetuned_model')
tokenizer.save_pretrained('./finetuned_model')

sentiment
neg    2531
pos    2469
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.31757,0.89,0.882979,0.873684,0.892473
2,No log,0.472134,0.91,0.90625,0.878788,0.935484
3,0.334200,0.532273,0.9,0.892473,0.892473,0.892473


('./finetuned_model/tokenizer_config.json',
 './finetuned_model/special_tokens_map.json',
 './finetuned_model/vocab.txt',
 './finetuned_model/added_tokens.json')

In [None]:
# If using Google Colab and want to save to Drive:
# This cell only works inside Colab, where the `google.colab` package exists.
from google.colab import drive
# Drive must be mounted before anything can be written under /content/drive.
drive.mount('/content/drive')
# Write the model and tokenizer files to the same Drive folder so both can be
# reloaded from a single path.
model.save_pretrained('/content/drive/MyDrive/finetuned_model')
tokenizer.save_pretrained('/content/drive/MyDrive/finetuned_model')



Mounted at /content/drive


('/content/drive/MyDrive/finetuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/finetuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/finetuned_model/vocab.txt',
 '/content/drive/MyDrive/finetuned_model/added_tokens.json')

In [None]:
# 9. Load the Test Dataset for Evaluation
sample_df = pd.read_csv('test.csv')

# Encode labels explicitly (1 = positive, 0 = negative) and fail loudly on any
# missing or unexpected label instead of silently treating it as negative.
test_label_ids = sample_df['sentiment'].map({'pos': 1, 'neg': 0})
if test_label_ids.isna().any():
    unexpected_test_labels = sample_df.loc[test_label_ids.isna(), 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels in test.csv: {unexpected_test_labels}")
sample_df['sentiment'] = test_label_ids.astype(int)

# Extract 20% sample for testing. A fixed seed (the same one used for the
# training subset) makes the reported test metrics reproducible across runs.
test_df = sample_df.sample(frac=0.2, random_state=42)

# Extract texts and labels
test_texts = test_df['text'].tolist()
test_labels = test_df['sentiment'].tolist()

# Tokenize the test data
test_encodings = tokenize_function(test_texts)

# Create test dataset
test_dataset = CustomDataset(test_encodings, test_labels)

# Evaluate the model on the test set; the values returned by compute_metrics
# are read back below under an 'eval_' prefix.
test_results = trainer.evaluate(test_dataset)
print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test F1-Score: {test_results['eval_f1']:.4f}")
print(f"Test Precision: {test_results['eval_precision']:.4f}")
print(f"Test Recall: {test_results['eval_recall']:.4f}")



Test accuracy: 0.9400
Test F1-Score: 0.9388
Test Precision: 0.9293
Test Recall: 0.9485


In [None]:
# 10. Prediction Function for New Data
def predict_sentiment(text):
    """Classify a single text as "positive" or "negative".

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: The text to classify. Only a single input is supported, because
            the predicted class is extracted with `.item()`.

    Returns:
        "positive" if the highest-scoring class index is 1, otherwise
        "negative".
    """
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Move the encoding to the same device as the model
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # Run inference in eval mode so train-only layers such as dropout do not
    # make the prediction stochastic, then restore whatever mode the model was
    # in so this helper has no lasting side effect on it.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
    finally:
        model.train(was_training)

    predicted_class = torch.argmax(logits, dim=1).item()
    return "positive" if predicted_class == 1 else "negative"

# Example usage: classify one sentence with the notebook-level model and print
# the predicted label.
new_text = "The movie was fantastic!"
print(f"Sentiment for '{new_text}': {predict_sentiment(new_text)}")

Sentiment for 'The movie was fantastic!': positive


In [None]:
# Second example, this time with a negative sentence.
# NOTE: this relies on the one-argument predict_sentiment(text) defined in the
# earlier cell. A later cell redefines predict_sentiment with the signature
# (text, model, tokenizer), so re-running this cell after that one will fail.
new_text = "I didn't like that film"
print(f"Sentiment for '{new_text}': \n{predict_sentiment(new_text)}")

Sentiment for 'I didn't like that film': 
negative


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from google.colab import drive

def mount_drive(mount_point='/content/drive'):
    """Mount Google Drive in the Colab runtime.

    Args:
        mount_point: Directory to mount Drive at. Defaults to
            '/content/drive', the location the rest of this notebook uses.
    """
    drive.mount(mount_point)

def load_model_from_drive(model_path):
    """Load the sequence-classification model and tokenizer saved at `model_path`.

    Args:
        model_path: Location handed to both `from_pretrained` calls.

    Returns:
        A `(model, tokenizer)` tuple; the model is loaded first.
    """
    return (
        BertForSequenceClassification.from_pretrained(model_path),
        BertTokenizer.from_pretrained(model_path),
    )

def predict_sentiment(text, model, tokenizer):
    """Classify a single text as "positive" or "negative".

    Args:
        text: The text to classify. Only a single input is supported, because
            the predicted class is extracted with `.item()`.
        model: Classifier whose call returns an object with a `.logits`
            tensor and which exposes a `.device` attribute.
        tokenizer: Callable that turns `text` into a mapping of tensors.

    Returns:
        "positive" if the highest-scoring class index is 1, otherwise
        "negative".
    """
    encoding = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}  # Move to the correct device

    # Run inference in eval mode so train-only layers such as dropout do not
    # make the prediction stochastic, then restore the caller's mode so this
    # helper has no lasting side effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**encoding).logits
    finally:
        model.train(was_training)

    predicted_class = torch.argmax(logits, dim=1).item()
    return "positive" if predicted_class == 1 else "negative"

# Mount Google Drive
mount_drive()

# Load the model and tokenizer from the specified path
# NOTE: this rebinds the notebook-level `model` and `tokenizer` names to the
# copies loaded from Drive.
model_path = "/content/drive/MyDrive/finetuned_model"
model, tokenizer = load_model_from_drive(model_path)

# Example usage
# Uses the three-argument predict_sentiment defined in this cell.
text = "The movie was fantastic!"
print(f"Sentiment for '{text}': {predict_sentiment(text, model, tokenizer)}")


Mounted at /content/drive
Sentiment for 'The movie was fantastic!': positive
