In [1]:
!pip install transformers torch sklearn

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split

# Helper function to load data
def load_data(file_path):
    return pd.read_csv(file_path, delimiter=';', header=None, names=['text', 'label'])

# Load data
train_data = load_data('/content/train.txt')
val_data = load_data('/content/val.txt')
test_data = load_data('/content/test.txt')

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Convert labels to categorical values
label_dict = {}
for index, label in enumerate(train_data['label'].unique()):
    label_dict[label] = index

train_data['label'] = train_data['label'].replace(label_dict)
val_data['label'] = val_data['label'].replace(label_dict)
test_data['label'] = test_data['label'].replace(label_dict)

# Data loaders
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = EmotionDataset(
        texts=df.text.to_numpy(),
        labels=df.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(ds, batch_size=batch_size, num_workers=2)

# Parameters
batch_size = 16
max_len = 256

train_loader = create_data_loader(train_data, tokenizer, max_len, batch_size)
val_loader = create_data_loader(val_data, tokenizer, max_len, batch_size)
test_loader = create_data_loader(test_data, tokenizer, max_len, batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load BERT with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_dict))
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Train and evaluate the model
# Include training loop, evaluation, and save the model
# This section will be lengthy and involve setting up the training epochs, handling GPU acceleration, etc.


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from transformers import get_linear_schedule_with_warmup
import torch.nn.functional as F

# Define training parameters
epochs = 4
total_steps = len(train_loader) * epochs

# Setup the optimizer and the learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to train the model for one epoch
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_train_loss = 0
    total_train_accuracy = 0

    for step, batch in enumerate(data_loader):
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        model.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)

        loss = outputs.loss
        logits = outputs.logits
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(F.softmax(logits, dim=1), dim=1)
        total_train_accuracy += torch.sum(preds == batch_labels)

    average_train_loss = total_train_loss / len(data_loader)
    average_train_accuracy = total_train_accuracy.double() / (len(data_loader) * data_loader.batch_size)
    return average_train_loss, average_train_accuracy

# Function for evaluating the model
def eval_model(model, data_loader, device):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in data_loader:
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)

        loss = outputs.loss
        logits = outputs.logits
        total_eval_loss += loss.item()

        preds = torch.argmax(F.softmax(logits, dim=1), dim=1)
        total_eval_accuracy += torch.sum(preds == batch_labels)

    average_eval_loss = total_eval_loss / len(data_loader)
    average_eval_accuracy = total_eval_accuracy.double() / (len(data_loader) * data_loader.batch_size)
    return average_eval_loss, average_eval_accuracy

# Training and evaluation loop
for epoch in range(epochs):
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f"Epoch {epoch + 1} | Train Loss: {train_loss} | Train Accuracy: {train_accuracy}")

    val_loss, val_accuracy = eval_model(model, val_loader, device)
    print(f"Epoch {epoch + 1} | Val Loss: {val_loss} | Val Accuracy: {val_accuracy}")

# Save the model
model_path = "/content/drive/MyDrive/model_save"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


  self.pid = os.fork()
  self.pid = os.fork()


Epoch 1 | Train Loss: 0.4084482608414255 | Train Accuracy: 0.8572500000000001
Epoch 1 | Val Loss: 0.19104178422875703 | Val Accuracy: 0.9345
Epoch 2 | Train Loss: 0.13216075107152575 | Train Accuracy: 0.9454375
Epoch 2 | Val Loss: 0.1788256106246263 | Val Accuracy: 0.9425
Epoch 3 | Train Loss: 0.09278312066517537 | Train Accuracy: 0.962125
Epoch 3 | Val Loss: 0.19652077536517754 | Val Accuracy: 0.9390000000000001
Epoch 4 | Train Loss: 0.06897374678234337 | Train Accuracy: 0.971875
Epoch 4 | Val Loss: 0.2089117198032327 | Val Accuracy: 0.9400000000000001


('/content/drive/MyDrive/model_save/tokenizer_config.json',
 '/content/drive/MyDrive/model_save/special_tokens_map.json',
 '/content/drive/MyDrive/model_save/vocab.txt',
 '/content/drive/MyDrive/model_save/added_tokens.json')

In [None]:
from flask import Flask, request, jsonify
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

app = Flask(__name__)

# Load the model and tokenizer
model_path = 'path_to_saved_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()

# Define prediction function
def predict_emotion(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=1)
    return probs.numpy().flatten()

# Define a route to handle web requests
@app.route('/predict', methods=['POST'])
def predict():
    text_data = request.json['text']
    probs = predict_emotion(text_data)
    emotion_labels = ['anger', 'joy', 'sadness', 'fear', ...]  # Adjust based on your model
    overall_emotion = emotion_labels[np.argmax(probs)]
    emotion_distribution = dict(zip(emotion_labels, probs.tolist()))
    return jsonify(overall_emotion=overall_emotion, distribution=emotion_distribution)

if __name__ == '__main__':
    app.run(debug=True)


In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

# Load the trained model and tokenizer
model_path = '/content/drive/MyDrive/model_save'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()
model.to('cuda' if torch.cuda.is_available() else 'cpu')

def predict_emotion(input_data):
    # Check if the input data is a filepath
    if isinstance(input_data, str) and input_data.endswith('.txt'):
        with open(input_data, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        text = input_data

    # Encode and prepare input data
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = inputs['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')

    # Prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = F.softmax(outputs.logits, dim=1)
        probs = probabilities.squeeze().tolist()

    # Decode predictions
    emotions = tokenizer.ids_to_labels.keys()  # Ensure you have this mapping saved from training time
    emotion_probs = dict(zip(emotions, probs))
    overall_emotion = max(emotion_probs, key=emotion_probs.get)

    return overall_emotion, emotion_probs

# Example usage
text_input = "I'm feeling very happy today! It's a good day."
overall_emotion, emotion_distribution = predict_emotion(text_input)

# If you want to test with a file
# overall_emotion, emotion_distribution = predict_emotion('path_to_your_file.txt')

print("Overall Emotion:", overall_emotion)
print("Emotion Distribution:", emotion_distribution)


AttributeError: 'BertTokenizer' object has no attribute 'ids_to_labels'

In [4]:
label_dict = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'surprise': 3,
    'fear': 4,
    'joy': 5
}

# Reverse the dictionary for mapping numeric IDs back to labels
id_to_label = {v: k for k, v in label_dict.items()}


In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F

# Load the model and tokenizer
model_path = 'model_save'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def predict_emotion(input_data):
    if isinstance(input_data, str) and input_data.endswith('.txt'):
        with open(input_data, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        text = input_data

    # Encode and prepare input data
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
    attention_mask = inputs['attention_mask'].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Make prediction
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        probabilities = F.softmax(outputs.logits, dim=1)
        probs = probabilities.squeeze().tolist()

    # Determine the most likely emotion
    predicted_index = torch.argmax(outputs.logits, dim=1).item()
    overall_emotion = id_to_label[predicted_index]
    emotion_distribution = {id_to_label[i]: prob for i, prob in enumerate(probs)}

    return overall_emotion, emotion_distribution

# Example usage
text_input = "I'm feeling very happy today! It's a good day."
overall_emotion, emotion_distribution = predict_emotion(text_input)
print("Overall Emotion:", overall_emotion)
print("Emotion Distribution:", emotion_distribution)


Overall Emotion: joy
Emotion Distribution: {'sadness': 7.412652485072613e-05, 'anger': 4.435436494532041e-05, 'love': 0.0001445561065338552, 'surprise': 7.244136213557795e-05, 'fear': 4.1335875721415505e-05, 'joy': 0.99962317943573}


In [7]:
import pandas as pd

def test_model(test_file_path, model, tokenizer):
    # Load the test data
    test_data = pd.read_csv(test_file_path, delimiter=';', header=None, names=['text', 'true_label'])

    # Prepare to collect predictions
    predictions = []
    true_labels = []
    emotion_distributions = []

    for index, row in test_data.iterrows():
        text = row['text']
        true_label = row['true_label']
        predicted_emotion, emotion_distribution = predict_emotion(text)

        # Store results
        predictions.append(predicted_emotion)
        true_labels.append(true_label)
        emotion_distributions.append(emotion_distribution)

        # Print the results for each text
        print(f"Text: {text}")
        print(f"Predicted Emotion: {predicted_emotion}")
        print(f"Emotion Distribution: {emotion_distribution}\n")

    # Calculate accuracy if true labels are available and labels are in `label_dict`
    if all(label in label_dict for label in true_labels):
        accuracy = sum(1 for i in range(len(predictions)) if predictions[i] == true_labels[i]) / len(predictions)
        print(f"Accuracy: {accuracy * 100:.2f}%")

# Example usage, assuming the paths and model are correctly set up
test_file_path = '/content/test.txt'
test_model(test_file_path, model, tokenizer)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted Emotion: sadness
Emotion Distribution: {'sadness': 0.9996460676193237, 'anger': 0.0001283595192944631, 'love': 4.2530311475275084e-05, 'surprise': 6.782468699384481e-05, 'fear': 6.29233181825839e-05, 'joy': 5.228963709669188e-05}

Text: i enjoy about his work is the genuine feel and the pleasant message he is trying to deliver with all this
Predicted Emotion: joy
Emotion Distribution: {'sadness': 6.830722850281745e-05, 'anger': 3.7669022276531905e-05, 'love': 0.0001331923995167017, 'surprise': 7.502189691876993e-05, 'fear': 4.4531865569297224e-05, 'joy': 0.9996411800384521}

Text: i knew except they ve lost that girly feeling and gained a graceful wisdom
Predicted Emotion: joy
Emotion Distribution: {'sadness': 9.224736277246848e-05, 'anger': 4.6016983105801046e-05, 'love': 0.0001185972651001066, 'surprise': 9.385620069224387e-05, 'fear': 4.9984078941633925e-05, 'joy': 0.999599277973175}

Text: i am feeling a bit

In [12]:
!zip -r ./content/emotion_essay_model.zip ./content


zip error: Nothing to do! (try: zip -r ./content/emotion_essay_model.zip . -i ./content)


In [13]:
!zip -r content.zip /content/

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.04.17/ (stored 0%)
  adding: content/.config/logs/2024.04.17/13.28.43.997536.log (deflated 91%)
  adding: content/.config/logs/2024.04.17/13.29.19.698756.log (deflated 86%)
  adding: content/.config/logs/2024.04.17/13.29.27.757827.log (deflated 58%)
  adding: content/.config/logs/2024.04.17/13.29.38.297557.log (deflated 56%)
  adding: content/.config/logs/2024.04.17/13.29.37.622123.log (deflated 57%)
  adding: content/.config/logs/2024.04.

In [14]:
from google.colab import files
files.download('new_content.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
!zip -r model_download.zip /content/ -x "*sample_data*"


  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.04.17/ (stored 0%)
  adding: content/.config/logs/2024.04.17/13.28.43.997536.log (deflated 91%)
  adding: content/.config/logs/2024.04.17/13.29.19.698756.log (deflated 86%)
  adding: content/.config/logs/2024.04.17/13.29.27.757827.log (deflated 58%)
  adding: content/.config/logs/2024.04.17/13.29.38.297557.log (deflated 56%)
  adding: content/.config/logs/2024.04.17/13.29.37.622123.log (deflated 57%)
  adding: content/.config/logs/2024.04.

In [19]:
!zip -r new_content.zip /content/ -x "*sample_data*" "*content.zip*" "*model_downlaod.zip*"


  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/configurations/ (stored 0%)
  adding: content/.config/configurations/config_default (deflated 15%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.04.17/ (stored 0%)
  adding: content/.config/logs/2024.04.17/13.28.43.997536.log (deflated 91%)
  adding: content/.config/logs/2024.04.17/13.29.19.698756.log (deflated 86%)
  adding: content/.config/logs/2024.04.17/13.29.27.757827.log (deflated 58%)
  adding: content/.config/logs/2024.04.17/13.29.38.297557.log (deflated 56%)
  adding: content/.config/logs/2024.04.17/13.29.37.622123.log (deflated 57%)
  adding: content/.config/logs/2024.04.

In [20]:
from google.colab import files
files.download('new_content.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
files.download('/content/drive/MyDrive/model_save/model.safetensors')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [23]:
!cp -r /content/drive_n/MyDrive/* /content/drive/MyDrive/


In [None]:
from google.colab import drive
drive.mount('/content/drive')