In [None]:
from transformers import pipeline

# Checkpoint identifier reused by the from_pretrained(...) calls further down.
model_name = "camembert-base"

# Pipeline built straight from the base checkpoint; no task is passed explicitly.
# NOTE(review): `classifier` is not referenced again in the visible cells.
classifier = pipeline(model=model_name)

In [None]:
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:


from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Number of output labels for the classification head.
num_labels = 4

# Sequence-classification model on top of the base checkpoint, configured for
# multi-label classification, then moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
model = model.to(device)

# Tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
)


Next, we load the app-review data for Garmin Connect, Samsung Health and Huawei Health and concatenate the three files into a single DataFrame.

In [None]:
import pandas as pd

# Load one CSV per app.
data_garmin_df = pd.read_csv('data/Garmin_Connect.csv')
data_samsung_df = pd.read_csv('data/Samsung_Health.csv')
data_huawei_df = pd.read_csv('data/Huawei_Health.csv')

# Stack the three frames into one, renumbering the rows from 0.
app_frames = [data_garmin_df, data_samsung_df, data_huawei_df]
data = pd.concat(app_frames, ignore_index=True)

# Quick look at the first rows.
print(data.head())

## Tokenizing

In [None]:
def tokenize_function(example):
    """Tokenize ``example`` with the notebook-level ``tokenizer``.

    The input is padded to the tokenizer's maximum length and truncated
    when it is longer than that.
    """
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(example, **encoding_options)


# Every column from position 2 onward is collected into one per-row list.
# NOTE(review): presumably these are the label columns — confirm against the CSV schema.
# A 'list' column left over from an earlier run of this cell is excluded, so
# re-running the cell does not fold the previous label list into the new one.
label_columns = [column for column in data.columns[2:] if column != 'list']
data['list'] = data[label_columns].values.tolist()

# Keep only the text column and the label vector for training.
new_df = data[['data', 'list']].copy()
new_df

In [6]:
# Training configuration: values consumed by the dataset and TrainingArguments cells.

MAX_LEN = 200            # max_len handed to CustomDataset
TRAIN_BATCH_SIZE = 8     # per-device batch size while training
VALID_BATCH_SIZE = 4     # per-device batch size while evaluating
EPOCHS = 3               # number of training epochs
LEARNING_RATE = 1e-5     # learning rate passed to TrainingArguments



In [None]:

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row at a time for multi-label training.

    Args:
        dataframe: DataFrame with a ``data`` column (the text) and a ``list``
            column (one label vector per row).
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            interface (``max_length``, ``padding``, ``truncation``, ...).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access instead of attribute access, so the column lookup can
        # never be shadowed by a DataFrame attribute of the same name.
        self.comment_text = dataframe["data"]
        self.targets = dataframe["list"]
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded text and float label vector of the ``index``-th row.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
            (long tensors) and ``labels`` (float tensor).
        """
        # Positional lookup: a sampler hands out positions 0..len-1, which only
        # coincide with index labels when the frame's index has been reset.
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace into a single space.
        comment_text = " ".join(comment_text.split())

        # truncation=True together with padding="max_length" gives every sample
        # exactly max_len tokens; without truncation, long texts yield longer
        # tensors that cannot be stacked into a batch.
        inputs = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            # Float labels, as passed to the model under the 'labels' key.
            'labels': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

## Train-test split

In [None]:
# Build the train/test frames and wrap each one in a CustomDataset.

train_size = 0.8

# Draw the training rows at random (fixed seed); the rows that were not drawn
# become the test set. Both frames are re-indexed from 0.
sampled_rows = new_df.sample(frac=train_size, random_state=200)
test_dataset = new_df.drop(sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)

for split_name, split_frame in (
    ("FULL", new_df),
    ("TRAIN", train_dataset),
    ("TEST", test_dataset),
):
    print(f"{split_name} Dataset: {split_frame.shape}")

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)


## Defining training arguments and evaluation metrics (scikit-learn)


In [7]:
from transformers import TrainingArguments
import numpy as np

# Trainer settings: evaluate and checkpoint once per epoch, then reload the
# checkpoint that scored best on the "accuracy" metric.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=EPOCHS,
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir='logs',
    logging_first_step=True,
    do_train=True,
    do_eval=True,
)

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred, threshold=0.5):
    """Compute multi-label accuracy, precision, recall and F1 for the Trainer.

    Args:
        eval_pred: Object exposing ``predictions`` (raw model outputs, i.e.
            logits) and ``label_ids`` (multi-hot label matrix).
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with ``accuracy`` (exact match over all labels of a sample) and
        macro-averaged ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids

    # NOTE(review): some models return a tuple of outputs; the logits are
    # assumed to be its first element — confirm if the model changes.
    if isinstance(logits, tuple):
        logits = logits[0]

    # The model outputs logits, not probabilities: apply a sigmoid before
    # thresholding so the cut-off is expressed on the probability scale.
    logits = np.asarray(logits, dtype=np.float64)
    probabilities = 1.0 / (1.0 + np.exp(-logits))
    predictions = (probabilities >= threshold).astype(int)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='macro', zero_division=0)
    recall = recall_score(labels, predictions, average='macro', zero_division=0)
    f1 = f1_score(labels, predictions, average='macro', zero_division=0)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
from transformers import Trainer

# Everything the Trainer needs: the model, its training arguments, the two
# datasets and the metric callback used at evaluation time.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": training_set,
    "eval_dataset": testing_set,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)



In [None]:
# Fine-tune the model; the value returned by train() is shown as the cell output.
trainer.train()


In [None]:
# Evaluate on the eval_dataset given to the Trainer (testing_set); the returned
# metrics are shown as the cell output.
trainer.evaluate()

### Save the model and retrieved metrics

In [None]:
# Write the model to ./models.
trainer.save_model("./models")

# Run a fresh evaluation and store its metrics under the "eval" split name.
eval_metrics = trainer.evaluate()
trainer.save_metrics("eval", eval_metrics)

### Configure the model's labels

In [3]:
# Label names; position i in this list is class id i.
label_list = ['rating', 'bug_report', 'feature_request', 'user_experience']

# Build both directions of the name <-> id mapping and attach them to the
# in-memory model's config.
# NOTE(review): in file order this cell comes after trainer.save_model("./models");
# confirm the mapping is set before saving if the checkpoint should carry these names.
id_to_label = dict(enumerate(label_list))
label_to_id = {name: class_id for class_id, name in id_to_label.items()}
model.config.label2id = label_to_id
model.config.id2label = id_to_label

In [12]:
# 0 -> rating
# 1 -> bug_report
# 2 -> feature_request
# 3 -> user_experience

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from torch.utils.data import DataLoader


# Hand-written French smoke-test reviews with their expected multi-hot labels
# (column order follows label_list).
test_texts = ["Cest tré bien!", "Je ne arrive pas a faire login!! Merde!","Je encontré un bug", "Je voudrais avoir plus de fonctionnalités"]
test_labels = [[1, 0, 0, 0], [1, 0, 0, 0],[0,1,0,0],[0,0,1,0] ]
assert len(test_texts) == len(test_labels), "each test text needs exactly one label vector"

model_name = "camembert-base"

# Load the fine-tuned weights from the ./models folder; the tokenizer still
# comes from the base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("./models")
tokenizer = AutoTokenizer.from_pretrained(model_name,  problem_type="multi_label_classification")

# Re-apply the label names after loading. The saved config only carries them if
# they were set before save_model ran; otherwise the pipeline would emit
# different label strings and label_list.index(...) below would raise.
assert model.config.num_labels == len(label_list), "label_list does not match the model's output size"
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {label: i for i, label in enumerate(label_list)}

# Score every label for every text.
loaded_model = pipeline("text-classification",model=model, tokenizer=tokenizer, return_all_scores=True)
raw_predictions = loaded_model(test_texts)

# Keep only the labels whose score is above the threshold.
threshold = 0.5
predictions = [
    [predict for predict in individual_prediction if predict["score"] > threshold]
    for individual_prediction in raw_predictions
]

for text, kept_labels in zip(test_texts, predictions):
    print(text)
    print(kept_labels)
    print("")

# One-hot encode the kept labels, sized from label_list rather than a literal 4.
predictions_one_hot = []
for prediction in predictions:
    one_hot = [0] * len(label_list)
    for predict in prediction:
        one_hot[label_list.index(predict["label"])] = 1
    predictions_one_hot.append(one_hot)

# Compute metrics
accuracy = accuracy_score(test_labels, predictions_one_hot)
precision = precision_score(test_labels, predictions_one_hot, average='macro',zero_division=0)
recall = recall_score(test_labels, predictions_one_hot, average='macro', zero_division=0)
f1 = f1_score(test_labels, predictions_one_hot, average='macro',zero_division=0)

print("\nAccuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1: ", f1)






Cest tré bien!
[{'label': 'rating', 'score': 0.94001704454422}]

Je ne arrive pas a faire login!! Merde!
[{'label': 'bug_report', 'score': 0.9031999707221985}]

Je encontré un bug
[{'label': 'bug_report', 'score': 0.9208457469940186}]

Je voudrais avoir plus de fonctionnalités
[{'label': 'feature_request', 'score': 0.7491875290870667}]


Accuracy:  0.75
Precision:  0.625
Recall:  0.625
F1:  0.5833333333333333
