In [None]:
%pip install torch transformers datasets evaluate

In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_dataset
import evaluate
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments




### Loading Data

In [7]:
df_b24 = pd.read_json("burkina24_training_data.json")
df_b24.head()

Unnamed: 0,text,label,title,author,date,url,category_raw,media,image_url,tags,scraped_at
0,"Dans le cadre du mois d‚ÄôOctobre Rose\n\n, Oran...",Sant√©,Octobre Rose : La Fondation Orange Burkina Fas...,Akim Ky,2025-10-20 19:07:44,https://burkina24.com/2025/10/20/octobre-rose-...,actualite/societe/sante,Burkina24,https://burkina24.com/wp-content/uploads/2025/...,[],2025-11-16 13:04:46.207099
1,"Le projet PLURIELLES, financ√© par Affaires mon...",Sant√©,Projet PLURIELLES : Un pas de g√©ant pour la re...,Flora Toelo Karambiri,2025-09-23 17:30:02,https://burkina24.com/2025/09/23/projet-plurie...,actualite/societe/sante,Burkina24,https://burkina24.com/wp-content/uploads/2025/...,[],2025-11-16 13:04:47.021270
2,L‚Äôautrice de la bande dessin√©e ¬´ Une vie pour ...,Sant√©,Lutte contre la d√©pigmentation au Burkina Faso...,Abdoul Gani Barry,2025-08-16 10:30:25,https://burkina24.com/2025/08/16/lutte-contre-...,actualite/societe/sante,Burkina24,https://burkina24.com/wp-content/uploads/2023/...,[],2025-11-16 13:04:47.911046
3,"Le dimanche 17 ao√ªt 2025, l‚Äôassociation Laafi ...",Sant√©,Sant√© Publique : L‚ÄôAssociation Laafi sans fron...,R√©daction B24,2025-08-18 08:31:57,https://burkina24.com/2025/08/18/sante-publiqu...,actualite/societe/sante,Burkina24,https://burkina24.com/wp-content/uploads/2025/...,[],2025-11-16 13:04:49.285769
4,"Chaque 28 juillet, la communaut√© international...",Sant√©,Sant√© : Le Burkina Faso comm√©more la journ√©e m...,Willy SAGBE,2025-08-18 14:24:54,https://burkina24.com/2025/08/18/sante-le-burk...,actualite/societe/sante,Burkina24,https://burkina24.com/wp-content/uploads/2025/...,[],2025-11-16 13:04:50.449480


In [19]:
# required categories
categories = ["Politique", "√âconomie", "S√©curit√©", "Sant√©", "Culture", "Sport", "Autres"]

#### b24 column mapping

In [11]:
df_b24["label"].value_counts()

label
√âconomie     160
S√©curit√©     155
Sport        154
Sant√©        152
Politique    145
Culture      136
Name: count, dtype: int64

In [None]:
df_b24["label"] = df_b24["label"].apply(
    lambda x: x if x in categories else "Autres"
)
# -> No label mapped to "Autres"
df_b24["label"].value_counts()

label
√âconomie     160
S√©curit√©     155
Sport        154
Sant√©        152
Politique    145
Culture      136
Name: count, dtype: int64

#### Fasonet mapping

In [12]:
df_faso = pd.read_json("lefaso_training_data.json")
df_faso["label"].value_counts()

label
International    400
Politique        200
√âconomie         200
Soci√©t√©          200
Sport            200
Culture          200
Name: count, dtype: int64

In [None]:
df_faso["label"] = df_faso["label"].apply(
    lambda x: x if x in categories else "Autres"
)
# -> "International" and "Soci√©t√©" mapped to "Autres"
df_faso["label"].value_counts()

label
Autres       600
Politique    200
√âconomie     200
Sport        200
Culture      200
Name: count, dtype: int64

#### Sidwaya mapping

In [14]:
df_sdya = pd.read_json("sidwaya_training_data.json")
df_sdya["label"].value_counts()

label
Sport       377
S√©curit√©     78
Name: count, dtype: int64

In [None]:
df_sdya["label"] = df_sdya["label"].apply(
    lambda x: x if x in categories else "Autres"
)
# -> No label mapped to "Autres"
df_sdya["label"].value_counts()

label
Sport       377
S√©curit√©     78
Name: count, dtype: int64

#### Merging three datasets

In [23]:
df_merged = pd.concat([df_faso, df_b24, df_sdya], ignore_index=True)
df_merged

Unnamed: 0,text,label,title,author,date,url,category_raw,media,image_url,tags,scraped_at
0,"Parmi ces d√©cisions, il y a le bilan √† mi-parc...",Politique,Burkina Faso : Certaines grandes entreprises o...,Lefaso,NaT,https://lefaso.net/spip.php?article141410,politique,Lefaso.net,,[],2025-11-16 12:34:15.912026
1,"Le premier ministre, Rimtalba Jean Emmanuel O...",Politique,Burkina/√âvaluation des ministres : La ministre...,Lefaso,NaT,https://lefaso.net/spip.php?article139922,politique,Lefaso.net,,[],2025-11-16 12:34:17.220251
2,"Photo d‚Äôillustration\n\nIl est 10 h, lorsque l...",Politique,Burkina/ Assembl√©e l√©gislative de Transition :...,Lefaso,NaT,https://lefaso.net/spip.php?article138343,politique,Lefaso.net,,[],2025-11-16 12:34:18.107721
3,"Cette visite intervient apr√®s les propos, d√©bu...",Politique,Diplomatie : Un √©missaire du pr√©sident Donald ...,Lefaso,NaT,https://lefaso.net/spip.php?article138459,politique,Lefaso.net,,[],2025-11-16 12:34:19.970300
4,Le 30 mai 2017 s‚Äô√©teignait en France Dr Val√®re...,Politique,"T√©moignage : Val√®re, mon ma√Ætre id√©ologique et...",Lefaso,NaT,https://lefaso.net/spip.php?article138433,politique,Lefaso.net,,[],2025-11-16 12:34:21.030728
...,...,...,...,...,...,...,...,...,...,...,...
2752,L‚ÄôAssociation sportive des Douanes a battu en ...,Sport,30e √©dition Super coupe AJSB : les Gabelous co...,JK. Sidwaya,2023-08-06 22:41:18,https://www.sidwaya.info/30e-edition-super-cou...,Sport,Sidwaya,https://www.sidwaya.info/wp-content/uploads/20...,[],2025-11-16 15:49:32.091149
2753,Malmen√©e par la presse fran√ßaise pour son hygi...,Sport,Al Hilal: Les supporters s‚Äôarrachent le maillo...,Wamini SIDWAYA,2023-08-16 12:54:58,https://www.sidwaya.info/al-hilal-les-supporte...,Sport,Sidwaya,https://www.sidwaya.info/wp-content/uploads/20...,[],2025-11-16 15:49:33.441009
2754,La F√©d√©ration burkinab√® de football (FBF) a or...,Sport,Assembl√©e g√©n√©rale ordinaire de la FBF: les ac...,JK. Sidwaya,2023-08-20 22:10:38,https://www.sidwaya.info/assemblee-generale-or...,Sport,Sidwaya,https://www.sidwaya.info/wp-content/uploads/20...,[],2025-11-16 15:49:34.679212
2755,"L‚Äôhistoire retiendra, ce 21 ao√ªt 2023 que Hugu...",Sport,Mondiaux d‚Äôathl√©tisme: de l‚Äôor historique pour...,JK. Sidwaya,2023-08-21 22:34:06,https://www.sidwaya.info/mondiaux-dathletisme-...,Sport,Sidwaya,https://www.sidwaya.info/wp-content/uploads/20...,[],2025-11-16 15:49:35.710802


In [24]:
df_merged["label"].value_counts()

label
Sport        731
Autres       600
√âconomie     360
Politique    345
Culture      336
S√©curit√©     233
Sant√©        152
Name: count, dtype: int64

In [28]:
df_merged.to_json("final_dataset.json", indent=2)

#### Training

In [29]:
import torch
# import pandas as pd

# Your label counts
label_counts = {
    "Sport": 731,
    "Autres": 600,
    "√âconomie": 360,
    "Politique": 345,
    "Culture": 336,
    "S√©curit√©": 233,
    "Sant√©": 152
}

# Map labels to IDs
label_to_id = {label: i for i, label in enumerate(label_counts.keys())}
id_to_label = {v: k for k, v in label_to_id.items()}

num_labels = len(label_counts)

# Convert counts into weights
counts = torch.tensor(list(label_counts.values()), dtype=torch.float)
class_weights = 1.0 / counts
class_weights = class_weights / class_weights.sum() * num_labels  # normalize

print("Class weights:", class_weights)

Class weights: tensor([0.4245, 0.5172, 0.8620, 0.8995, 0.9235, 1.3318, 2.0415])


In [32]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df_merged,
    test_size=0.2,
    stratify=df_merged["label"],   # üî• ensures balanced categories
    random_state=42
)

# val_df, test_df = train_test_split(
#     test_df,
#     test_size=0.5,
#     stratify=test_df["label"],   # üî• ensures balanced categories
#     random_state=42
# )

In [33]:
from torch.utils.data import Dataset, DataLoader
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

class NewsDataset(Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df["label"].map(label_to_id).tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=256
        )
        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            "labels": torch.tensor(self.labels[idx])
        }

train_dataset = NewsDataset(train_df)
val_dataset = NewsDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]



In [34]:
from transformers import CamembertForSequenceClassification
import torch.nn as nn

model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=num_labels
)

# Weighted CrossEntropyLoss
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None  # we compute loss manually
        )

        logits = outputs.logits
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Training Loss: {total_loss/len(train_loader):.4f}")


KeyboardInterrupt: 

In [None]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,607
2,299
0,94


In [None]:
# ‚úÖ Load CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# ‚úÖ Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [None]:
# ‚úÖ Split dataset: 80% train, 20% validation
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# train_df = df.iloc[:800]  # First 800 rows for training
# val_df = df.iloc[800:]    # Last 200 rows for validation

# ‚úÖ Convert Pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
# ‚úÖ Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# ‚úÖ Load CamemBERT model (3 classes: Negative, Neutral, Positive)
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=3)


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.707877
2,No log,0.570179
3,No log,0.538728
4,No log,0.567626
5,0.481000,0.603477
6,0.481000,0.571581
7,0.481000,0.593409
8,0.481000,0.614455
9,0.481000,0.617528
10,0.134400,0.63072


TrainOutput(global_step=1000, training_loss=0.30769239044189456, metrics={'train_runtime': 886.6395, 'train_samples_per_second': 9.023, 'train_steps_per_second': 1.128, 'total_flos': 2104907341824000.0, 'train_loss': 0.30769239044189456, 'epoch': 10.0})

In [None]:

# ‚úÖ Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# ‚úÖ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# ‚úÖ Train the model
trainer.train()

In [None]:
def predict_sentiment(sentence):
    # ‚úÖ Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move input tensors to the same device as the model
    inputs = inputs.to(model.device)

    # ‚úÖ Perform inference (disable gradient calculation for efficiency)
    with torch.no_grad():
        outputs = model(**inputs)

    # ‚úÖ Get predicted class (0 = Negative, 1 = Neutral, 2 = Positive)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # ‚úÖ Convert class index to label
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_class]


In [None]:
print(predict_sentiment("J'adore ce produit, il est incroyable !"))  # Positive
print(predict_sentiment("C'est un produit moyen, pas mal mais rien d'exceptionnel."))  # Neutral
print(predict_sentiment("Je d√©teste ce produit, il est horrible."))  # Negative

Positive
Positive
Negative


In [None]:
print(predict_sentiment("Mauvais"))  # Negative
print(predict_sentiment("Bien"))  # Positive

Negative
Positive


In [None]:
# %%capture
# !pip install huggingface_hub

In [None]:
# ‚úÖ Define your Hugging Face repository name
repo_name = "Minervus00/camembert-fb-sentiment"

# ‚úÖ Save the model and tokenizer
model.save_pretrained(repo_name)
tokenizer.save_pretrained(repo_name)

('Minervus00/camembert-fb-sentiment/tokenizer_config.json',
 'Minervus00/camembert-fb-sentiment/special_tokens_map.json',
 'Minervus00/camembert-fb-sentiment/sentencepiece.bpe.model',
 'Minervus00/camembert-fb-sentiment/added_tokens.json')

In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_WRITE')

In [None]:
# ‚úÖ Push model to Hugging Face Hub
model.push_to_hub(repo_name, token=HF_TOKEN)
tokenizer.push_to_hub(repo_name, token=HF_TOKEN)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Minervus00/camembert-fb-sentiment/commit/21201e57278d8086e641e0c6b3d8a0c7d547329f', commit_message='Upload tokenizer', commit_description='', oid='21201e57278d8086e641e0c6b3d8a0c7d547329f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Minervus00/camembert-fb-sentiment', endpoint='https://huggingface.co', repo_type='model', repo_id='Minervus00/camembert-fb-sentiment'), pr_revision=None, pr_num=None)

In [None]:
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# ‚úÖ Load model & tokenizer from Hugging Face
repo_name = "Minervus00/camembert-fb-sentiment"  # Replace with your repo
model = CamembertForSequenceClassification.from_pretrained(repo_name)
tokenizer = CamembertTokenizer.from_pretrained(repo_name)

# ‚úÖ Set model to evaluation mode
model.eval()

# ‚úÖ Function to predict sentiment
def predict_sentiment(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_class]

# ‚úÖ Test predictions
print(predict_sentiment("J'adore ce produit, il est incroyable !"))  # Positive
print(predict_sentiment("C'est un produit moyen, pas mal mais rien d'exceptionnel."))  # Neutral
print(predict_sentiment("Je d√©teste ce produit, il est horrible."))  # Negative

config.json:   0%|          | 0.00/918 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

Neutral
Neutral
Neutral


### Semi-supervised

In [None]:
df2 = pd.read_csv("second_step.csv")
df2

Unnamed: 0,post,comment,likes
0,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,"Sans doute, beaucoup copieront des bons exempl...",11
1,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Une arm√©e devant laquelle les tero tero endeui...,8
2,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Vive mon faso et qu Allah ns prot√®ge tous cour...,0
3,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Vive notre arm√©e üáßüá´üá≤üá±üá≥üá™,10
4,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Nous c'est notre pr√©sident qui est notre fiert...,8
...,...,...,...
1995,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,C'est cool,0
1996,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Ouais big up. J'attends ma commission en tant ...,0
1997,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Sinc√®rement tu m√©rites le cound√© cette ann√©e,0
1998,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,En mode 1980,0


In [None]:
train_df2 = df2.loc[:1499, ]
test_df2 = df2.loc[1500:, ]

In [None]:
train_df2["label"] = train_df2["comment"].apply(predict_sentiment)
train_df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df2["label"] = train_df2["comment"].apply(predict_sentiment)


Unnamed: 0,post,comment,likes,label
0,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,"Sans doute, beaucoup copieront des bons exempl...",11,Positive
1,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Une arm√©e devant laquelle les tero tero endeui...,8,Neutral
2,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Vive mon faso et qu Allah ns prot√®ge tous cour...,0,Positive
3,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Vive notre arm√©e üáßüá´üá≤üá±üá≥üá™,10,Neutral
4,Notre arm√©e est notre fiert√©. üáßüá´üá≤üá±üá≥üá™,Nous c'est notre pr√©sident qui est notre fiert...,8,Positive
...,...,...,...,...
1495,Franchement √ßa n‚Äôa pas √©t√© simple pour Flobyüòé,En tout cas pour toi √©tait bonüòÖ,2,Positive
1496,Franchement √ßa n‚Äôa pas √©t√© simple pour Flobyüòé,Ni pour l'autre ü§£ü§£ü§£,0,Neutral
1497,Franchement √ßa n‚Äôa pas √©t√© simple pour Flobyüòé,C'est vraiment pas facile de sourire lorsqu'on...,0,Positive
1498,Franchement √ßa n‚Äôa pas √©t√© simple pour Flobyüòé,Lo vieuxüòπüòπ,0,Neutral


In [None]:
train_df2 = train_df2.drop(columns=["post", "likes"])
train_df2 = train_df2.rename(columns={"comment": "text"})
train_df2

Unnamed: 0,text,label
0,"Sans doute, beaucoup copieront des bons exempl...",Positive
1,Une arm√©e devant laquelle les tero tero endeui...,Neutral
2,Vive mon faso et qu Allah ns prot√®ge tous cour...,Positive
3,Vive notre arm√©e üáßüá´üá≤üá±üá≥üá™,Neutral
4,Nous c'est notre pr√©sident qui est notre fiert...,Positive
...,...,...
1495,En tout cas pour toi √©tait bonüòÖ,Positive
1496,Ni pour l'autre ü§£ü§£ü§£,Neutral
1497,C'est vraiment pas facile de sourire lorsqu'on...,Positive
1498,Lo vieuxüòπüòπ,Neutral


In [None]:
train_df2["label"] = train_df2["label"].map({"Negative": 0, "Neutral": 1, "Positive": 2})
train_df2.head()

Unnamed: 0,text,label
0,"Sans doute, beaucoup copieront des bons exempl...",2
1,Une arm√©e devant laquelle les tero tero endeui...,1
2,Vive mon faso et qu Allah ns prot√®ge tous cour...,2
3,Vive notre arm√©e üáßüá´üá≤üá±üá≥üá™,1
4,Nous c'est notre pr√©sident qui est notre fiert...,2


In [None]:
train_df2["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,795
2,485
0,220


In [None]:
from sklearn.model_selection import train_test_split
# ‚úÖ Split dataset: 80% train, 20% validation
train_set, val_set = train_test_split(train_df2, test_size=0.2, random_state=42)

# ‚úÖ Convert Pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_set)
val_dataset = Dataset.from_pandas(val_set)

In [None]:
# ‚úÖ Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

In [None]:
# ‚úÖ Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# ‚úÖ Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

# ‚úÖ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# ‚úÖ Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.194076
2,No log,0.184031
3,No log,0.21346
4,0.228800,0.349211
5,0.228800,0.323455
6,0.228800,0.344875
7,0.044600,0.31484
8,0.044600,0.351504
9,0.044600,0.39745
10,0.014600,0.396732


TrainOutput(global_step=1500, training_loss=0.09597844632466634, metrics={'train_runtime': 1293.2108, 'train_samples_per_second': 9.279, 'train_steps_per_second': 1.16, 'total_flos': 3157361012736000.0, 'train_loss': 0.09597844632466634, 'epoch': 10.0})

In [None]:
model.eval()

# ‚úÖ Function to predict sentiment
def predict_sentiment(sentence):
    # ‚úÖ Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move input tensors to the same device as the model
    inputs = inputs.to(model.device)

    # ‚úÖ Perform inference (disable gradient calculation for efficiency)
    with torch.no_grad():
        outputs = model(**inputs)

    # ‚úÖ Get predicted class (0 = Negative, 1 = Neutral, 2 = Positive)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # ‚úÖ Convert class index to label
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_class]

# ‚úÖ Test predictions
print(predict_sentiment("J'adore ce produit, il est incroyable !"))  # Positive
print(predict_sentiment("C'est un produit moyen, pas mal mais rien d'exceptionnel."))  # Neutral
print(predict_sentiment("Je d√©teste ce produit, il est horrible."))  # Negative

Positive
Positive
Negative


In [None]:
test_df2["label"] = test_df2["comment"].apply(predict_sentiment)
test_df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df2["label"] = test_df2["comment"].apply(predict_sentiment)


Unnamed: 0,post,comment,likes,label
1500,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Yam bigapamin biiii,335,Neutral
1501,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Kund√© d‚Äôor en t√©l√©chargement üòÇ,88,Neutral
1502,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,ü§£ü§£ü§£ wala√Ø il est trop fort. Que Dieu te b√©niss...,30,Positive
1503,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,"El Pr√©sidente Bf ah tu es bon champion, qu'Al...",22,Positive
1504,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Bigap ü§£,22,Neutral
...,...,...,...,...
1995,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,C'est cool,0,Positive
1996,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Ouais big up. J'attends ma commission en tant ...,0,Positive
1997,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,Sinc√®rement tu m√©rites le cound√© cette ann√©e,0,Positive
1998,Si je n‚Äôai pas eu Kund√© cette ann√©e j‚Äôarr√™te d...,En mode 1980,0,Neutral


In [None]:
test_df2["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Positive,293
Neutral,172
Negative,35


In [None]:
val = test_df2.drop(columns=["post", "likes"])
val = val.rename(columns={"comment": "text"})
val

Unnamed: 0,text,label
1500,Yam bigapamin biiii,Neutral
1501,Kund√© d‚Äôor en t√©l√©chargement üòÇ,Neutral
1502,ü§£ü§£ü§£ wala√Ø il est trop fort. Que Dieu te b√©niss...,Positive
1503,"El Pr√©sidente Bf ah tu es bon champion, qu'Al...",Positive
1504,Bigap ü§£,Neutral
...,...,...
1995,C'est cool,Positive
1996,Ouais big up. J'attends ma commission en tant ...,Positive
1997,Sinc√®rement tu m√©rites le cound√© cette ann√©e,Positive
1998,En mode 1980,Neutral


In [None]:
val.to_csv("final_pred.csv", index=False)

In [None]:
# ‚úÖ Define your Hugging Face repository name
repo_name = "Minervus00/camembert-fb-sentiment-2"

# ‚úÖ Save the model and tokenizer
model.save_pretrained(repo_name)
tokenizer.save_pretrained(repo_name)

('Minervus00/camembert-fb-sentiment-2/tokenizer_config.json',
 'Minervus00/camembert-fb-sentiment-2/special_tokens_map.json',
 'Minervus00/camembert-fb-sentiment-2/sentencepiece.bpe.model',
 'Minervus00/camembert-fb-sentiment-2/added_tokens.json')

In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_WRITE')

In [None]:
# ‚úÖ Push model to Hugging Face Hub
model.push_to_hub(repo_name, token=HF_TOKEN)
tokenizer.push_to_hub(repo_name, token=HF_TOKEN)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Minervus00/camembert-fb-sentiment-2/commit/5e7a6146dbf635a4091d954e9c23b64ec77904d1', commit_message='Upload tokenizer', commit_description='', oid='5e7a6146dbf635a4091d954e9c23b64ec77904d1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Minervus00/camembert-fb-sentiment-2', endpoint='https://huggingface.co', repo_type='model', repo_id='Minervus00/camembert-fb-sentiment-2'), pr_revision=None, pr_num=None)

In [None]:
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# ‚úÖ Load model & tokenizer from Hugging Face
repo_name = "Minervus00/camembert-fb-sentiment-2"  # Replace with your repo
model = CamembertForSequenceClassification.from_pretrained(repo_name)
tokenizer = CamembertTokenizer.from_pretrained(repo_name)

# ‚úÖ Set model to evaluation mode
model.eval()

In [None]:
def predict_sentiment(sentence):
    # ‚úÖ Tokenize input sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move input tensors to the same device as the model
    inputs = inputs.to(model.device)

    # ‚úÖ Perform inference (disable gradient calculation for efficiency)
    with torch.no_grad():
        outputs = model(**inputs)

    # ‚úÖ Get predicted class (0 = Negative, 1 = Neutral, 2 = Positive)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # ‚úÖ Convert class index to label
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_class]

# ‚úÖ Test predictions
print(predict_sentiment("J'adore ce produit, il est incroyable !"))  # Positive
print(predict_sentiment("C'est un produit moyen, pas mal mais rien d'exceptionnel."))  # Neutral
print(predict_sentiment("Je d√©teste ce produit, il est horrible."))  # Negative