In [None]:
# Install necessary packages
%pip install -q datasets scikit-learn evaluate

In [None]:
%pip install -U transformers

In [1]:
# Import required libraries
from datasets import load_dataset
from transformers import CamembertTokenizer, CamembertForSequenceClassification, TrainingArguments, Trainer




In [2]:
# 1Ô∏è‚É£ Load Dataset
file_name = "final_dataset.json"
dataset = load_dataset("json", data_files=file_name)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'title', 'author', 'date', 'url', 'category_raw', 'media', 'image_url', 'tags', 'scraped_at'],
        num_rows: 2757
    })
})

In [3]:
from datasets import ClassLabel

# Get unique labels from the 'label' column to define the ClassLabel
unique_labels = sorted(list(set(dataset["train"]["label"]))) # Ensure a consistent order
class_labels = ClassLabel(names=unique_labels)

# Cast the 'label' column in the dataset to ClassLabel type
dataset = dataset.cast_column("label", class_labels)

# Now perform the train-test split with stratification
dt = dataset["train"].train_test_split(test_size=0.2, stratify_by_column="label")
dt

Casting the dataset:   0%|          | 0/2757 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'title', 'author', 'date', 'url', 'category_raw', 'media', 'image_url', 'tags', 'scraped_at'],
        num_rows: 2205
    })
    test: Dataset({
        features: ['text', 'label', 'title', 'author', 'date', 'url', 'category_raw', 'media', 'image_url', 'tags', 'scraped_at'],
        num_rows: 552
    })
})

In [None]:
train_dataset = dt["train"]
eval_dataset = dt["test"]

In [None]:
# 2Ô∏è‚É£ Load tokenizer
model_checkpoint = "camembert-base"
tokenizer = CamembertTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

In [None]:
# Example input string
text = "Le marchier boursier a subi des hausses record aujourd'hui"

# Tokenize (convert to input IDs and attention mask)
inputs = tokenizer(text, return_tensors="pt")

# Show tokenized input
print("üì• Tokenized input:")
for k, v in inputs.items():
    print(f"{k}: {v.shape} -> {v}")


üì• Tokenized input:
input_ids: torch.Size([1, 15]) -> tensor([[    5,    54, 15842,   946, 24206,    33,  4857,    20,  2876,    10,
          5210,   405,    11,   265,     6]])
attention_mask: torch.Size([1, 15]) -> tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
# 3Ô∏è‚É£ Tokenize data
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Map:   0%|          | 0/552 [00:00<?, ? examples/s]

In [None]:
# Load model
num_labels = dt["train"].features["label"].num_classes
model = CamembertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


# for param in model.base_model.parameters():
#     param.requires_grad = False #768*4 params train

# # Unfreeze only the pooler layer
# for param in model.bert.pooler.parameters():
#     param.requires_grad = True

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return {
        "Total": total_params,
        "Trainable": trainable_params,
        "Frozen": total_params - trainable_params
    }

# Example:
param_counts = count_parameters(model)
print("Model Parameter Counts:")
for k, v in param_counts.items():
    print(f"{k}: {v:,}")


Model Parameter Counts:
Total: 109,485,316
Trainable: 593,668
Frozen: 108,891,648


In [None]:
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0493, -0.0359,  0.0021, -0.0432,  0.0477, -0.0475, -0.0588]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
import evaluate
import numpy as np

# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")

# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# 6Ô∏è‚É£ Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",           # üîÅ Log training loss every N steps
    logging_steps=10,                   # ‚è± Logs every 10 steps
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none"
)


In [None]:
# 7Ô∏è‚É£ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# 8Ô∏è‚É£ Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9049,0.755547,0.818841
2,0.5313,0.561898,0.815217
3,0.2611,0.559832,0.836957
4,0.2309,0.5326,0.856884
5,0.1531,0.547719,0.851449


TrainOutput(global_step=690, training_loss=0.5328685014144234, metrics={'train_runtime': 1188.2567, 'train_samples_per_second': 9.278, 'train_steps_per_second': 0.581, 'total_flos': 2900929610880000.0, 'train_loss': 0.5328685014144234, 'epoch': 5.0})

### Classification Report Eval

In [None]:
from sklearn.metrics import classification_report

label_names = ["Sport", "Autres", "√âconomie", "Politique", "Culture", "S√©curit√©", "Sant√©"]

pred_output = trainer.predict(eval_dataset)
preds = pred_output.predictions.argmax(axis=-1)
labels = pred_output.label_ids

In [None]:
print(classification_report(labels, preds, target_names=label_names))

              precision    recall  f1-score   support

       Sport       0.86      0.76      0.81       120
      Autres       0.87      0.91      0.89        67
    √âconomie       0.79      0.72      0.76        69
   Politique       0.86      0.81      0.83        31
     Culture       0.99      0.98      0.98       146
    S√©curit√©       0.70      0.79      0.74        47
       Sant√©       0.77      0.92      0.84        72

    accuracy                           0.86       552
   macro avg       0.83      0.84      0.83       552
weighted avg       0.86      0.86      0.86       552



### Manual tests

In [4]:
label_map = dt["train"].features["label"].int2str  # maps 0 ‚Üí 'World', etc.
label_map

<bound method ClassLabel.int2str of ClassLabel(names=['Autres', 'Culture', 'Politique', 'Sant√©', 'Sport', 'S√©curit√©', '√âconomie'], id=None)>

In [9]:
label_map(1)

'Culture'

In [None]:
import torch

def classify(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs={k:v.to(model.device) for k,v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax(dim=-1).item()
    return label_map(predicted_class_id)

In [None]:
# content = """
# ùêÇùê®ùêÆùê©ùêû ùêùùêÆ ùê¶ùê®ùêßùêùùêû ùêîùüèùüï ùêêùêöùê≠ùêöùê´ : ùê•ùêûùê¨ ùêûÃÅùê≠ùêöùê•ùê®ùêßùê¨ ùêúùê´ùêûÃÅùêûùêßùê≠ ùê•‚Äôùêûùê±ùê©ùê•ùê®ùê¢ùê≠ ùêúùê®ùêßùê≠ùê´ùêû ùê•ùêö ùêåùêöùêßùêßùê¨ùêúùê°ùêöùêüùê≠ ùêûùê≠ ùêüùê¢ùê•ùêûùêßùê≠ ùêûùêß ùê°ùêÆùê¢ùê≠ùê¢ùêûÃÄùê¶ùêûùê¨
# Les √âtalons U17 ont cr√©√© le samedi 15 novembre 2025, la sensation en s‚Äôimposant 1-0 face √† la Mannschaft d‚ÄôAllemagne, d√©crochant ainsi leur billet pour les huiti√®mes de finale de la coupe du monde U17.
# Les √âtalons devront d√©sormais se tourner vers leur prochain d√©fi : l‚ÄôOuganda, tombeur un peu plus t√¥t dans l‚Äôapr√®s-midi du S√©n√©gal, √©galement sur le score de 1-0. La rencontre est programm√©e pour le mardi 18 novembre 2025.
# DCRP/MSJE
# """

# content = """
# 5e √©dition de la Coupe des ambassades : Le Canada √©trille l‚ÄôAlg√©rie dans le match d‚Äôouverture (8-0)
# La 5e √©dition de la Coupe des ambassades en football a √©t√© lanc√©e dans l‚Äôapr√®s-midi de ce samedi 15 novembre 2025 √† Ouagadougou, par la secr√©taire g√©n√©rale du minist√®re des Sports, Colette Ou√©draogo, repr√©sentant le ministre Roland Somda. En match d‚Äôouverture, le Canada a √©t√© sans piti√© face √† l‚ÄôAlg√©rie, √©trill√©e 8-0. Dix-sept √©quipes (ambassades accr√©dit√©es au Burkina Faso) r√©parties en quatre poules prennent part √† cette √©dition 2025 de la Coupe des ambassades.

# """

content = """

"""

print(classify(content))       # ‚Üí 'Sport'

S√©curit√©


In [None]:
repo_name = "Minervus00/camembert-news-classifier"

# ‚úÖ Save the model and tokenizer
model.save_pretrained(repo_name)
tokenizer.save_pretrained(repo_name)

('Minervus00/camembert-news-classifier/tokenizer_config.json',
 'Minervus00/camembert-news-classifier/special_tokens_map.json',
 'Minervus00/camembert-news-classifier/sentencepiece.bpe.model',
 'Minervus00/camembert-news-classifier/added_tokens.json')

In [None]:
# from google.colab import userdata
# HF_TOKEN = userdata.get('HF_WRITE')

In [None]:
# ‚úÖ Push model to Hugging Face Hub
model.push_to_hub(repo_name, token=HF_TOKEN)
tokenizer.push_to_hub(repo_name, token=HF_TOKEN)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ila88m1/model.safetensors:   0%|          |  553kB /  443MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...g/sentencepiece.bpe.model: 100%|##########|  811kB /  811kB            

CommitInfo(commit_url='https://huggingface.co/Minervus00/camembert-news-classifier/commit/75d7b904304e0da931c2b83630bcce7f8cf003a6', commit_message='Upload tokenizer', commit_description='', oid='75d7b904304e0da931c2b83630bcce7f8cf003a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Minervus00/camembert-news-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='Minervus00/camembert-news-classifier'), pr_revision=None, pr_num=None)

### Isolated test

In [None]:
# from google.colab import userdata
# HF_TOKEN = userdata.get('HF_TOKEN')

In [7]:
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# ‚úÖ Load model & tokenizer from Hugging Face
repo_name = "Minervus00/camembert-news-classifier"
model = CamembertForSequenceClassification.from_pretrained(repo_name)
tokenizer = CamembertTokenizer.from_pretrained(repo_name)

model.safetensors:  14%|#4        | 62.9M/443M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
def classify(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    inputs={k:v.to(model.device) for k,v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax(dim=-1).item()
    return label_map(predicted_class_id)

In [10]:
import json

with open("../data/processed/database.json", "r") as f:
    analysis_dic = json.load(f)

analysis_dic

{'articles': [{'id': '6a4cb3874fb1e98e0c0849c8c456ce33',
   'media': 'FasoPresse',
   'titre': 'Op√©ration caisses vides : Paul Kaba Thieba s‚Äôen prend au SYNTSHA',
   'date': '2018-06-12 09:42:00',
   'url': 'https://www.fasopresse.net/politique/6143-operation-caisses-vides-paul-kaba-thieba-sen-prend-au-syntsha',
   'contenu': 'Le Premier ministre Paul Kaba Thieba a qualifi√©, ce lundi 11 juin 2018, ¬´ d‚Äôinfond√©e et d‚Äôill√©gale ¬ª l‚Äôop√©ration caisses vides, lanc√©e par le Syndicat des travailleurs de la sant√© humaine et animale (SYNTSHA).\n\nDu lundi 11 juin au lundi 18 juin 2018, les agents de sant√© assureront les prestations m√©dicales sans encaisser la contrepartie financi√®re.\n\nEt ce, pour r√©pondre √† l‚Äôappel de leur syndicat, le SYNTSHA. Cette nouvelle forme de lutte n‚Äôest pas du go√ªt du gouvernement burkinab√© qui ¬´ ne la c...',
   'categorie': 'Sant√©',
   'engagement': {'likes': 0, 'partages': 0, 'commentaires': 0},
   'sensible': False,
   'toxicite_score'

In [12]:
# import datetime
from collections import defaultdict

# -------------------------------------------------------
# Define score influence formula
# You can change this to whatever formula you prefer
# -------------------------------------------------------
def compute_score_influence(nb_articles, engagement_total):
    if nb_articles == 0:
        return 0.0
    return 0.7 * (engagement_total / 1000) + 0.3 * nb_articles


# -------------------------------------------------------
# Compute stats for each media
# -------------------------------------------------------
def compute_media_stats(data):

    # Dictionary to accumulate results
    stats = defaultdict(lambda: {
        "nb_articles": 0,
        "engagement_total": 0,
        "score_influence": 0.0,
        "rang": None,
        # "actif_90j": False
    })

    # today = datetime.date.today()
    # delta_90 = datetime.timedelta(days=90)

    for article in data["articles"]:
        media = article["media"]

        # Update counters
        stats[media]["nb_articles"] += 1
        stats[media]["engagement_total"] += (
            article["engagement"]["likes"]
            + article["engagement"]["partages"]
            + article["engagement"]["commentaires"]
        )

        # Check if article is within last 90 days
        # try:
        #     article_date = datetime.datetime.strptime(article["date"], "%Y-%m-%d").date()
        # except:
        #     # If format is different, adapt accordingly
        #     article_date = datetime.date.fromisoformat(article["date"])

        # if today - article_date <= delta_90:
        #     stats[media]["actif_90j"] = True

    # -------------------------------------------------------
    # Compute score influence
    # -------------------------------------------------------
    for media, info in stats.items():
        info["score_influence"] = compute_score_influence(
            info["nb_articles"],
            info["engagement_total"]
        )

    # -------------------------------------------------------
    # Ranking by score_influence (descending)
    # -------------------------------------------------------
    ranked = sorted(stats.items(), key=lambda x: x[1]["score_influence"], reverse=True)

    for rank, (media, info) in enumerate(ranked, start=1):
        info["nom"] = media
        info["rang"] = rank

    # Save results back into data["medias"]
    data["medias"] = [info for media, info in ranked]

    return data


# -------------------------------------------------------
# Example usage
# -------------------------------------------------------
# data = { "articles": [...], "medias": [] }
data = compute_media_stats(analysis_dic)
# print(json.dumps(data, indent=2, ensure_ascii=False))
data["medias"]

[{'nb_articles': 2800,
  'engagement_total': 10589,
  'score_influence': 847.4123,
  'rang': 1,
  'nom': 'Lefaso.net'},
 {'nb_articles': 1483,
  'engagement_total': 0,
  'score_influence': 444.9,
  'rang': 2,
  'nom': 'Sidwaya'},
 {'nb_articles': 795,
  'engagement_total': 0,
  'score_influence': 238.5,
  'rang': 3,
  'nom': 'AIB Media'},
 {'nb_articles': 366,
  'engagement_total': 0,
  'score_influence': 109.8,
  'rang': 4,
  'nom': "L'Observateur Paalga"},
 {'nb_articles': 50,
  'engagement_total': 0,
  'score_influence': 15.0,
  'rang': 5,
  'nom': 'FasoPresse'},
 {'nb_articles': 39,
  'engagement_total': 0,
  'score_influence': 11.7,
  'rang': 6,
  'nom': 'Burkina24'}]

In [14]:
import json

with open("../data/processed/final_db1.json", "w") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)