In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import plotly.express as px



In [2]:
df = pd.read_csv("hf://datasets/readerbench/fakenews-climate-fr/fake-fr.csv")
df.head()

Unnamed: 0,Text,Label
0,\nUne réflexion collective qui est aussi quoi ...,fake
1,\nDès l’élaboration de la Convention-cadre des...,true
2,"\n""On peut parler d'invasion, d'invasion génér...",true
3,"\nLa Rochelle, le 14 mars 2008. Un navire en p...",biased
4,\nJ’ai eu le plaisir de visiter le Japon il n’...,fake


In [4]:
df.columns


Index(['Text', 'Label'], dtype='object')

In [None]:
import re



FR_ABBREV = [
    "M", "Mme", "Mlle", "Dr", "Pr", "Prof",
    "St", "Ste",          
    "etc", "p.ex", "ex", "cf", "c.-à-d", "n°",
    "av", "apr",          
]


ABBR_PATTERN = r"|".join(map(re.escape, FR_ABBREV))

def split_fr_sentences(text: str) -> list[str]:
    if not isinstance(text, str) or not text.strip():
        return []

    protected = re.sub(rf"\b({ABBR_PATTERN})\.", r"\1<DOT>", text)

    
    parts = re.split(r"[.!?]+\s*", protected)

   
    sentences = []
    for p in parts:
        p = p.replace("<DOT>", ".").strip()
        
        if p and not re.fullmatch(r"[\W_]+", p):
            sentences.append(p)

    return sentences

In [18]:
sentences = []
labels = []

for text, label in df[["Text", "Label"]].itertuples(index=False):
    for sentence in split_fr_sentences(text):
        sentences.append(sentence)
        labels.append(label)

df_sent = pd.DataFrame({
    "text": sentences,
    "label": labels
})


In [20]:
df_sent.sample(5, random_state=0)


Unnamed: 0,text,label
49378,"Pour la Russie, cela signifierait un allongeme...",true
26995,statistiques,true
46511,"Ces trois dernières années, les négociateurs a...",true
9580,Une canicule de type « une fois tous les 50 an...,true
66973,"ont trouvé « seulement 0,03 pour cent d’approb...",fake


In [21]:
df_sent.to_csv("fakenews_data_preprocessed.csv", index=False)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X = df["Text"].astype(str)
y = df["Label"].astype(str).str.strip().str.lower()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, max_features=5000)),
    ("clf", LogisticRegression(max_iter=2000))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

      biased       0.73      0.13      0.22        63
        fake       0.74      0.67      0.70       127
        true       0.77      0.93      0.84       297

    accuracy                           0.76       487
   macro avg       0.74      0.58      0.59       487
weighted avg       0.75      0.76      0.72       487

[[  8  13  42]
 [  0  85  42]
 [  3  17 277]]


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svm_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, max_features=5000)),
    ("clf", LinearSVC())
])

svm_pipe.fit(X_train, y_train)
y_pred = svm_pipe.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      biased       0.62      0.25      0.36        63
        fake       0.73      0.78      0.75       127
        true       0.82      0.90      0.86       297

    accuracy                           0.78       487
   macro avg       0.72      0.64      0.66       487
weighted avg       0.77      0.78      0.77       487



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svm_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True)),
    ("clf", LinearSVC())
])

param_grid = {
    # TF-IDF
    "tfidf__max_features": [2000, 5000, 10000, None],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__min_df": [1, 2, 5],
    "tfidf__max_df": [0.8, 0.9, 1.0],
    "tfidf__sublinear_tf": [True, False],

    # SVM linéaire
    "clf__C": [0.1, 0.5, 1, 2, 5],
    "clf__class_weight": [None, "balanced"],
}

grid = GridSearchCV(
    svm_pipe,
    param_grid=param_grid,
    scoring="f1_macro",   
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=True; total time=   1.3s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=False; total time=   1.4s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=True; total time=   1.4s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=False; total time=   1.4s
[CV] END clf__C=0.1, clf__class_weight=None, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1), tfidf__sublinear_tf=True; total time=   1.3s
[CV] END clf__C=0.1, clf__class_weight

In [32]:
grid.best_params_

{'clf__C': 0.5,
 'clf__class_weight': 'balanced',
 'tfidf__max_df': 0.8,
 'tfidf__max_features': 10000,
 'tfidf__min_df': 5,
 'tfidf__ngram_range': (1, 1),
 'tfidf__sublinear_tf': False}

In [47]:
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

fig = px.imshow(confusion_matrix(y_test, y_pred), 
          text_auto=True, 
          color_continuous_scale="Blues")

fig.update_xaxes(title_text="Prédictions")
fig.update_yaxes(title_text="Valeurs réelles")
fig.update_layout(showlegend=False)
fig.update_layout(width=600, height=500)

fig.show()


              precision    recall  f1-score   support

      biased       0.49      0.37      0.42        63
        fake       0.75      0.81      0.78       127
        true       0.84      0.86      0.85       297

    accuracy                           0.78       487
   macro avg       0.69      0.68      0.68       487
weighted avg       0.77      0.78      0.77       487



In [34]:
new_texts = [
    "Le changement climatique est un complot inventé par les scientifiques.",
    "Les émissions de CO2 augmentent la température moyenne globale."
]

predictions = best_model.predict(new_texts)

for text, label in zip(new_texts, predictions):
    print(f"TEXTE: {text}\n→ PRÉDICTION: {label}\n")


TEXTE: Le changement climatique est un complot inventé par les scientifiques.
→ PRÉDICTION: true

TEXTE: Les émissions de CO2 augmentent la température moyenne globale.
→ PRÉDICTION: true



In [29]:
from sklearn.naive_bayes import MultinomialNB

nb_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, max_features=5000)),
    ("clf", MultinomialNB())
])

nb_pipe.fit(X_train, y_train)
y_pred = nb_pipe.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      biased       0.00      0.00      0.00        63
        fake       0.85      0.40      0.55       127
        true       0.68      0.98      0.80       297

    accuracy                           0.70       487
   macro avg       0.51      0.46      0.45       487
weighted avg       0.64      0.70      0.63       487



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [30]:
from sklearn.ensemble import RandomForestClassifier

rf_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=True, max_features=5000)),
    ("clf", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))
])

rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

      biased       0.00      0.00      0.00        63
        fake       0.68      0.51      0.59       127
        true       0.72      0.95      0.82       297

    accuracy                           0.71       487
   macro avg       0.47      0.49      0.47       487
weighted avg       0.62      0.71      0.65       487



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
