# Climate change Text Classification

In [1]:
import pandas as pd
import numpy as np
import json
import urllib.request # Import the urllib.request module

# URL of the JSONL file
url = "https://www.sustainablefinance.uzh.ch/dam/jcr:df02e448-baa1-4db8-921a-58507be4838e/climate-fever-dataset-r1.jsonl"

# Function to read the JSONL file line by line
def read_jsonl_from_url(url):
    data = []
    with urllib.request.urlopen(url) as response:
        for line in response:
            data.append(json.loads(line.decode('utf-8')))
    return data

# Load the data into a pandas DataFrame
df = pd.DataFrame(read_jsonl_from_url(url))

In [2]:
# Display the DataFrame
df.head()

Unnamed: 0,claim_id,claim,claim_label,evidences
0,0,Global warming is driving polar bears toward e...,SUPPORTS,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,SUPPORTS,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,REFUTES,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,REFUTES,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,REFUTES,[{'evidence_id': 'Carbon dioxide in Earth's at...


In [3]:
print(df.describe())

       claim_id                                              claim  \
count      1535                                               1535   
unique     1535                                               1535   
top        3134  Over the last decade, heatwaves are five times...   
freq          1                                                  1   

       claim_label                                          evidences  
count         1535                                               1535  
unique           4                                               1534  
top       SUPPORTS  [{'evidence_id': 'Greenland ice sheet:43', 'ev...  
freq           654                                                  2  


In [4]:
print(df.shape)

(1535, 4)


In [5]:
df.value_counts("claim_label")

claim_label
SUPPORTS           654
NOT_ENOUGH_INFO    474
REFUTES            253
DISPUTED           154
Name: count, dtype: int64

#Start your project here.

# Question 1 : Divide the data in a balanced train set (85%) and a validation set (15%).

In [6]:
# Proportion pour l'ensemble de validation
val_ratio = 0.15

# Répartition équilibrée
train_indices = []
val_indices = []

for label in df["claim_label"].unique():
    class_indices = df[df["claim_label"] == label].index.values
    np.random.shuffle(class_indices)
    val_size = int(len(class_indices) * val_ratio)
    val_indices.extend(class_indices[:val_size])
    train_indices.extend(class_indices[val_size:])

train_df = df.loc[train_indices].reset_index(drop=True)
val_df = df.loc[val_indices].reset_index(drop=True)

print("Proportions dans l'ensemble d'entraînement:")
print(train_df["claim_label"].value_counts(normalize=True))

print("\nProportions dans l'ensemble de validation:")
print(val_df["claim_label"].value_counts(normalize=True))


Proportions dans l'ensemble d'entraînement:
claim_label
SUPPORTS           0.425727
NOT_ENOUGH_INFO    0.308576
REFUTES            0.165391
DISPUTED           0.100306
Name: proportion, dtype: float64

Proportions dans l'ensemble de validation:
claim_label
SUPPORTS           0.427948
NOT_ENOUGH_INFO    0.310044
REFUTES            0.161572
DISPUTED           0.100437
Name: proportion, dtype: float64


# Question 2 : train a simple RNN model to predict column “claim_label” as a function of column “claim” without too much overfitting. 
> You will not have enough time nor data to create a test set.

In [7]:
import tensorflow as tf

# Vérification de la disponibilité du GPU
print("GPU disponible :", tf.config.list_physical_devices('GPU'))

# Optionnel : Limiter la mémoire GPU si nécessaire
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Alloue dynamiquement la mémoire GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


# Préparation des données
label_mapping = {label: idx for idx, label in enumerate(train_df["claim_label"].unique())}
train_df["claim_label"] = train_df["claim_label"].map(label_mapping)
val_df["claim_label"] = val_df["claim_label"].map(label_mapping)

# Tokenisation et padding
vocab_size = 10000
max_length = 100
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df["claim"])

train_sequences = tokenizer.texts_to_sequences(train_df["claim"])
val_sequences = tokenizer.texts_to_sequences(val_df["claim"])

train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length, padding="post")
val_padded = tf.keras.preprocessing.sequence.pad_sequences(val_sequences, maxlen=max_length, padding="post")

train_labels = train_df["claim_label"].values
val_labels = val_df["claim_label"].values

# Construction du modèle
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=max_length),
    tf.keras.layers.SimpleRNN(32, return_sequences=True),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.SimpleRNN(16),
    tf.keras.layers.Dense(len(label_mapping), activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

history = model.fit(train_padded, train_labels, validation_data=(val_padded, val_labels), epochs=20, batch_size=32, callbacks=[early_stopping])




Epoch 1/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 44ms/step - accuracy: 0.3310 - loss: 1.3575 - val_accuracy: 0.4279 - val_loss: 1.2601
Epoch 2/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.4167 - loss: 1.2700 - val_accuracy: 0.4279 - val_loss: 1.2513
Epoch 3/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.4378 - loss: 1.2472 - val_accuracy: 0.4192 - val_loss: 1.3437
Epoch 4/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.5741 - loss: 1.1173 - val_accuracy: 0.4061 - val_loss: 1.3910
Epoch 5/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.6545 - loss: 0.9224 - val_accuracy: 0.4105 - val_loss: 1.4712


Affichage des performances du modèle :

In [8]:
# Évaluation de l'accuracy sur l'ensemble de validation
val_loss, val_accuracy = model.evaluate(val_padded, val_labels, verbose=0)
print(f"Validation Accuracy: {100*val_accuracy:.4f}%")
#print(f"Validation Loss: {val_loss:.4f}")


Validation Accuracy: 42.7948%


# Question 3 : fine-tune (https://huggingface.co/docs/transformers/training) a pretrained model from huggingface.co, in two slightly different ways, to predict column “claim_label” as a function of column “claim” without too much overfitting. 
> You will not have enough time nor data to create a test set. You will need to tokenize your dataset.

In [11]:
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer

# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokeniser les données
def tokenize_function(claim):
    return tokenizer(
        claim,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="tf",
    )

# Appliquer la tokenisation à chaque texte dans la colonne "claim"
train_encodings = train_df["claim"].apply(tokenize_function)
val_encodings = val_df["claim"].apply(tokenize_function)

# Convertir les encodages en format utilisable par TensorFlow (dict de tenseurs)
def format_encodings(encodings):
    return {key: tf.constant([enc[key].numpy() for enc in encodings]) for key in encodings[0]}

train_encodings = format_encodings(train_encodings)
val_encodings = format_encodings(val_encodings)


In [17]:
# Convertir les labels en tenseurs
train_labels = tf.convert_to_tensor(train_df["claim_label"].values, dtype=tf.int32)
val_labels = tf.convert_to_tensor(val_df["claim_label"].values, dtype=tf.int32)

# Charger un modèle pré-entraîné pour la classification
model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(train_df["claim_label"].unique())
)

# Compiler le modèle
model.compile(
    optimizer="adam",  # Utilisation du nom directement
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

try:
    # Entraîner le modèle
    with tf.device('/GPU:0'):
        # Placez ici le code d'entraînement du modèle
        history = model.fit(
            {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
            train_labels,
            validation_data=(
                {"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]},
                val_labels
            ),
            epochs=20,  # Vous pouvez augmenter pour plus d'entraînement
            batch_size=32
        )
except:
    history = model.fit(
            {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
            train_labels,
            validation_data=(
                {"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]},
                val_labels
            ),
            epochs=5,  # Vous pouvez augmenter pour plus d'entraînement
            batch_size=32
        )


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.






In [None]:
# Geler les couches de base (le modèle pré-entraîné)
for layer in model.layers[:-1]:  # Gèle toutes les couches sauf la dernière
    layer.trainable = False

# Compiler à nouveau le modèle
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"]
)

# Entraîner le modèle
history_partial_fine_tuning = model.fit(
    {"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"]},
    train_labels,
    validation_data=(
        {"input_ids": val_encodings["input_ids"], "attention_mask": val_encodings["attention_mask"]},
        val_labels
    ),
    epochs=3,  # Limité pour éviter l'overfitting
    batch_size=16
)
