In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Dataset 2

In [2]:
dataset2 = pd.read_csv('dalas/project/githubProject/ConflictPrediction/dataset2/dataset2_before_splitting.csv')

In [3]:
## On repasse le type des colonnes 'was_interpolated' en int au lieu de bool

cols_to_convert = [
    c for c in dataset2.columns 
    if "_was_interpolated" in c
]

for col in cols_to_convert:
    dataset2[col] = dataset2[col].astype(int)



In [None]:
#dataset2 = dataset2.drop(columns=['Unnamed: 0'])
#dataset2.to_csv('dalas/project/githubProject/ConflictPrediction/dataset2/dataset2_before_splitting.csv', index=False)

In [8]:
## Code pour split le dataset 1 en train/test 90-10 %

X = dataset2.drop(columns=['war'])
y = dataset2['war']

# Split train/test final
X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size=0.1,# 10% pour test
    stratify=y, #pour garder la répartition des classes
    random_state=42 #pour mettre une random seed fixe
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (6610, 166)
Test shape: (735, 166)


In [12]:
from sklearn.neural_network import MLPClassifier
### Traitement avec un MLPClassifier 

mlp = MLPClassifier(# paramètres du modèle à ajuster
    hidden_layer_sizes=(128, 64),
    activation='relu',
    solver='adam',
    max_iter=200,
    random_state=42
)
F1ScoreListMLP = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i,(train_index,val_index) in enumerate(skf.split(X_train, y_train)):

    # On récupère les sous-datasets d'entraînement et de validation
    X_tr = X_train.iloc[train_index]
    X_val=  X_train.iloc[val_index]
    y_tr  = y_train.iloc[train_index] 
    y_val = y_train.iloc[val_index]

    # Entraînement du modèle MLP
    mlp.fit(X_tr, y_tr)
    y_pred = mlp.predict(X_val)



    print(f"Fold {i}:")

    # Métriques MLP
    accMLP = accuracy_score(y_val, y_pred)
    precMLP = precision_score(y_val, y_pred)
    recMLP = recall_score(y_val, y_pred)
    f1MLP = f1_score(y_val, y_pred)

    
    F1ScoreListMLP.append(f1MLP)
    

    print("MLP Metrics:")
    print(f"Accuracy : {accMLP:.4f}")
    print(f"Precision: {precMLP:.4f}")
    print(f"Recall   : {recMLP:.4f}")
    print(f"F1-score : {f1MLP:.4f}")

print("Average F1-score MLP over 5 folds:", np.mean(F1ScoreListMLP))
print("Standard Deviation of F1-score across all folds for MLP: " + str(np.std(F1ScoreListMLP)))


Fold 0:
MLP Metrics:
Accuracy : 0.8903
Precision: 0.8660
Recall   : 0.8612
F1-score : 0.8636
Fold 1:
MLP Metrics:
Accuracy : 0.9017
Precision: 0.8711
Recall   : 0.8874
F1-score : 0.8792
Fold 2:
MLP Metrics:
Accuracy : 0.9274
Precision: 0.9024
Recall   : 0.9193
F1-score : 0.9108
Fold 3:
MLP Metrics:
Accuracy : 0.9032
Precision: 0.8872
Recall   : 0.8705
F1-score : 0.8788
Fold 4:
MLP Metrics:
Accuracy : 0.8464
Precision: 0.8300
Recall   : 0.7786
F1-score : 0.8035
Average F1-score MLP over 5 folds: 0.8671658604473531
Standard Deviation of F1-score across all folds for MLP: 0.035354569257847926


### SKF avec K =10 (on garde le même skf pour avoir la meme répartition)

In [20]:
skf_10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42) # On garde le même skf pour la comparaison 

In [None]:


## Modèle XGBoost

F1ScoreListXGB = []


for i,(train_index,val_index) in enumerate(skf_10.split(X_train, y_train)):

    # On récupère les sous-datasets d'entraînement et de validation
    X_tr = X_train.iloc[train_index]
    X_val=  X_train.iloc[val_index]
    y_tr  = y_train.iloc[train_index] 
    y_val = y_train.iloc[val_index]

    # Entrainement du modèle XGBoost
     
    modelXgb = xgb.XGBClassifier(# Hyperparamètres à revoir
            n_estimators=100,
            max_depth=5,
            learning_rate=1,
            random_state=42
        )
    # Entraînement
    modelXgb.fit(X_tr, y_tr)
    y_predXgb = modelXgb.predict(X_val)
    print(f"Fold {i}:")

    # Métriques XGBoost
    accXGB = accuracy_score(y_val, y_predXgb)
    precXGB = precision_score(y_val, y_predXgb)
    recXGB = recall_score(y_val, y_predXgb)
    f1XGB = f1_score(y_val, y_predXgb)

    F1ScoreListXGB.append(f1XGB)

    print("XGBoost Metrics:")
    #print(f"Accuracy : {accXGB:.4f}")
    #print(f"Precision: {precXGB:.4f}")
    #print(f"Recall   : {recXGB:.4f}")
    print(f"F1-score : {f1XGB:.4f}")

print("\nAverage F1-score across all folds for XGBoost: " + str(np.mean(F1ScoreListXGB)))
print ("Standard Deviation of F1-score across all folds for XGBoost: " + str(np.std(F1ScoreListXGB)))



Fold 0:
XGBoost Metrics:
F1-score : 0.8885
Fold 1:
XGBoost Metrics:
F1-score : 0.8906
Fold 2:
XGBoost Metrics:
F1-score : 0.8854
Fold 3:
XGBoost Metrics:
F1-score : 0.8794
Fold 4:
XGBoost Metrics:
F1-score : 0.9052
Fold 5:
XGBoost Metrics:
F1-score : 0.8945
Fold 6:
XGBoost Metrics:
F1-score : 0.9014
Fold 7:
XGBoost Metrics:
F1-score : 0.8919
Fold 8:
XGBoost Metrics:
F1-score : 0.8606
Fold 9:
XGBoost Metrics:
F1-score : 0.8915

Average F1-score across all folds for XGBoost: 0.8889030012666824
Standard Deviation of F1-score across all folds for XGBoost: 0.011740895149880314


In [21]:
## Modèle CatBoost

from catboost import CatBoostClassifier

F1ScoreListCB = []



for i, (train_index, val_index) in enumerate(skf_10.split(X_train, y_train)):

    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[val_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    # CatBoostClassifier
    modelCB = CatBoostClassifier( # A revoir
        iterations=300,        
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        verbose=False,         
        random_seed=42
    )

    modelCB.fit(X_tr, y_tr)

    y_predCB = modelCB.predict(X_val)

    print(f"Fold {i}:")
    accCB = accuracy_score(y_val, y_predCB)
    precCB = precision_score(y_val, y_predCB)
    recCB = recall_score(y_val, y_predCB)
    f1CB = f1_score(y_val, y_predCB)

    F1ScoreListCB.append(f1CB)

    print("CatBoost Metrics:")
    #print(f"Accuracy : {accCB:.4f}")
    #print(f"Precision: {precCB:.4f}")
    #print(f"Recall   : {recCB:.4f}")
    print(f"F1-score : {f1CB:.4f}")

print("\nAverage F1-score CatBoost:", np.mean(F1ScoreListCB))
print("Std CatBoost:", np.std(F1ScoreListCB))


Fold 0:
CatBoost Metrics:
F1-score : 0.8829
Fold 1:
CatBoost Metrics:
F1-score : 0.9125
Fold 2:
CatBoost Metrics:
F1-score : 0.9231
Fold 3:
CatBoost Metrics:
F1-score : 0.9087
Fold 4:
CatBoost Metrics:
F1-score : 0.9056
Fold 5:
CatBoost Metrics:
F1-score : 0.9052
Fold 6:
CatBoost Metrics:
F1-score : 0.9062
Fold 7:
CatBoost Metrics:
F1-score : 0.9049
Fold 8:
CatBoost Metrics:
F1-score : 0.8713
Fold 9:
CatBoost Metrics:
F1-score : 0.8941

Average F1-score CatBoost: 0.9014599032516966
Std CatBoost: 0.014202074634989919


In [None]:
#GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier

F1ScoreListGB = []

for i, (train_index, val_index) in enumerate(skf_10.split(X_train, y_train)):

    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[val_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    modelGB = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=3
    )

    modelGB.fit(X_tr, y_tr)
    y_predGB = modelGB.predict(X_val)

    print(f"Fold {i}:")
    accGB = accuracy_score(y_val, y_predGB)
    precGB = precision_score(y_val, y_predGB)
    recGB = recall_score(y_val, y_predGB)
    f1GB = f1_score(y_val, y_predGB)

    F1ScoreListGB.append(f1GB)

    print("GradientBoosting Metrics:")
    #print(f"Accuracy : {accGB:.4f}")
    #print(f"Precision: {precGB:.4f}")
    #print(f"Recall   : {recGB:.4f}")
    print(f"F1-score : {f1GB:.4f}")

print("\nAverage F1-score GB:", np.mean(F1ScoreListGB))
print("Std GB:", np.std(F1ScoreListGB))


Fold 0:
GradientBoosting Metrics:
F1-score : 0.6908
Fold 1:
GradientBoosting Metrics:
F1-score : 0.6907
Fold 2:
GradientBoosting Metrics:
F1-score : 0.6637
Fold 3:
GradientBoosting Metrics:
F1-score : 0.6848
Fold 4:
GradientBoosting Metrics:
F1-score : 0.7035


In [23]:
#ExtraTreesClassifier

from sklearn.ensemble import ExtraTreesClassifier

F1ScoreListET = []

for i, (train_index, val_index) in enumerate(skf_10.split(X_train, y_train)):

    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[val_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    modelET = ExtraTreesClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )

    modelET.fit(X_tr, y_tr)
    y_predET = modelET.predict(X_val)

    print(f"Fold {i}:")
    accET = accuracy_score(y_val, y_predET)
    precET = precision_score(y_val, y_predET)
    recET = recall_score(y_val, y_predET)
    f1ET = f1_score(y_val, y_predET)

    F1ScoreListET.append(f1ET)

    print("ExtraTrees Metrics:")
    #print(f"Accuracy : {accET:.4f}")
    #print(f"Precision: {precET:.4f}")
    #print(f"Recall   : {recET:.4f}")
    print(f"F1-score : {f1ET:.4f}")

print("\nAverage F1-score ET:", np.mean(F1ScoreListET))
print("Std ET:", np.std(F1ScoreListET))

Fold 0:
ExtraTrees Metrics:
F1-score : 0.9662
Fold 1:
ExtraTrees Metrics:
F1-score : 0.9509
Fold 2:
ExtraTrees Metrics:
F1-score : 0.9719
Fold 3:
ExtraTrees Metrics:
F1-score : 0.9513
Fold 4:
ExtraTrees Metrics:
F1-score : 0.9645
Fold 5:
ExtraTrees Metrics:
F1-score : 0.9695
Fold 6:
ExtraTrees Metrics:
F1-score : 0.9679
Fold 7:
ExtraTrees Metrics:
F1-score : 0.9621
Fold 8:
ExtraTrees Metrics:
F1-score : 0.9432
Fold 9:
ExtraTrees Metrics:
F1-score : 0.9598

Average F1-score ET: 0.9607242673000433
Std ET: 0.008887213470866927


In [24]:
#RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

F1ScoreListRF = []

for i, (train_index, val_index) in enumerate(skf_10.split(X_train, y_train)):

    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[val_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[val_index]

    modelRF = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )

    modelRF.fit(X_tr, y_tr)
    y_predRF = modelRF.predict(X_val)

    print(f"Fold {i}:")
    accRF = accuracy_score(y_val, y_predRF)
    precRF = precision_score(y_val, y_predRF)
    recRF = recall_score(y_val, y_predRF)
    f1RF = f1_score(y_val, y_predRF)

    F1ScoreListRF.append(f1RF)

    print("RandomForest Metrics:")
    #print(f"Accuracy : {accRF:.4f}")
    #print(f"Precision: {precRF:.4f}")
    #print(f"Recall   : {recRF:.4f}")
    print(f"F1-score : {f1RF:.4f}")

print("\nAverage F1-score RF:", np.mean(F1ScoreListRF))
print("Std RF:", np.std(F1ScoreListRF))


Fold 0:
RandomForest Metrics:
F1-score : 0.9492
Fold 1:
RandomForest Metrics:
F1-score : 0.9488
Fold 2:
RandomForest Metrics:
F1-score : 0.9529
Fold 3:
RandomForest Metrics:
F1-score : 0.9459
Fold 4:
RandomForest Metrics:
F1-score : 0.9468
Fold 5:
RandomForest Metrics:
F1-score : 0.9577
Fold 6:
RandomForest Metrics:
F1-score : 0.9600
Fold 7:
RandomForest Metrics:
F1-score : 0.9464
Fold 8:
RandomForest Metrics:
F1-score : 0.9207
Fold 9:
RandomForest Metrics:
F1-score : 0.9480

Average F1-score RF: 0.9476277837799894
Std RF: 0.010082585452266746
