In [None]:
import time
from ensurepip import bootstrap  # NOTE(review): not referenced in the visible cells; looks like an accidental auto-import — confirm, then remove

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


# Location of the bag-of-words tweet dataset.
# NOTE(review): absolute, machine-specific path — the notebook only runs on the
# author's machine until this points at a project-relative data directory.
TWEETS_BOW_CSV = r"C:\Users\aubin\OneDrive - Aix-Marseille Université\AIX EN PROVENCE\MASTER 1 ECONOMETRIE STATISTIQUE\S2\Machine_Learning\projet_final\data\tweets_bow (1).csv"

tweet_bow = pd.read_csv(TWEETS_BOW_CSV)

# Collapse the target to two classes: 0 stays 0, every other value becomes 1.
# A vectorised comparison replaces the per-row Python lambda; the resulting
# values are the same, and the step can be re-run without changing the column.
tweet_bow['label'] = (tweet_bow['label'] != 0).astype('int64')

# Features are every column except the target.
X = tweet_bow.drop(columns=['label'])
y = tweet_bow['label']

# 80/20 hold-out split, stratified on the binary label, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


def evaluer_modele(nom, modele, X_train, y_train, X_test, y_test, hyperparams):
    """Fit ``modele``, print its test-set report and return a summary row.

    Parameters
    ----------
    nom
        Display name, used in the printed header and in the "Modèle" field.
    modele
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Data used for prediction and for every reported metric.
    hyperparams
        Description of the hyper-parameters; printed and stored unchanged.

    Returns
    -------
    dict
        Keys "Modèle", "Précision", "Rappel", "F1-score pondéré", "Accuracy",
        "Temps entraînement (s)" and "Hyperparamètres". Metrics are rounded to
        4 decimals and the training time to 2.

    Notes
    -----
    Only the ``fit`` call is timed; prediction and scoring are excluded.
    Precision and recall are called without ``average``, so scikit-learn's
    default (binary averaging on the positive class) applies, whereas the F1
    score is explicitly weighted across classes.
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system clock is adjusted during training.
    debut = time.perf_counter()
    modele.fit(X_train, y_train)
    duree = round(time.perf_counter() - debut, 2)

    y_pred = modele.predict(X_test)

    print(f"\n=== Résultats pour {nom} ===")
    print("Classification Report :\n", classification_report(y_test, y_pred, digits=4))
    print("Temps d'entraînement :", duree, "secondes")
    print("Hyperparamètres :", hyperparams)

    return {
        "Modèle": nom,
        "Précision": round(precision_score(y_test, y_pred), 4),
        "Rappel": round(recall_score(y_test, y_pred), 4),
        "F1-score pondéré": round(f1_score(y_test, y_pred, average='weighted'), 4),
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "Temps entraînement (s)": duree,
        "Hyperparamètres": hyperparams
    }


# --- Baseline: Random Forest with hand-picked settings -----------------------
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)

# One summary dict per evaluated model, as returned by evaluer_modele.
resultats = []

resultats.append(evaluer_modele(
    "Random Forest",
    rf_model,
    X_train, y_train, X_test, y_test,
    "n_estimators=100, max_depth=None, criterion='gini'"
))


# --- Hyper-parameter search ---------------------------------------------------
# 2 * 3 * 2 * 2 * 1 * 2 = 48 candidate configurations; with 5-fold
# cross-validation that is 240 fits, hence the parallel run below.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
    'class_weight': [None, 'balanced']
}

# Candidates are ranked by weighted F1, the same averaging evaluer_modele uses
# for its F1 figure. n_jobs=-1 runs the fits on all available cores.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Only the training split is used for the search; the test set stays untouched.
grid_rf.fit(X_train, y_train)


print(" Meilleurs hyperparamètres trouvés :")
print(grid_rf.best_params_)

print("\n Score moyen (F1 pondéré) sur validation croisée :")
print(round(grid_rf.best_score_, 4))


# With GridSearchCV's default refit=True, best_estimator_ has already been
# refitted on the whole training split using the best configuration.
best_rf = grid_rf.best_estimator_
y_pred_rf_opt = best_rf.predict(X_test)

print("\n Évaluation finale sur test set (modèle optimisé) :")
print(classification_report(y_test, y_pred_rf_opt, digits=4))


Comparison of the default and tuned Random Forest on the test set

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt


# Default Random Forest: library defaults, only the seed is pinned.
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Classification Report :\n", classification_report(y_test, y_pred, digits=4))


# Tuned Random Forest with explicitly written hyper-parameters.
# NOTE(review): every value lies inside the grid searched earlier; presumably
# these are the reported best_params_ — confirm they still match after a re-run.
rf_tuned = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    class_weight='balanced',
    bootstrap=True,
    random_state=42
).fit(X_train, y_train)
y_pred_rf_tuned = rf_tuned.predict(X_test)
print("Classification Report :\n", classification_report(y_test, y_pred_rf_tuned, digits=4))



# Test error (1 - accuracy) and weighted F1 for each model.
error, error_tuned = (
    1-accuracy_score(y_test, y_pred),
    1-accuracy_score(y_test, y_pred_rf_tuned),
)
f1, f1_score_tuned = (
    f1_score(y_test, y_pred, average='weighted'),
    f1_score(y_test, y_pred_rf_tuned, average='weighted'),
)

# Both deltas are oriented so that a positive value means the tuned model is
# better: lower error, higher F1.
delta_error = error - error_tuned
delta_f1 = f1_score_tuned - f1

Plotting the test error and weighted F1-score of both models

In [None]:
# NOTE(review): absolute, machine-specific output path — savefig fails on any
# other machine until this points at a project-relative output directory.
FIGURE_PATH = r"C:\Users\aubin\OneDrive - Aix-Marseille Université\AIX EN PROVENCE\MASTER 1 ECONOMETRIE STATISTIQUE\S2\Machine_Learning\projet_final\output\grading_criterion_2.2_rf.png"


def _zoomed_ylim(values, rel_pad=0.5, min_pad=0.002):
    """Return ``(low, high)`` y-limits that tightly frame ``values``.

    The window is padded by ``rel_pad`` times the spread of the values, and by
    at least ``min_pad`` so that identical values still get a visible window.
    The lower bound never drops below 0.
    """
    lowest, highest = min(values), max(values)
    pad = max((highest - lowest) * rel_pad, min_pad)
    return max(0.0, lowest - pad), highest + pad


def _draw_comparison(ax, scores, title, ylabel, caption):
    """Draw one two-bar panel (default vs. tuned model) on ``ax``."""
    ax.bar(['RF', 'RF_tuned'], scores, color=['gray', 'green'])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # The two scores differ only slightly, so the axis is zoomed around them.
    # Limits come from the data: fixed limits would clip or hide the bars as
    # soon as a re-run produces different scores.
    ax.set_ylim(*_zoomed_ylim(scores))
    ax.grid(axis='y', linestyle='dotted', alpha=0.4)
    # Caption sits below the axis, positioned in axes coordinates.
    ax.text(0.5, -0.15, caption, ha='center',
            fontsize=11, fontweight='bold', transform=ax.transAxes)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

_draw_comparison(
    ax1, [error, error_tuned],
    'Difference between errors of Random Forest model',
    'Error',
    f"Δ Test Error = {delta_error:.3f}",
)
_draw_comparison(
    ax2, [f1, f1_score_tuned],
    'Difference of average f1-score of Random Forest model',
    'f1_score',
    f"Δ F1-score = {delta_f1:.3f}",
)

plt.suptitle('Comparison of the performance of the Random Forest model', fontsize=14)
plt.tight_layout()
plt.savefig(FIGURE_PATH, dpi=300, bbox_inches='tight')
plt.show()