<a href="https://colab.research.google.com/github/Poly-Mathlete/-E-Social-Network-/blob/Models/models_EI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load (tweet, label) pairs from the CSV file
import csv

# Raw codes read from column 0, mapped to the textual labels used by the later cells.
# Rows carrying any other code are skipped.
LABEL_BY_RAW_CODE = {"0": "négatif", "4": "positif"}

tweets_data = []  # list of (tweet text, textual label) tuples
# newline="" is what the csv module asks for when it is given a file object, so that
# line breaks embedded inside quoted fields are parsed as part of the field.
with open("balanced_tweets_200k.csv", "r", encoding="latin1", newline="") as f:
    for row in csv.reader(f):
        # The label is read from column 0 and the text from column 5,
        # so shorter rows cannot be used.
        if len(row) < 6:
            continue
        label = LABEL_BY_RAW_CODE.get(row[0])
        if label is None:
            continue
        tweets_data.append((row[5], label))

# Split the pairs into two parallel lists
raw_tweets = [tweet for tweet, _ in tweets_data]
labels = [label for _, label in tweets_data]
print(raw_tweets[:5])
print(labels[:5])

FileNotFoundError: [Errno 2] No such file or directory: 'balanced_tweets_200k.csv'

In [None]:
import re #pour utiliser expressions régulières
from pathlib import Path #pour manipuler les fichiers

import nltk #notre bibliothèque de traitement de texte documenté
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer #tokenizer : transforme les phrases en liste de mots

# Download the NLTK resources
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# Tokenizer instance shared by every tweet
# (the original author chose it as more robust than word_tokenize)
tokenizer = TreebankWordTokenizer()

# --- Regex preparation ---
# Artifacts typical of oddly encoded tweets.
# NOTE(review): the first fragment is an escaped pipe with no alternation bar after it,
# so the first alternative only matches the literal text "|[Saut de retour à la ligne]".
# Confirm whether a lone "|" and a lone "[Saut ...]" marker were meant to match separately.
regex_artifacts_sources = re.compile(
    r'\|'
    r'\[Saut de retour à la ligne\]|'
    r'(?:<ed><U\+[0-9A-F]{4}><U\+[0-9A-F]{4}>)+|'
    r'<U\+[0-9A-F]{4,6}>|'
    r'<ed>'
)

# Mentioned user names (@someone)
regex_usernames = re.compile(r'@\w+')

# URLs (http or https)
regex_urls = re.compile(r'https?://\S+')

# Hashtags
regex_hashtags = re.compile(r'#\w+')

# Elided forms such as l', j', c' (word characters followed by an apostrophe).
# NOTE(review): written for French elisions; on English text it also deletes the stem
# of words like "don't" or "david's", leaving only the suffix. Confirm this is intended
# given that the stop-word language below is English.
regex_apostrophes = re.compile(r"\b\w+'")

# Double quotes
regex_quotes = re.compile(r'"')

# General punctuation
regex_punctuation = re.compile(r'[.,;:!?()\[\]{}\\/|`~^<>«»=]')

# Language of the stop-word list
langue = "english"
stop_words = set(stopwords.words(langue))


def clean_tweet(tweet_text):
    """Clean one raw tweet and return the list of tokens that are kept.

    Steps, in order: strip encoding artifacts, collapse doubled double-quotes,
    remove user names, URLs, hashtags, elided forms, quotes and leftover
    apostrophes, replace punctuation with spaces, lowercase, collapse whitespace,
    tokenize, then drop stop words. The token "rt" is always kept and is
    emitted in upper case ("RT").

    Parameters
    ----------
    tweet_text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Remaining tokens, lowercase except for "RT".
    """
    t = regex_artifacts_sources.sub('', tweet_text)
    t = t.replace('""', '"')            # doubled quotes ("" -> ")
    t = regex_usernames.sub('', t)
    t = regex_urls.sub('', t)
    t = regex_hashtags.sub('', t)
    t = regex_apostrophes.sub('', t)
    t = regex_quotes.sub('', t)
    t = t.replace("'", "")              # apostrophes that are still there
    t = regex_punctuation.sub(' ', t)
    t = t.lower()
    t = re.sub(r'\s{2,}', ' ', t).strip()

    kept = []
    for word in tokenizer.tokenize(t):
        lowered = word.lower()
        if lowered == 'rt':
            # Keep the retweet marker, in upper case, even though the text was lowercased
            kept.append(word.upper())
        elif lowered not in stop_words:
            kept.append(word)
    return kept


# One cleaned document per tweet, as a space-separated string of tokens.
# The previous version stored f"{filtered_tokens}", i.e. the Python repr of the list
# ("['oh', 'geez', ...]"), which put brackets, quotes and commas into every document.
cleaned_tweets = [" ".join(clean_tweet(tweet_text)) for tweet_text in raw_tweets]

# Optional export of the cleaned tweets (left disabled, as in the original notebook):
# output_dir = Path("CorpusRandomCleaned")
# output_dir.mkdir(parents=True, exist_ok=True)
# with open(output_dir / "cleaned_tweets.txt", "w", encoding="utf-8") as f:
#     f.write("\n".join(cleaned_tweets))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd

# TF-IDF settings gathered in one mapping so they are easy to read and tune
tfidf_settings = dict(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',   # built-in English stop-word list
    sublinear_tf=True,
    min_df=5,               # ignore terms present in fewer than 5 documents
    max_df=0.7,             # ignore terms present in more than 70% of the documents
    max_features=15000,     # cap on the vocabulary size
)
tfidf = TfidfVectorizer(**tfidf_settings)

In [None]:
# Numeric targets: +1 for "positif", -1 for any other label
Y = [1 if label == "positif" else -1 for label in labels]
n = len(cleaned_tweets)

# Split the cleaned texts first, then vectorise.
# The estimators below need a numeric feature matrix: feeding them the raw strings
# (np.array(cleaned_tweets)) fails with "could not convert string to float", and the
# later cells call .toarray() / .shape[1] on X_train, which requires the sparse
# TF-IDF matrix.
texts_train, texts_test, y_train, y_test = train_test_split(
    cleaned_tweets, Y, test_size=0.2, random_state=42
)

# Vocabulary and IDF weights are learned on the training tweets only, so nothing
# from the test tweets leaks into the features; the test set is only transformed.
X_train = tfidf.fit_transform(texts_train)
X_test = tfidf.transform(texts_test)

# Full feature matrix, kept under the names this cell has always exposed
data = tfidf.transform(cleaned_tweets)
X = data


Régression logistique

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Base estimator; C and penalty are overridden by the grid search below.
# (l1_ratio is deliberately not set: per the original note it is not available
# with the 'liblinear' solver.)
model = LogisticRegression(
    penalty='l2',
    C=1.0,                    # regularisation strength (searched below)
    class_weight='balanced',  # class balancing
    solver='liblinear',
    max_iter=1000,            # maximum number of iterations
    random_state=42           # for reproducibility
)

# Targets as int32 arrays with -1 mapped to 0 (1 stays 1)
y_train_np = np.array([(1 if y == 1 else 0) for y in y_train], dtype=np.int32)
y_test_np = np.array([(1 if y == 1 else 0) for y in y_test], dtype=np.int32)

# 6 values of C x 2 penalties = 12 candidates, each evaluated with 10-fold CV
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
grid = GridSearchCV(model, param_grid, cv=10, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train_np)
print(grid.best_params_)

# GridSearchCV refits the best candidate on the whole training set by default
# (refit=True), so best_estimator_ is ready to use; fitting it a second time
# only repeated that work.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test_np)
print("Test set accuracy: {:.2f}%".format((accuracy) * 100))

Fitting 10 folds for each of 12 candidates, totalling 120 fits


ValueError: 
All the 120 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1222, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: np.str_("['oh', 'geez', 'david', 'death', 'need', 'read', 'article', 'cnn']")

--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_logistic.py", line 1222, in fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: np.str_("['ahhhh', 'bullshit', 'got', 'school', 'hours']")


Random Forest

In [None]:
import numpy as np

# Dense copies of the sparse feature matrices
# (the original notes this is needed for cuML compatibility)
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Targets as float32 NumPy arrays (values are left as they are)
y_train_np = np.asarray(y_train, dtype=np.float32)
y_test_np = np.asarray(y_test, dtype=np.float32)

# Random forest hyper-parameters
rf_settings = {
    'n_estimators': 100,       # number of trees
    'criterion': 'gini',       # split criterion (gini or entropy)
    'max_depth': None,         # no limit on tree depth
    'min_samples_split': 2,    # minimum samples needed to split an internal node
    'min_samples_leaf': 2,     # minimum samples required at a leaf
    'random_state': 42,        # for reproducibility
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train_dense, y_train_np)

# Predictions and accuracy on the held-out set
y_pred_rf = rf_model.predict(X_test_dense)
accuracy_rf = rf_model.score(X_test_dense, y_test_np)
print(f"Random Forest Accuracy: {accuracy_rf:.2%}")

Random Forest Accuracy: 68.78%


Mixture of Experts

In [None]:
# Voting scheme of the ensemble:
#   'hard' -> majority vote on the predicted labels
#   'soft' -> uses the predicted probabilities of each expert
voting_mode = 'hard'

# Expert 1: regularised logistic regression
expert1 = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

# Expert 2: random forest
expert2 = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Expert 3: an SVM, added for diversity.
# Probability estimates make SVC.fit noticeably slower (extra internal
# cross-validation), and hard voting only calls predict(), so they are enabled
# only when soft voting actually needs them.
expert3 = SVC(probability=(voting_mode == 'soft'), random_state=42)

# The "Mixture of Experts" of this notebook is a scikit-learn VotingClassifier
moe_model = VotingClassifier(
    estimators=[('lr', expert1), ('rf', expert2), ('svm', expert3)],
    voting=voting_mode
)

# Train the ensemble
moe_model.fit(X_train, y_train)

# Predict on the held-out set
y_pred_moe = moe_model.predict(X_test)

# Accuracy on the held-out set
accuracy_moe = moe_model.score(X_test, y_test)
print(f"Mixture of Experts Accuracy: {accuracy_moe:.2%}")

[2025-06-04 22:00:43.631] [CUML] [info] Unused keyword parameter: random_state during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: dual during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: intercept_scaling during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: multi_class during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: random_state during cuML estimator initialization
[2025-06-04 22:00:43.639] [CUML] [info] Unused keyword parameter: warm_start during cuML estimator initialization
Mixture of Experts Accuracy: 72.60%


In [None]:
#!pip install tensorflow keras

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Feed-forward network: 128 -> dropout -> 64 -> dropout -> 1 (sigmoid)
input_layer = Input(shape=(X_train.shape[1],))
dense_layer_1 = Dense(128, activation='relu')(input_layer)
dropout_layer_1 = Dropout(0.5)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.5)(dense_layer_2)
# The output must read from dropout_layer_2. It used to be wired to dense_layer_2,
# which left the second dropout layer disconnected from the model.
output_layer = Dense(1, activation='sigmoid')(dropout_layer_2)  # sigmoid for binary classification

nn_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy',  # binary cross-entropy for binary classification
                 metrics=['accuracy'])

# The targets are -1 / 1; binary cross-entropy needs 0 / 1
y_train_nn = [(1 if y == 1 else 0) for y in y_train]
y_test_nn = [(1 if y == 1 else 0) for y in y_test]

# The network is fed dense arrays, so the sparse matrices are converted
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# NumPy arrays / lists -> TensorFlow tensors
X_train_tensor = tf.convert_to_tensor(X_train_dense, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test_dense, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train_nn, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test_nn, dtype=tf.float32)

# Train
nn_model.fit(X_train_tensor, y_train_tensor, epochs=50, batch_size=10, verbose=0)

# Evaluate
loss, accuracy_nn = nn_model.evaluate(X_test_tensor, y_test_tensor, verbose=0)
print(f"Neural Network Accuracy: {accuracy_nn:.2%}")

# Predict: the model outputs one probability per sample in an (n_samples, 1) array.
# Flatten it so each element is a scalar, then map to the -1 / 1 labels.
y_pred_nn_prob = nn_model.predict(X_test_tensor)
y_pred_nn = [(1 if prob > 0.5 else -1) for prob in y_pred_nn_prob.ravel()]

# Preview only: printing every label floods the notebook output
print("Neural Network Predictions:", y_pred_nn[:20])
print("Actual Test Labels:", y_test[:20])

Neural Network Accuracy: 70.73%
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Neural Network Predictions: [-1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 

In [None]:
# prompt: Do the same with XGBoost

#!pip install xgboost

import xgboost as xgb

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 'binary:logistic' works with 0 / 1 labels, while y_train / y_test hold -1 / 1:
# 1 -> 1, -1 -> 0
y_train_xgb = [(1 if y == 1 else 0) for y in y_train]
y_test_xgb = [(1 if y == 1 else 0) for y in y_test]

# Early stopping needs an evaluation set. Monitoring the test set would let the test
# data decide when training stops, so a validation part is carved out of the
# training data instead and the test set is only used for the final score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train_xgb, test_size=0.1, random_state=42
)

# XGBoost's native API works on DMatrix objects.
# (They are built once, directly with the 0 / 1 labels; the earlier version first
# built them with the -1 / 1 labels and then immediately rebuilt them.)
dtrain = xgb.DMatrix(X_fit, label=y_fit)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test_xgb)

params = {
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'eval_metric': 'logloss',        # evaluation metric
    'eta': 0.5,                      # learning rate
    'max_depth': 3,                  # maximum depth of the trees
    'subsample': 0.8,                # row subsample ratio
    'colsample_bytree': 0.8,         # column subsample ratio per tree
    'seed': 42                       # random seed for reproducibility
}

# Train; the last entry of the watchlist is the one used for early stopping
num_rounds = 100  # maximum number of boosting rounds
watchlist = [(dtrain, 'train'), (dval, 'eval')]

xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

# Predicted probabilities on the test set
y_pred_xgb_prob = xgb_model.predict(dtest)

# Probabilities -> 0 / 1 labels
y_pred_xgb = [1 if prob > 0.5 else 0 for prob in y_pred_xgb_prob]

# Same predictions in the original -1 / 1 encoding, for comparison with y_test
y_pred_xgb_original = [1 if pred == 1 else -1 for pred in y_pred_xgb]

# Accuracy, computed on the 0 / 1 encoding
accuracy_xgb = accuracy_score(y_test_xgb, y_pred_xgb)

print(f"XGBoost Accuracy: {accuracy_xgb:.2%}")

# Preview only: printing every label floods the notebook output
print("XGBoost Predictions (0/1):", y_pred_xgb[:20])
print("XGBoost Predictions (-1/1):", y_pred_xgb_original[:20])
print("Actual Test Labels (-1/1):", y_test[:20])
print("Actual Test Labels (0/1):", y_test_xgb[:20])


XGBoost Accuracy: 69.83%
XGBoost Predictions (0/1): [1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Targets as int32 NumPy arrays (values are left as they are)
y_train_np = np.asarray(y_train, dtype=np.int32)
y_test_np = np.asarray(y_test, dtype=np.int32)

# Baseline kNN classifier
knn_settings = {
    'n_neighbors': 3,        # number of neighbours considered
    'weights': 'uniform',    # weight function used in prediction (uniform or distance)
    'algorithm': 'auto',     # algorithm used to compute the nearest neighbours
    'leaf_size': 30,         # leaf size passed to BallTree or KDTree
    'p': 2,                  # power of the Minkowski metric (2 = Euclidean distance)
    'metric': 'minkowski',   # distance metric
}
knn_model = KNeighborsClassifier(**knn_settings)
knn_model.fit(X_train, y_train_np)

# Baseline predictions and accuracy (kept in variables, not printed)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test, y_test_np)

# Grid search over the odd values of k from 1 to 29, with 10-fold cross-validation
param_grid = {'n_neighbors': [k for k in range(1, 31) if k % 2 == 1]}
grid = GridSearchCV(knn_model, param_grid, cv=10, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train_np)
print(grid.best_params_)

# Score the best estimator on the held-out set
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test_np, y_pred)
print("Test set accuracy: {:.2f}%".format(test_accuracy * 100))

NameError: name 'y_train' is not defined

In [None]:
# Support Vector Machine (SVM) with a linear kernel
svm_model = SVC(
    C=1.0,                 # regularisation (the larger C, the weaker the regularisation)
    kernel='linear',       # kernel type ('linear', 'poly', 'rbf', 'sigmoid')
    gamma='scale',         # kernel coefficient for 'rbf', 'poly', 'sigmoid'
    probability=True,      # enables class-probability estimates
    random_state=42        # for reproducibility
)

svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

accuracy_svm = svm_model.score(X_test, y_test)
print(f"SVM Accuracy: {accuracy_svm:.2%}")

# Preview only: y_test is a plain list, so printing it whole dumps every label
# into the notebook output
print("SVM Predictions:", y_pred_svm[:20])
print("Actual Test Labels:", y_test[:20])

SVM Accuracy: 70.22%
SVM Predictions: [ 1  1 -1 ...  1  1  1]
Actual Test Labels: [1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1