In [4]:
# --- Extraction depuis CSV ---
import csv
tweets_data = []
with open("MMM.csv", "r", encoding="latin1") as f:
    reader = csv.reader(f)
    for row in reader:
        if len(row) >= 6:
            raw_label = row[0]
            tweet = row[5]

            if raw_label == "0":
                label = "négatif"
            elif raw_label == "4":
                label = "positif"
            else:
                continue  # On ignore les autres labels

            tweets_data.append((tweet, label))

# --- Extraction des colonnes ---
raw_tweets = [t[0] for t in tweets_data]
labels = [t[1] for t in tweets_data]
print(raw_tweets[:5])
print(labels[:5])

["Looks like the sun finally located Trondheim ;-) hope summer's on it's way ", "A long weekend begins. The sun is shining and I'm happy ! Exams soon ", 'to the beach we go! hope it stays nice... ', '@JBFutureboy I missed it  busted need to do a reunion tour. That would make my year. No joke.', "Why I can't change my background image?? "]
['positif', 'négatif', 'positif', 'négatif', 'négatif']


In [5]:
#%load_ext cuml.accel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
    sublinear_tf=True,  # Atténuation de l'impact des termes fréquents
    analyzer='word',
    stop_words=[],
    #min_df=0.2,           # Ignorer les termes rares
    max_df=5        # Éliminer les termes trop courants
)

In [6]:
data = tfidf.fit_transform(raw_tweets)
X = data
Y = [1 if label == "positif" else -1 for label in labels]
n = data.shape[0]//5

X_train, X_test, y_train, y_test = train_test_split(X[:n], Y[:n], test_size=0.2, random_state=42)


Régression logistique

In [7]:
import numpy as np
from sklearn.model_selection import GridSearchCV

model = LogisticRegression(
    penalty='l2',
    C=1.0,               # Régularisation
    #l1_ratio=0.4,  # Ratio de régularisation L1 # This parameter is not available for 'liblinear' solver
    class_weight='balanced',  # Équilibrage des classes
    solver='liblinear',  # Algorithme de résolution
    max_iter=1000,       # Nombre maximal d'itérations
    random_state=42      # For reproducibility
)

# Convert y_train and y_test to NumPy arrays with a specified dtype and map -1 to 0
y_train_np = np.array([(1 if y == 1 else 0) for y in y_train], dtype=np.int32)
y_test_np = np.array([(1 if y == 1 else 0) for y in y_test], dtype=np.int32)

# Define param_grid before using it
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
grid = GridSearchCV(model, param_grid, cv=10, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train_np)
print(grid.best_params_)
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = best_model.score(X_test, y_test_np)
print("Test set accuracy: {:.2f}%".format((accuracy) * 100))

Fitting 10 folds for each of 12 candidates, totalling 120 fits
{'C': 0.001, 'penalty': 'l2'}
Test set accuracy: 51.29%


Random Forest

In [8]:
import numpy as np

rf_model = RandomForestClassifier(
    n_estimators=50,  # Nombre d'arbres
    criterion='gini',    # Critère de split (gini ou entropy)
    max_depth=None,      # Profondeur maximale de l'arbre
    min_samples_split=2, # Nombre minimum d'échantillons requis pour spliter un nœud interne
    min_samples_leaf=2,  # Nombre minimum d'échantillons requis à un nœud feuille
    random_state=42      # Pour la reproductibilité
)

# Convert sparse matrices to dense arrays for cuML compatibility
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Convert y_train and y_test to NumPy arrays with a specified dtype
y_train_np = np.array(y_train, dtype=np.float32)
y_test_np = np.array(y_test, dtype=np.float32)

rf_model.fit(X_train_dense, y_train_np)

y_pred_rf = rf_model.predict(X_test_dense)

accuracy_rf = rf_model.score(X_test_dense, y_test_np)
print(f"Random Forest Accuracy: {accuracy_rf:.2%}")

Random Forest Accuracy: 50.16%


Mixture of Experts

In [9]:
# Define the expert models
expert1 = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

expert2 = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Add another expert for diversification (e.g., an SVM)
expert3 = SVC(probability=True, random_state=42)


# Create the VotingClassifier (Mixture of Experts)
# 'voting='hard'' uses majority voting, 'voting='soft'' uses predicted probabilities
moe_model = VotingClassifier(
    estimators=[('lr', expert1), ('rf', expert2), ('svm', expert3)],
    voting='hard'
)

# Train the Mixture of Experts model
moe_model.fit(X_train, y_train)

# Make predictions with the Mixture of Experts model
y_pred_moe = moe_model.predict(X_test)

# Evaluate the Mixture of Experts model
accuracy_moe = moe_model.score(X_test, y_test)
print(f"Mixture of Experts Accuracy: {accuracy_moe:.2%}")

Mixture of Experts Accuracy: 51.36%


In [12]:
! pip install tensorflow keras

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Define the neural network model
input_layer = Input(shape=(X_train.shape[1],))
dense_layer_1 = Dense(128, activation='relu')(input_layer)
dropout_layer_1 = Dropout(0.5)(dense_layer_1)
dense_layer_2 = Dense(64, activation='relu')(dropout_layer_1)
dropout_layer_2 = Dropout(0.5)(dense_layer_2)
output_layer = Dense(1, activation='sigmoid')(dense_layer_2) # Using sigmoid for binary classification

nn_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
nn_model.compile(optimizer=Adam(learning_rate=0.001),
                 loss='binary_crossentropy', # Using binary crossentropy for binary classification
                 metrics=['accuracy'])

# Since the target variables are -1 and 1, we need to convert them to 0 and 1 for binary crossentropy
y_train_nn = [(1 if y == 1 else 0) for y in y_train]
y_test_nn = [(1 if y == 1 else 0) for y in y_test]

# Train the neural network model
# Convert sparse matrix to dense array for neural network
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Convert NumPy arrays to TensorFlow Tensors
X_train_tensor = tf.convert_to_tensor(X_train_dense, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test_dense, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train_nn, dtype=tf.float32)
y_test_tensor = tf.convert_to_tensor(y_test_nn, dtype=tf.float32)


nn_model.fit(X_train_tensor, y_train_tensor, epochs=50, batch_size=10, verbose=0)

# Evaluate the neural network model
loss, accuracy_nn = nn_model.evaluate(X_test_tensor, y_test_tensor, verbose=0)
print(f"Neural Network Accuracy: {accuracy_nn:.2%}")

# Predict with the neural network model
# The output is probabilities, convert to class labels (-1 or 1)
y_pred_nn_prob = nn_model.predict(X_test_tensor)
y_pred_nn = [(1 if prob > 0.5 else -1) for prob in y_pred_nn_prob]

print("Neural Network Predictions:", y_pred_nn)
print("Actual Test Labels:", y_test)

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting keras
  Downloading keras-3.10.0-py3-none-any.whl.metadata (6.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.

ERROR: Could not install packages due to an OSError: [WinError 5] Accès refusé: 'c:\\users\\matti\\appdata\\local\\programs\\python\\python311\\lib\\site-packages\\numpy.libs\\libscipy_openblas64_-13e2df515630b4a41f92893938845698.dll'
Consider using the `--user` option or check the permissions.



ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# prompt: Do the same with XGBoost

! pip install xgboost

import xgboost as xgb

# XGBoost requires DMatrix for its internal format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost parameters
# Using 'binary:logistic' for binary classification and converting labels to 0/1
# for this loss function.
# The y_train and y_test currently contain -1 and 1. We need to map them to 0 and 1.
# 1 -> 1, -1 -> 0
y_train_xgb = [(1 if y == 1 else 0) for y in y_train]
y_test_xgb = [(1 if y == 1 else 0) for y in y_test]

dtrain = xgb.DMatrix(X_train, label=y_train_xgb)
dtest = xgb.DMatrix(X_test, label=y_test_xgb)


params = {
    'objective': 'binary:logistic',  # Binary classification with logistic regression
    'eval_metric': 'logloss',        # Evaluation metric
    'eta': 0.1,                      # Learning rate
    'max_depth': 3,                  # Maximum depth of trees
    'subsample': 0.8,                # Subsample ratio of the training instance
    'colsample_bytree': 0.8,         # Subsample ratio of columns when constructing each tree
    'seed': 42                       # Random seed for reproducibility
}

# Train the XGBoost model
num_rounds = 100  # Number of boosting rounds
watchlist = [(dtrain, 'train'), (dtest, 'eval')] # Watchlist to monitor performance

xgb_model = xgb.train(params, dtrain, num_rounds, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

# Make predictions
# XGBoost outputs probabilities for binary:logistic
y_pred_xgb_prob = xgb_model.predict(dtest)

# Convert probabilities to class labels (0 or 1)
y_pred_xgb = [1 if prob > 0.5 else 0 for prob in y_pred_xgb_prob]

# Convert back to the original -1, 1 format for comparison with y_test
y_pred_xgb_original = [1 if pred == 1 else -1 for pred in y_pred_xgb]


# Evaluate the model (using scikit-learn's accuracy_score)
from sklearn.metrics import accuracy_score
accuracy_xgb = accuracy_score(y_test_xgb, y_pred_xgb) # Use 0/1 for evaluation

print(f"XGBoost Accuracy: {accuracy_xgb:.2%}")

print("XGBoost Predictions (0/1):", y_pred_xgb)
print("XGBoost Predictions (-1/1):", y_pred_xgb_original)
print("Actual Test Labels (-1/1):", y_test)
print("Actual Test Labels (0/1):", y_test_xgb)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# kNN Classifier
knn_model = KNeighborsClassifier(
    n_neighbors=3,  # Number of neighbors to consider
    weights='uniform', # Weight function used in prediction (uniform or distance)
    algorithm='auto', # Algorithm used to compute the nearest neighbors
    leaf_size=30,     # Leaf size passed to BallTree or KDTree
    p=2,              # Power parameter for the Minkowski metric (p=2 for Euclidean distance)
    metric='minkowski' # Distance metric
)

knn_model.fit(X_train, y_train)

y_pred_knn = knn_model.predict(X_test)

accuracy_knn = knn_model.score(X_test, y_test)
print(f"kNN Accuracy: {accuracy_knn:.2%}")
print("kNN Predictions:", y_pred_knn)
print("Actual Test Labels:", y_test)
param_grid = {'n_neighbors': list(range(1, 31,2))}
grid = GridSearchCV(knn_model, param_grid, cv=10, scoring='MCC', verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)
best_knn = grid.best_estimator_
y_pred = best_knn.predict(X_test)
print("Test set accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

In [None]:
# Support Vector Machine (SVM)
svm_model = SVC(
    C=1.0,                 # Régularisation (plus C est grand, moins la régularisation est forte)
    kernel='linear',       # Type de noyau ('linear', 'poly', 'rbf', 'sigmoid')
    gamma='scale',         # Coefficient de noyau pour 'rbf', 'poly', 'sigmoid'
    probability=True,      # Permet de calculer les probabilités de classe
    random_state=42        # Pour la reproductibilité
)

svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

accuracy_svm = svm_model.score(X_test, y_test)
print(f"SVM Accuracy: {accuracy_svm:.2%}")
print("SVM Predictions:", y_pred_svm)
print("Actual Test Labels:", y_test)