In [16]:
import os
import sys

# Make the sibling ``tokenizer`` directory importable. The location is resolved
# relative to the kernel's current working directory.
current_dir = os.getcwd()
module_path = os.path.abspath(os.path.join(current_dir, '..', 'tokenizer'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if module_path not in sys.path:
    sys.path.append(module_path)

from tokenizerv2 import TokenizerV2


In [None]:
import numpy as np
from datasets import load_dataset

# Locate the saved feature arrays next to the notebook's working directory.
# Paths get their own names so ``train_data`` / ``test_data`` only ever refer
# to the loaded arrays.
data_dir = os.path.join(current_dir, '..', 'data')
train_path = os.path.join(data_dir, 'tf_idf_train.npy')
test_path = os.path.join(data_dir, 'tf_idf_test.npy')

# Load train / test numpy arrays
train_data = np.load(train_path)
test_data = np.load(test_path)

# Fully labeled dataset
ds = load_dataset("stanfordnlp/imdb")

# Load labels
train_labels = np.array(ds['train']['label'])
test_labels = np.array(ds['test']['label'])

# Fail fast if features and labels do not line up; a silent mismatch would
# otherwise surface later as a reshape error or a meaningless accuracy.
# NOTE(review): this only checks counts - it assumes the .npy rows were saved
# in the same order as the dataset splits; confirm against the export step.
assert train_data.shape[0] == train_labels.shape[0], "train features/labels length mismatch"
assert test_data.shape[0] == test_labels.shape[0], "test features/labels length mismatch"

In [33]:
import numpy as np
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Args:
        x: Real scalar or array-like.

    Returns:
        The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    # exp(-|x|) always lies in [0, 1], so it can never overflow, unlike
    # exp(-x) for large negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)        x < 0: e^x / (1 + e^x)  (same value)
    result = np.where(np.asarray(x) >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; indexing with () unwraps it
    # to a NumPy scalar and leaves n-d arrays as they are.
    return result[()]

def initialisation(dim):
    """Create the zero-valued starting parameters of the model.

    Args:
        dim: Shape handed to ``np.zeros`` for the weights (an int gives a
            1-D vector of that length).

    Returns:
        tuple: ``(w, b)`` - a float32 array of zeros and the bias ``0.0``.
    """
    zero_weights = np.zeros(shape=dim, dtype=np.float32)
    return zero_weights, 0.0

In [38]:
def logistic_regression(X, y, iterations=2000, learning_rate=0.1, log_every=100, verbose=True):
    """Fit a binary logistic-regression model with full-batch gradient descent.

    Args:
        X: 2-D array of shape ``(m, num_features)``, one sample per row.
        y: Array-like holding ``m`` binary labels (0/1); flattened to ``(m,)``.
        iterations: Number of gradient-descent steps to run.
        learning_rate: Step size applied to the weight and bias gradients.
        log_every: Record metrics every ``log_every`` iterations, and always on
            the last iteration.
        verbose: When true, print a progress line for each recorded iteration.

    Returns:
        tuple: ``(params, history)``. ``params`` is ``{'w': ..., 'b': ...}``;
        ``history`` is ``{'costs': [...], 'accuracies': [...]}`` with one entry
        per recorded iteration. Recorded metrics are computed from the forward
        pass of that iteration, i.e. before its parameter update.

    Raises:
        ValueError: If ``log_every`` is smaller than 1, or if ``y`` does not
            contain exactly ``m`` elements.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    m, num_features = X.shape
    y = np.asarray(y).reshape(m)
    w, b = initialisation(num_features)
    costs = []
    accuracies = []
    epsilon = 1e-15  # keeps log() finite when an activation saturates at 0 or 1

    for i in range(iterations):
        # Forward pass.
        z = np.dot(X, w) + b
        a = sigmoid(z)

        # The cross-entropy is only needed for reporting, so it is evaluated on
        # recorded iterations instead of on every step.
        if i % log_every == 0 or i == iterations - 1:
            cost = (-1 / m) * np.sum(y * np.log(a + epsilon) + (1 - y) * np.log(1 - a + epsilon))
            predictions = (a >= 0.5).astype(int)  # turn probabilities into 0/1 classes
            accuracy = np.mean(predictions == y)  # share of samples matching the true labels

            costs.append(cost)
            accuracies.append(accuracy)

            if verbose:
                print(f"Iteration {i}: Cost {cost:.6f} - Training Accuracy: {accuracy * 100:.2f}%")

        # Backward pass and parameter update.
        dw = (1 / m) * np.dot(X.T, (a - y))
        db = (1 / m) * np.sum(a - y)
        w -= learning_rate * dw
        b -= learning_rate * db

    params = {
        'w': w,
        'b': b
    }
    history = {
        'costs': costs,
        'accuracies': accuracies
    }
    return params, history

In [None]:
# logistic_regression returns (params, history); ``history`` carries both the
# recorded costs and the recorded accuracies, so it is named accordingly.
params, history = logistic_regression(train_data, train_labels, iterations=2000, learning_rate=0.1)
costs = history  # alias kept so code that still reads ``costs`` keeps working

Iteration 0: Cost 0.693147 - Training Accuracy: 50.00%
Iteration 100: Cost 0.688271 - Training Accuracy: 75.19%
Iteration 200: Cost 0.683544 - Training Accuracy: 75.31%
Iteration 300: Cost 0.678960 - Training Accuracy: 75.39%
Iteration 400: Cost 0.674512 - Training Accuracy: 75.56%
Iteration 500: Cost 0.670195 - Training Accuracy: 75.66%
Iteration 600: Cost 0.666002 - Training Accuracy: 75.73%
Iteration 700: Cost 0.661930 - Training Accuracy: 75.78%
Iteration 800: Cost 0.657972 - Training Accuracy: 75.88%
Iteration 900: Cost 0.654125 - Training Accuracy: 75.96%


In [41]:
def logistic_regression_predict(X, params, y=None):
    """Predict binary classes for ``X`` and report accuracy against labels.

    Args:
        X: Feature array whose rows are scored with ``np.dot(X, w) + b``.
        params: Dict with the weights under ``'w'`` and the bias under ``'b'``.
        y: Optional array-like of true 0/1 labels for ``X``. When omitted, the
            notebook-level ``test_labels`` array is used, which is what this
            function always did before the parameter existed.

    Returns:
        tuple: ``(predictions, accuracy)`` - the 0/1 predictions as an int
        array and the fraction of them that match the labels.

    Raises:
        ValueError: If the number of labels differs from the number of
            predictions.
    """
    w = params['w']
    b = params['b']
    if y is None:
        # Backward-compatible default: score against the global test labels.
        y = test_labels
    labels = np.asarray(y)

    probabilities = sigmoid(np.dot(X, w) + b)
    predictions = (probabilities >= 0.5).astype(int)

    # Without this check a length mismatch would broadcast (or compare to a
    # single False) and yield a meaningless accuracy instead of an error.
    if predictions.size != labels.size:
        raise ValueError(
            f"got {predictions.size} predictions but {labels.size} labels"
        )

    accuracy = np.mean(predictions.reshape(-1) == labels.reshape(-1))
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return predictions, accuracy
# Evaluate the trained parameters on the test feature matrix.
predictions, accuracy = logistic_regression_predict(X=test_data, params=params)


Test Accuracy: 74.50%
