# KNN

In [None]:
from collections import Counter

import numpy as np


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The two arguments are subtracted element-wise, so they must be NumPy
    arrays (or scalars) of broadcast-compatible shape.
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)


class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote)."""

    def __init__(self, k=3):
        # Number of nearest training samples that take part in the vote.
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        Both are coerced to NumPy arrays so that plain (nested) lists are
        accepted as well.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Return an array with the predicted label of every sample in ``X``."""
        return np.array([self._predict(x) for x in np.asarray(X)])

    def _predict(self, x):
        """Predict the label of the single sample ``x``."""
        # Distances from x to every training sample in one vectorised pass.
        # Summing over all non-sample axes also covers 1-D training data
        # (one scalar feature per sample).
        squared = (self.X_train - x) ** 2
        distances = np.sqrt(squared.sum(axis=tuple(range(1, squared.ndim))))
        # Indices of the k closest training samples
        k_idx = np.argsort(distances)[: self.k]
        # Labels of those neighbours, as plain Python values for counting
        k_neighbor_labels = self.y_train[k_idx].tolist()
        # The most frequent label wins the vote
        most_common = Counter(k_neighbor_labels).most_common(1)
        return most_common[0][0]


if __name__ == "__main__":
    # Demo: train the KNN classifier on the iris dataset and print its
    # accuracy on a held-out 20% test split.
    # Imports
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    # Three-colour map (red, green, blue); not used further in this cell.
    cmap = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

    def accuracy(y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    # Fixed random_state so the split is the same on every run
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    k = 3
    clf = KNN(k=k)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print("KNN classification accuracy", accuracy(y_test, predictions))

In [None]:
# Re-run of the KNN experiment; relies on X, y, train_test_split, KNN and
# accuracy created by the previous cell.
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = KNN(k=3)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# Accuracy computed inline, then once more through the accuracy() helper
acc = np.sum(predictions == y_test) / len(y_test)
print(f"KNN classification accuracy: {acc}")
print("KNN classification accuracy", accuracy(y_test, predictions))

# Linear Regression 

In [None]:
import numpy as np


def r2_score(y_true, y_pred):
    """Return the coefficient of determination R^2 of ``y_pred`` against ``y_true``.

    R^2 is ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` the sum of squared deviations of ``y_true`` from
    its mean. Unlike the squared correlation coefficient, this penalises
    predictions that are correlated with the target but biased or mis-scaled,
    and it can be negative for a model worse than predicting the mean.

    When ``y_true`` is constant (``SS_tot == 0``) the score is 1.0 for a
    perfect prediction and 0.0 otherwise.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Ratio is undefined for a constant target
        return 1.0 if ss_res == 0 else 0.0
    return 1 - ss_res / ss_tot


class LinearRegression:
    """Linear model ``y = X @ weights + bias`` fitted by batch gradient descent."""

    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        # Parameters are created by fit()
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (samples x features) and targets ``y``."""
        sample_count, feature_count = X.shape
        scale = 1 / sample_count

        # Start from the all-zero model
        self.weights = np.zeros(feature_count)
        self.bias = 0

        for _ in range(self.n_iters):
            # Residuals of the current model on the training data
            residuals = (np.dot(X, self.weights) + self.bias) - y

            # Gradients of the mean squared error (up to a factor of 2)
            grad_w = scale * np.dot(X.T, residuals)
            grad_b = scale * np.sum(residuals)

            # Step against the gradient
            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return the model output for every row of ``X``."""
        return np.dot(X, self.weights) + self.bias


# Testing
if __name__ == "__main__":
    # Demo: fit the gradient-descent LinearRegression on a synthetic
    # one-feature regression problem, report errors and plot the fitted line.
    # Imports
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    def mean_squared_error(y_true, y_pred):
        """Return the mean of the squared differences between the two arrays."""
        return np.mean((y_true - y_pred) ** 2)

    X, y = datasets.make_regression(
        n_samples=100, n_features=1, noise=20, random_state=4
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    regressor = LinearRegression(learning_rate=0.01, n_iters=1000)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    print("MSE:", mse)

    # Value of r2_score (defined above) on the test split, printed under the
    # label "Accuracy:"
    accu = r2_score(y_test, predictions)
    print("Accuracy:", accu)

    # Plot train points, test points and the model's prediction for all of X
    y_pred_line = regressor.predict(X)
    cmap = plt.get_cmap("viridis")
    fig = plt.figure(figsize=(8, 6))
    m1 = plt.scatter(X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(X_test, y_test, color=cmap(0.5), s=10)
    plt.plot(X, y_pred_line, color="black", linewidth=2, label="Prediction")
    plt.show()

In [None]:
# Re-run of the regression experiment with the default hyper-parameters of
# LinearRegression; relies on datasets, train_test_split and
# mean_squared_error created by the previous cell.
X,y = datasets.make_regression(n_samples=100, n_features=1, noise=20, random_state=4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)
# The error is stored in mse but not printed here
mse = mean_squared_error(y_test, predictions)

# Logistic Regression

In [None]:
import numpy as np


class LogisticRegression:
    """Binary logistic regression fitted by batch gradient descent."""

    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        # Parameters are created by fit()
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (samples x features) and 0/1 labels ``y``."""
        n_samples, n_features = X.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # gradient descent
        for _ in range(self.n_iters):
            # linear score of every sample, squashed into (0, 1)
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self._sigmoid(linear_model)

            # gradients of the mean log-loss
            error = y_predicted - y
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return the predicted class (0 or 1) of every row of ``X``.

        A sample is assigned class 1 when its predicted probability is
        strictly greater than 0.5.
        """
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        return np.where(y_predicted > 0.5, 1, 0)

    def _sigmoid(self, x):
        """Return ``1 / (1 + exp(-x))`` without overflowing for large ``|x|``."""
        x = np.asarray(x, dtype=float)
        # exp is only ever evaluated on non-positive values, so it cannot
        # overflow: for x >= 0 use 1 / (1 + e^-x), for x < 0 the algebraically
        # equal e^x / (1 + e^x).
        z = np.exp(-np.abs(x))
        # [()] turns a 0-d result back into a scalar and leaves arrays as is
        return np.where(x >= 0, 1 / (1 + z), z / (1 + z))[()]


# Testing
if __name__ == "__main__":
    # Demo: train LogisticRegression on the breast-cancer dataset and print
    # its accuracy on a held-out 20% test split.
    # Imports
    from sklearn.model_selection import train_test_split
    from sklearn import datasets

    def accuracy(y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    bc = datasets.load_breast_cancer()
    X, y = bc.data, bc.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    # The features are passed to the model as loaded, without rescaling
    regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    print("LR classification accuracy:", accuracy(y_test, predictions))

In [None]:
# Re-run of the logistic-regression experiment; relies on X, y and
# train_test_split created by the previous cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
regressor = LogisticRegression(learning_rate=0.0001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)
# Accuracy = share of test labels predicted correctly
acc = np.sum(predictions == y_test) / len(y_test)
print(f"LR classification accuracy: {acc}")

# PCA

In [None]:
import numpy as np


class PCA:
    """Principal component analysis via eigen-decomposition of the covariance."""

    def __init__(self, n_components):
        self.n_components = n_components
        # Set by fit(): rows of ``components`` are the principal axes
        self.components = None
        self.mean = None

    def fit(self, X):
        """Learn the feature means and the top ``n_components`` principal axes.

        ``X`` is a (samples x features) array. The sign of each axis is
        arbitrary, as for any eigenvector.
        """
        X = np.asarray(X, dtype=float)

        # Mean centering
        self.mean = np.mean(X, axis=0)
        X = X - self.mean

        # np.cov expects variables as rows, hence the transpose; atleast_2d
        # keeps the single-feature case (0-d covariance) decomposable.
        cov = np.atleast_2d(np.cov(X.T))

        # A covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas the general eig may return
        # complex values caused by round-off.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)

        # Eigenvectors are the columns; pick those of the largest eigenvalues
        # and store them as rows.
        order = np.argsort(eigenvalues)[::-1]
        self.components = eigenvectors[:, order[: self.n_components]].T

    def transform(self, X):
        """Project ``X`` onto the principal axes learned by fit()."""
        X = X - self.mean
        return np.dot(X, self.components.T)


# Testing
if __name__ == "__main__":
    # Demo: project the iris data onto its first two principal components
    # and show the result as a scatter plot coloured by class.
    # Imports
    import matplotlib.pyplot as plt
    from sklearn import datasets

    # data = datasets.load_digits()
    data = datasets.load_iris()
    X = data.data
    y = data.target

    # Project the data onto the 2 primary principal components
    pca = PCA(2)
    pca.fit(X)
    X_projected = pca.transform(X)

    print("Shape of X:", X.shape)
    print("Shape of transformed X:", X_projected.shape)

    x1 = X_projected[:, 0]
    x2 = X_projected[:, 1]

    # plt.get_cmap replaces plt.cm.get_cmap, which no longer exists in recent
    # matplotlib releases; the second argument resamples the map to 3 colours.
    plt.scatter(
        x1, x2, c=y, edgecolor="none", alpha=0.8, cmap=plt.get_cmap("viridis", 3)
    )

    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.colorbar()
    plt.show()

# Résumé de cours

## Reg Lin Simple

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class LinearRegression:
    """One-feature linear regression on 'Sales', trained by batch gradient descent.

    The model is ``X.dot(theta)`` where column 0 of ``X`` is the standardized
    feature and column 1 is a constant 1, so ``theta[1]`` is the intercept.
    """

    def __init__(self, learning_rate=0.00002, n_iterations=10000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        # Set by prepare_data() / gradient_descent()
        self.theta = None
        self.cost_history = None

    def load_data(self, filepath):
        """Read the CSV at ``filepath`` into ``self.dataset`` and return it."""
        self.dataset = pd.read_csv(filepath)
        return self.dataset

    def visualize_data(self):
        """Show a correlation heatmap of the non-Sales columns and one
        scatter plot of each advertising channel against Sales."""
        numeric_columns = self.dataset.drop(columns=['Sales'])
        plt.figure()
        sns.heatmap(numeric_columns.corr(), annot=True)
        plt.show()

        plt.figure(figsize=(18, 5))
        for i, feature in enumerate(['TV', 'Radio', 'Newspaper'], 1):
            plt.subplot(1, 3, i)
            plt.scatter(self.dataset[feature], self.dataset['Sales'])
            plt.title(f'{feature} vs Sales')
            plt.xlabel(feature)
            plt.ylabel('Sales')
        plt.show()

    @staticmethod
    def _add_bias_column(X):
        """Return ``X`` with a trailing column of ones (intercept term)."""
        return np.hstack((X, np.ones((X.shape[0], 1))))

    def prepare_data(self, feature):
        """Build the train/test design matrices for the column ``feature``.

        The data is split first and the scaler is fitted on the training
        part only, so no statistic of the test set leaks into training. The
        fitted scaler is kept on ``self.scaler`` so that new raw values can
        be standardized the same way. ``theta`` is re-drawn at random.
        """
        self.y = self.dataset['Sales'].values.reshape(-1, 1)
        raw_X = self.dataset[feature].values.reshape(-1, 1)

        raw_train, raw_test, self.y_train, self.y_test = train_test_split(raw_X, self.y, test_size=0.2, random_state=42)

        # Feature standardization, learned from the training split only
        self.scaler = StandardScaler()
        self.scaler.fit(raw_train)

        # Standardize and append the bias column
        self.X = self._add_bias_column(self.scaler.transform(raw_X))
        self.X_train = self._add_bias_column(self.scaler.transform(raw_train))
        self.X_test = self._add_bias_column(self.scaler.transform(raw_test))

        self.theta = np.random.randn(2, 1)

    def model(self, X):
        """Return the model output ``X.dot(theta)``."""
        return X.dot(self.theta)

    def cost_function(self, X, y):
        """Return half the mean squared error of the model on ``(X, y)``."""
        m = len(y)
        return 1 / (2 * m) * np.sum((self.model(X) - y) ** 2)

    def grad(self, X, y):
        """Return the gradient of cost_function with respect to theta."""
        m = len(y)
        return 1 / m * X.T.dot(self.model(X) - y)

    def gradient_descent(self):
        """Run ``n_iterations`` gradient steps on the training split.

        Returns ``(theta, cost_history)``; the cost is recorded after each
        update.
        """
        self.cost_history = np.zeros(self.n_iterations)
        for i in range(self.n_iterations):
            self.theta -= self.learning_rate * self.grad(self.X_train, self.y_train)
            self.cost_history[i] = self.cost_function(self.X_train, self.y_train)
        return self.theta, self.cost_history

    def train(self):
        """Train the model; same as gradient_descent()."""
        return self.gradient_descent()

    def predict(self, X):
        """Return predictions for an already standardized, bias-augmented ``X``."""
        return self.model(X)

    def plot_results(self):
        """Plot the fitted line over the training data next to the cost history."""
        plt.figure(figsize=(18, 5))
        plt.subplot(1, 2, 1)
        plt.plot(self.X_train[:, 0], self.y_train, 'o', label='Training set')
        plt.plot(self.X_train[:, 0], self.model(self.X_train), 'r-', label='Linear model')
        plt.xlabel('Feature')
        plt.ylabel('Sales')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(range(self.n_iterations), self.cost_history)
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Cost Function History')
        plt.show()

    def evaluate(self):
        """Print the coefficient of determination on the train and test splits."""
        def coef_determination(y, pred):
            u = ((y - pred) ** 2).sum()
            v = ((y - y.mean()) ** 2).sum()
            return 1 - u / v

        y_train_pred = self.predict(self.X_train)
        y_test_pred = self.predict(self.X_test)

        print(f"Training R^2: {coef_determination(self.y_train, y_train_pred)}")
        print(f"Test R^2: {coef_determination(self.y_test, y_test_pred)}")

# Use the class with the TV feature: load, visualize, prepare, train, plot, evaluate
regression = LinearRegression()
dataset = regression.load_data("Advertising.csv")
regression.visualize_data()
regression.prepare_data('TV')
theta, cost_history = regression.train()
regression.plot_results()
regression.evaluate()

# Use the class with the Newspaper feature (prepare_data re-initializes theta)
regression.prepare_data('Newspaper')
theta, cost_history = regression.train()
regression.plot_results()
regression.evaluate()

# Use the class with the Radio feature
regression.prepare_data('Radio')
theta, cost_history = regression.train()
regression.plot_results()
regression.evaluate()


## Reg Lin Multiple

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class LinearRegressionMultiple:
    """Linear regression of 'Sales' on TV, Radio and Newspaper, trained by
    batch gradient descent.

    The model is ``X.dot(theta)`` where the first three columns of ``X`` are
    the standardized features and the last column is a constant 1, so the
    last entry of ``theta`` is the intercept.
    """

    def __init__(self, learning_rate=0.00004, n_iterations=1000000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        # Set by prepare_data() / gradient_descent()
        self.theta = None
        self.cost_history = None

    def load_data(self, filepath):
        """Read the CSV at ``filepath`` into ``self.dataset`` and return it."""
        self.dataset = pd.read_csv(filepath)
        return self.dataset

    def visualize_data(self):
        """Show a correlation heatmap of the non-Sales columns and one
        scatter plot of each advertising channel against Sales."""
        numeric_columns = self.dataset.drop(columns=['Sales'])
        plt.figure()
        sns.heatmap(numeric_columns.corr(), annot=True)
        plt.show()

        plt.figure(figsize=(18, 5))
        features = ['TV', 'Radio', 'Newspaper']
        for i, feature in enumerate(features, 1):
            plt.subplot(1, 3, i)
            plt.scatter(self.dataset[feature], self.dataset['Sales'])
            plt.title(f'{feature} vs Sales')
            plt.xlabel(feature)
            plt.ylabel('Sales')
        plt.show()

    @staticmethod
    def _add_bias_column(X):
        """Return ``X`` with a trailing column of ones (intercept term)."""
        return np.hstack((X, np.ones((X.shape[0], 1))))

    def prepare_data(self):
        """Build the train/test design matrices from the three channels.

        The data is split first and the scaler is fitted on the training
        part only, so no statistic of the test set leaks into training. The
        fitted scaler is kept on ``self.scaler`` so that new raw values can
        be standardized the same way. ``theta`` is drawn at random.
        """
        self.y = self.dataset['Sales'].values.reshape(-1, 1)
        raw_X = self.dataset[['TV', 'Radio', 'Newspaper']].values

        raw_train, raw_test, self.y_train, self.y_test = train_test_split(raw_X, self.y, test_size=0.2, random_state=42)

        # Feature standardization, learned from the training split only
        self.scaler = StandardScaler()
        self.scaler.fit(raw_train)

        # Standardize and append the bias column
        self.X = self._add_bias_column(self.scaler.transform(raw_X))
        self.X_train = self._add_bias_column(self.scaler.transform(raw_train))
        self.X_test = self._add_bias_column(self.scaler.transform(raw_test))

        self.theta = np.random.randn(self.X_train.shape[1], 1)

    def model(self, X):
        """Return the model output ``X.dot(theta)``."""
        return X.dot(self.theta)

    def cost_function(self, X, y):
        """Return half the mean squared error of the model on ``(X, y)``."""
        m = len(y)
        return 1 / (2 * m) * np.sum((self.model(X) - y) ** 2)

    def grad(self, X, y):
        """Return the gradient of cost_function with respect to theta."""
        m = len(y)
        return 1 / m * X.T.dot(self.model(X) - y)

    def gradient_descent(self):
        """Run ``n_iterations`` gradient steps on the training split.

        Returns ``(theta, cost_history)``; the cost is recorded after each
        update.
        """
        self.cost_history = np.zeros(self.n_iterations)
        for i in range(self.n_iterations):
            self.theta -= self.learning_rate * self.grad(self.X_train, self.y_train)
            self.cost_history[i] = self.cost_function(self.X_train, self.y_train)
        return self.theta, self.cost_history

    def train(self):
        """Train the model; same as gradient_descent()."""
        return self.gradient_descent()

    def predict(self, X):
        """Return predictions for an already standardized, bias-augmented ``X``."""
        return self.model(X)

    def plot_results(self):
        """Show a 3-D scatter of the training features coloured by Sales,
        then the cost history."""
        fig = plt.figure(figsize=(18, 5))
        ax = fig.add_subplot(111, projection='3d')
        sc = ax.scatter(self.X_train[:, 0], self.X_train[:, 1], self.X_train[:, 2], c=self.y_train, cmap='viridis')
        plt.colorbar(sc, label='Sales')
        ax.set_xlabel('TV')
        ax.set_ylabel('Radio')
        ax.set_zlabel('Newspaper')
        plt.show()

        plt.plot(range(self.n_iterations), self.cost_history)
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Cost Function History')
        plt.show()

    def evaluate(self):
        """Print the coefficient of determination on the train and test splits."""
        def coef_determination(y, pred):
            u = ((y - pred) ** 2).sum()
            v = ((y - y.mean()) ** 2).sum()
            return 1 - u / v

        y_train_pred = self.predict(self.X_train)
        y_test_pred = self.predict(self.X_test)

        print(f"Training R^2: {coef_determination(self.y_train, y_train_pred)}")
        print(f"Test R^2: {coef_determination(self.y_test, y_test_pred)}")

# Use the class: load the data, visualize it, prepare the design matrices,
# train with the default hyper-parameters, then plot and evaluate.
regression_multiple = LinearRegressionMultiple()
dataset = regression_multiple.load_data("Advertising.csv")
regression_multiple.visualize_data()
regression_multiple.prepare_data()
theta_final, cost_history = regression_multiple.train()
regression_multiple.plot_results()
regression_multiple.evaluate()


## Reg Poly Univariée

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Load the Iris dataset
dataset = pd.read_csv("iris.csv")

# Visualize the data: sepal_width as a function of petal_length
plt.plot(dataset["petal_length"], dataset["sepal_width"], 'o')
plt.xlabel('petal_length')
plt.ylabel('sepal_width')
plt.show()

# Prepare the data as column vectors
x = dataset.petal_length.values.reshape(-1, 1)
y = dataset.sepal_width.values.reshape(-1, 1)

# Standardize the feature (the scaler is fitted on the whole dataset)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Add the polynomial term and a bias column: columns are [x^2, x, 1]
X_poly = np.hstack((x_scaled**2, x_scaled, np.ones(x_scaled.shape)))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Initialize the model parameters (seeded, so the start point is reproducible)
np.random.seed(0)
theta = np.random.randn(X_train.shape[1], 1)

# Model, cost, gradient and gradient-descent functions
def model(X, theta):
    """Return the linear model output ``X.dot(theta)``."""
    return X.dot(theta)

def cost_function(X, y, theta):
    """Return half the mean squared error of the model on ``(X, y)``."""
    m = len(y)
    return 1 / (2 * m) * np.sum((model(X, theta) - y) ** 2)

def grad(X, y, theta):
    """Return the gradient of cost_function with respect to ``theta``."""
    m = len(y)
    return 1 / m * X.T.dot(model(X, theta) - y)

def gradient_descent(X, y, theta, learning_rate, n_iterations):
    """Run ``n_iterations`` gradient steps starting from ``theta``.

    Returns ``(theta_final, cost_history)`` where ``cost_history[i]`` is the
    cost after update ``i``. The starting vector passed in is left untouched.
    """
    # Work on a private float copy so the caller's array is not modified
    theta = np.array(theta, dtype=float)
    cost_history = np.zeros(n_iterations)
    for i in range(n_iterations):
        theta -= learning_rate * grad(X, y, theta)
        cost_history[i] = cost_function(X, y, theta)
    return theta, cost_history

# Training phase
n_iterations = 40000
learning_rate = 0.004
theta_final, cost_history = gradient_descent(X_train, y_train, theta, learning_rate, n_iterations)

# Plot the fitted curve over the whole dataset; the points are sorted by x so
# that the curve is drawn left to right instead of zig-zagging.
order = np.argsort(x[:, 0])
plt.plot(x, y, 'o', label='dataset')
plt.plot(x[order], model(X_poly, theta_final)[order], c='r', label='modèle final')
plt.xlabel('petal_length')
plt.ylabel('sepal_width')
plt.legend()
plt.show()

# Predictions on the test set
predictions = model(X_test, theta_final)
# Column 1 of X_test is the standardized petal_length; map it back to the
# original scale for the x axis (x_test was previously undefined).
x_test = scaler.inverse_transform(X_test[:, [1]])
test_order = np.argsort(x_test[:, 0])
plt.plot(x_test, y_test, 'o', label='dataset test')
plt.plot(x_test[test_order], predictions[test_order], c='r', label='prédictions')
plt.xlabel('petal_length')
plt.ylabel('sepal_width')
plt.legend()
plt.show()

# Learning curve
plt.plot(range(n_iterations), cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost Function History')
plt.show()

# Model evaluation - coefficient of determination
def coef_determination(y, pred):
    """Return R^2 = 1 - SS_res / SS_tot of the predictions ``pred`` for ``y``."""
    residual_ss = np.square(y - pred).sum()
    total_ss = np.square(y - y.mean()).sum()
    return 1 - residual_ss / total_ss

# Coefficient of determination on the test set
print(f"Test R^2: {coef_determination(y_test, predictions)}")

# Summary of the dataset (DataFrame.info output)
dataset.info()


## Reg Poly Multivariée

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

class PolynomialRegressionModel:
    """Polynomial regression of one target column on one feature column.

    The degree is chosen with a BIC criterion using scikit-learn's linear
    regression; the final model is trained by gradient descent on
    standardized polynomial features, optionally with an L1 or L2 penalty.
    """

    def __init__(self, data_path, target_col, feature_col, max_degree=17, test_size=0.3, random_state=101):
        self.data_path = data_path
        self.target_col = target_col
        self.feature_col = feature_col
        self.max_degree = max_degree
        self.test_size = test_size
        self.random_state = random_state
        # The data is loaded and split as part of construction
        self.load_data()

    def load_data(self):
        """Read the CSV and build the train/test split of feature and target."""
        self.df = pd.read_csv(self.data_path)
        self.x = self.df[[self.feature_col]].values
        self.y = self.df[[self.target_col]].values.reshape(-1, 1)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.x, self.y, test_size=self.test_size, random_state=self.random_state)
        self.X_train = np.array(self.X_train).reshape(-1, 1)
        self.X_test = np.array(self.X_test).reshape(-1, 1)

    def plot_corr_matrix(self):
        """Show the correlation matrix of the loaded DataFrame as a heatmap."""
        plt.figure()
        sns.heatmap(self.df.corr(), annot=True)
        plt.show()

    def plot_data(self):
        """Scatter the target against the feature."""
        # Feature on the x axis and target on the y axis, matching the labels
        plt.plot(self.x, self.y, 'o')
        plt.xlabel(self.feature_col)
        plt.ylabel(self.target_col)
        plt.show()

    def select_best_polynomial_degree(self):
        """Fit degrees 1..max_degree and keep the one with the lowest BIC.

        Sets ``best_model_idx``, ``best_model`` and ``best_degree`` and plots
        the BIC value of every degree.
        """
        bic_values = []
        models = []
        n_train = len(self.X_train)

        for degree in range(1, self.max_degree + 1):
            poly_features = PolynomialFeatures(degree=degree)
            X_train_poly = poly_features.fit_transform(self.X_train)
            candidate = LinearRegression()
            candidate.fit(X_train_poly, self.y_train)
            num_params = X_train_poly.shape[1]
            y_train_pred = candidate.predict(X_train_poly)
            # Log-likelihood computed from the training MSE, then penalized
            # by the number of parameters
            log_likelihood = -0.5 * n_train * np.log(mean_squared_error(self.y_train, y_train_pred))
            bic = -2 * log_likelihood + num_params * np.log(n_train)
            bic_values.append(bic)
            models.append(candidate)

        self.best_model_idx = np.argmin(bic_values)
        self.best_model = models[self.best_model_idx]
        # Degrees start at 1 while list indices start at 0
        self.best_degree = self.best_model_idx + 1

        plt.plot(range(1, self.max_degree + 1), bic_values, marker='o')
        plt.xlabel('Polynomial Degree')
        plt.ylabel('BIC Value')
        plt.title('BIC Values vs. Polynomial Degree')
        plt.xticks(range(1, self.max_degree + 1))
        plt.grid(True)
        plt.show()

    def prepare_polynomial_features(self, degree):
        """Build standardized polynomial design matrices and a random theta.

        The test set is transformed and standardized with the statistics of
        the training set. A column of ones is appended last, so the last
        entry of theta is the intercept.
        """
        # No bias column here: a constant column cannot be z-scored, and the
        # intercept column is appended explicitly after standardization.
        poly_features = PolynomialFeatures(degree=degree, include_bias=False)
        self.X_train_poly = poly_features.fit_transform(self.X_train)
        self.X_test_poly = poly_features.transform(self.X_test)
        self.X_train_poly_norm, self.mean, self.std = self.z_score_normalization(self.X_train_poly)
        self.X_test_poly_norm = (self.X_test_poly - self.mean) / self.std
        self.X_train_poly_norm = np.hstack((self.X_train_poly_norm, np.ones((self.X_train_poly.shape[0], 1))))
        self.X_test_poly_norm = np.hstack((self.X_test_poly_norm, np.ones((self.X_test_poly.shape[0], 1))))
        self.theta = np.random.randn(self.X_train_poly_norm.shape[1], 1)

    def z_score_normalization(self, data):
        """Standardize the columns of ``data``.

        Returns ``(data_normalized, mean, std)``. Columns with zero spread
        are given a divisor of 1 (also in the returned ``std``), so they
        become all-zero instead of NaN.
        """
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        std = np.where(std == 0, 1.0, std)
        data_normalized = (data - mean) / std
        return data_normalized, mean, std

    def train_model(self, learning_rate=0.05, n_iterations=10000):
        """Train the unregularized model; sets ``theta_final`` and ``cost_history``."""
        self.theta_final, self.cost_history = self.gradient_descent(self.X_train_poly_norm, self.y_train, self.theta, learning_rate, n_iterations)

    def model(self, X, theta):
        """Return the linear model output ``X.dot(theta)``."""
        return X.dot(theta)

    def cost_function(self, X, y, theta):
        """Return half the mean squared error of the model on ``(X, y)``."""
        m = len(y)
        return 1 / (2 * m) * np.sum((self.model(X, theta) - y) ** 2)

    def grad(self, X, y, theta):
        """Return the gradient of cost_function with respect to ``theta``."""
        m = len(y)
        return 1 / m * X.T.dot(self.model(X, theta) - y)

    def gradient_descent(self, X, y, theta, learning_rate, n_iterations):
        """Run ``n_iterations`` gradient steps starting from ``theta``.

        Returns ``(theta_final, cost_history)``. The starting vector is
        copied, so ``self.theta`` keeps the initial values and the
        regularized runs start from the same point as the plain one.
        """
        theta = np.array(theta, dtype=float)
        cost_history = np.zeros(n_iterations)
        for i in range(n_iterations):
            theta -= learning_rate * self.grad(X, y, theta)
            cost_history[i] = self.cost_function(X, y, theta)
        return theta, cost_history

    def plot_learning_curve(self):
        """Plot the cost recorded at every iteration of train_model()."""
        plt.plot(range(len(self.cost_history)), self.cost_history)
        plt.xlabel('Iterations')
        plt.ylabel('Cost')
        plt.title('Learning Curve')
        plt.show()

    @staticmethod
    def _coef_determination(y, pred):
        """Return R^2 = 1 - SS_res / SS_tot of the predictions ``pred`` for ``y``."""
        u = ((y - pred) ** 2).sum()
        v = ((y - y.mean()) ** 2).sum()
        return 1 - u / v

    def evaluate_model(self):
        """Compute and print R^2 of the unregularized model on both splits."""
        self.cd_train = self._coef_determination(self.y_train, self.model(self.X_train_poly_norm, self.theta_final))
        self.cd_test = self._coef_determination(self.y_test, self.model(self.X_test_poly_norm, self.theta_final))

        print("Coefficient de détermination de training set =", self.cd_train)
        print("Coefficient de détermination de test set =", self.cd_test)

    def l1_regularization(self, lambda_, learning_rate=0.05, n_iterations=10000):
        """Train with an L1 penalty; sets ``theta_final_L1`` and ``cost_history_L1``."""
        def cost_function_L1(X, y, theta, lambda_):
            m = len(y)
            error = self.model(X, theta) - y
            regularization_term = lambda_ * np.sum(np.abs(theta))
            cost = (1 / (2 * m)) * np.sum(error ** 2) + regularization_term
            return cost

        def grad_L1(X, y, theta, lambda_):
            m = len(y)
            error = self.model(X, theta) - y
            # Sub-gradient of lambda * |theta|
            regularization_term = lambda_ * np.sign(theta)
            grad = (1 / m) * X.T.dot(error) + regularization_term
            return grad

        def gradient_descent_L1(X, y, theta, lambda_, learning_rate, n_iterations):
            cost_history = np.zeros(n_iterations)
            for i in range(n_iterations):
                theta -= learning_rate * grad_L1(X, y, theta, lambda_)
                cost_history[i] = cost_function_L1(X, y, theta, lambda_)
            return theta, cost_history

        # A copy is passed because the inner loop updates theta in place
        self.theta_final_L1, self.cost_history_L1 = gradient_descent_L1(self.X_train_poly_norm, self.y_train, self.theta.copy(), lambda_, learning_rate, n_iterations)

    def l2_regularization(self, lambda_, learning_rate=0.05, n_iterations=10000):
        """Train with an L2 penalty; sets ``theta_final_L2`` and ``cost_history_L2``."""
        def cost_function_L2(X, y, theta, lambda_):
            m = len(y)
            error = self.model(X, theta) - y
            # (lambda / 2) * ||theta||^2, whose gradient is the
            # lambda * theta used in grad_L2
            regularization_term = (lambda_ / 2) * np.sum(theta ** 2)
            cost = (1 / (2 * m)) * np.sum(error ** 2) + regularization_term
            return cost

        def grad_L2(X, y, theta, lambda_):
            m = len(y)
            error = self.model(X, theta) - y
            regularization_term = lambda_ * theta
            grad = (1 / m) * X.T.dot(error) + regularization_term
            return grad

        def gradient_descent_L2(X, y, theta, lambda_, learning_rate, n_iterations):
            cost_history = np.zeros(n_iterations)
            for i in range(n_iterations):
                theta -= learning_rate * grad_L2(X, y, theta, lambda_)
                cost_history[i] = cost_function_L2(X, y, theta, lambda_)
            return theta, cost_history

        # A copy is passed because the inner loop updates theta in place
        self.theta_final_L2, self.cost_history_L2 = gradient_descent_L2(self.X_train_poly_norm, self.y_train, self.theta.copy(), lambda_, learning_rate, n_iterations)

    def compare_regularizations(self):
        """Print R^2 of the L1 and L2 models on the train and test splits."""
        cd_train_L1 = self._coef_determination(self.y_train, self.model(self.X_train_poly_norm, self.theta_final_L1))
        cd_test_L1 = self._coef_determination(self.y_test, self.model(self.X_test_poly_norm, self.theta_final_L1))
        cd_train_L2 = self._coef_determination(self.y_train, self.model(self.X_train_poly_norm, self.theta_final_L2))
        cd_test_L2 = self._coef_determination(self.y_test, self.model(self.X_test_poly_norm, self.theta_final_L2))

        print("L1 Regularization - Coefficient de détermination de training set =", cd_train_L1)
        print("L1 Regularization - Coefficient de détermination de test set =", cd_test_L1)
        print("L2 Regularization - Coefficient de détermination de training set =", cd_train_L2)
        print("L2 Regularization - Coefficient de détermination de test set =", cd_test_L2)


# Example usage:
if __name__ == "__main__":
    # Instantiate the PolynomialRegressionModel class
    # (the constructor already loads and splits the data)
    model = PolynomialRegressionModel(data_path="mouse_viral_study.csv", target_col="Med_2_mL", feature_col="Med_1_mL")

    # Step 1: Load and preprocess data
    # (repeats the load done by the constructor on the same file)
    model.load_data()

    # Step 2: Plot correlation matrix
    model.plot_corr_matrix()

    # Step 3: Plot data
    model.plot_data()

    # Step 4: Select best polynomial degree
    model.select_best_polynomial_degree()

    # Step 5: Prepare polynomial features
    model.prepare_polynomial_features(degree=model.best_degree)

    # Step 6: Train model
    model.train_model()

    # Step 7: Plot learning curve
    model.plot_learning_curve()

    # Step 8: Evaluate model
    model.evaluate_model()

    # Step 9: Apply L1 regularization
    model.l1_regularization(lambda_=0.017)

    # Step 10: Apply L2 regularization
    model.l2_regularization(lambda_=0.017)

    # Step 11: Compare regularizations
    model.compare_regularizations()


## Logistic Reg

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The computation never evaluates ``exp`` on a positive value, so it does
    not overflow for inputs of large magnitude.
    """
    x = np.asarray(x, dtype=float)
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # e^x / (1 + e^x). In both cases z = e^-|x| lies in (0, 1].
    z = np.exp(-np.abs(x))
    # [()] turns a 0-d result back into a scalar and leaves arrays as is
    return np.where(x >= 0, 1 / (1 + z), z / (1 + z))[()]

def initialization(X):
    """Draw random starting parameters for a model on the columns of ``X``.

    Returns ``(W, b)``: a column vector with one standard-normal weight per
    column of ``X``, and a one-element bias array.
    """
    n_features = X.shape[1]
    weights = np.random.randn(n_features, 1)
    bias = np.random.randn(1)
    return weights, bias

def model(X, W, b):
    """Return the sigmoid activation of the affine score ``X.dot(W) + b``."""
    return sigmoid(X.dot(W) + b)

def log_loss(y, A, epsilon=1e-15):
    """Return the mean binary cross-entropy of probabilities ``A`` for labels ``y``.

    ``A`` is clipped to ``[epsilon, 1 - epsilon]`` first, so a saturated
    probability of exactly 0 or 1 yields a large finite loss instead of
    ``inf``/``NaN`` from ``log(0)``.
    """
    A = np.clip(A, epsilon, 1 - epsilon)
    return 1/len(y) * np.sum(-y * np.log(A) - (1 - y) * np.log(1 - A))

def gradients(X, A, y):
    """Return ``(dW, db)``, the log-loss gradients for weights and bias.

    ``A`` holds the predicted probabilities for the rows of ``X`` and ``y``
    the matching labels.
    """
    n = len(y)
    error = A - y
    weight_grad = 1/n * np.dot(X.T, error)
    bias_grad = 1/n * np.sum(error)
    return weight_grad, bias_grad

def optimization(X, W, b, A, y, learning_rate):
    """Return ``(W, b)`` after one gradient-descent step of size ``learning_rate``.

    New arrays are returned; the arguments are not modified.
    """
    grad_W, grad_b = gradients(X, A, y)
    updated_W = W - learning_rate * grad_W
    updated_b = b - learning_rate * grad_b
    return updated_W, updated_b

def predict(X, W, b):
    """Return a boolean array: True where the predicted probability is >= 0.5."""
    probabilities = model(X, W, b)
    return probabilities >= 0.5

def logistic_regression(X, y, learning_rate=0.1, n_iter=10000):
    """Train a logistic regression by gradient descent and return ``(W, b)``.

    Parameters start from a random draw. The log-loss measured before each
    update is collected and drawn on the current matplotlib figure (the
    figure is not shown here).
    """
    W, b = initialization(X)
    losses = []
    for _ in range(n_iter):
        activations = model(X, W, b)
        current_loss = log_loss(y, activations)
        losses.append(current_loss)
        W, b = optimization(X, W, b, activations, y, learning_rate)

    # Loss curve over the iterations
    plt.plot(losses)
    plt.xlabel('n_iteration')
    plt.ylabel('Log_loss')
    plt.title('Evolution des erreurs')
    return W, b

# Example usage:
if __name__ == "__main__":
    # Load dataset
    dataset = pd.read_csv("../mouse_viral_study.csv")
    X = dataset[['Med_1_mL', 'Med_2_mL']].values
    # Double brackets keep y as a column vector, matching the (n_features, 1)
    # weight vector created by initialization()
    y = dataset[['Virus Present']].values

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train logistic regression model
    W, b = logistic_regression(X_train, y_train, learning_rate=0.1, n_iter=10000)

    # Evaluate model
    y_pred_test = predict(X_test, W, b)
    accuracy = accuracy_score(y_test, y_pred_test)
    print("Accuracy =", accuracy)

    # Plot decision boundary
    fig, ax = plt.subplots(figsize=(9, 6))
    ax.scatter(X_train[:,0], X_train[:, 1], c=y_train, cmap='winter')
    x1 = np.linspace(0, 10, 200)
    # Boundary is where W[0]*x1 + W[1]*x2 + b = 0, solved for x2
    x2 = (-W[0] * x1 - b) / W[1]
    ax.plot(x1, x2, c='orange', lw=3)
    plt.show()

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred_test)
    ax = plt.axes()
    sns.heatmap(cm, annot=True, annot_kws={"size": 30}, fmt='d', cmap="Blues", ax=ax)
    ax.set_title('Confusion Matrix')
    plt.show()


## Logistic Reg (features polynomiales)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

class CustomLogisticRegression:
    """Logistic regression on polynomial combinations of two standardized features.

    Wraps scikit-learn's StandardScaler and LogisticRegression; the
    polynomial terms are produced by mapFeature().
    """

    def __init__(self):
        # All three are set by fit()
        self.scaler = None
        self.model = None
        self.degree = None

    def mapFeature(self, X1, X2, degree):
        """Return the polynomial feature matrix of ``X1`` and ``X2``.

        The first column is all ones; it is followed, for every total degree
        ``i`` from 1 to ``degree``, by the terms ``X1**(i-j) * X2**j`` for
        ``j = 0..i``.
        """
        # Collect the columns first and stack once, instead of rebuilding the
        # whole matrix for every new column.
        columns = [np.ones(X1.shape[0])]
        for i in range(1, degree + 1):
            for j in range(0, i + 1):
                columns.append((X1 ** (i - j)) * (X2 ** j))
        return np.column_stack(columns)

    def fit(self, X_train, y_train, degree=1):
        """Fit the scaler and the classifier on two-column ``X_train``.

        ``degree`` is remembered so that predict() builds the same features.
        """
        self.degree = degree
        self.scaler = StandardScaler()
        X_train_normalized = self.scaler.fit_transform(X_train)
        Poly_X_train = self.mapFeature(X_train_normalized[:, 0], X_train_normalized[:, 1], degree)
        self.model = LogisticRegression(max_iter=10000)
        # Flatten the labels: a column vector would otherwise trigger a
        # shape-conversion warning in scikit-learn
        self.model.fit(Poly_X_train, np.ravel(y_train))

    def predict(self, X_test):
        """Return the predicted class of every row of ``X_test``."""
        X_test_normalized = self.scaler.transform(X_test)
        # Use the degree stored by fit() (a bare `degree` is not defined here)
        Poly_X_test = self.mapFeature(X_test_normalized[:, 0], X_test_normalized[:, 1], self.degree)
        return self.model.predict(Poly_X_test)

    def evaluate(self, X_test, y_test):
        """Print the accuracy on ``(X_test, y_test)`` and show the confusion matrix."""
        y_pred = self.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy:", np.round(accuracy * 100), "%")
        cm = confusion_matrix(y_test, y_pred)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.show()

# Example usage:
if __name__ == "__main__":
    # Load dataset
    dataset = pd.read_csv("./iris/iris.csv")
    # Two petal measurements as features, the 'class' column as label
    X = dataset[['petal_length', 'petal_width']].values
    y = dataset[['class']].values

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the model
    model = CustomLogisticRegression()
    model.fit(X_train, y_train, degree=5)

    # Evaluate the model
    model.evaluate(X_test, y_test)
