# Libraries 

In [1]:
# librairie de base
import pandas as pd
import numpy as np

# librairie pour le preprocessing
from sklearn.preprocessing import StandardScaler

# librairie pour la modélisation
from sklearn.datasets import load_iris # Iris data
from sklearn.model_selection import train_test_split # split into two sample : training and test
from sklearn.model_selection import cross_val_score # split into two sample : validation and training
from sklearn.linear_model import LogisticRegression # Logistic Regression

# librairie pour l'optimisation des hyperparamètres
import optuna

# Import data

In [2]:
# Charger les données iris 
iris = load_iris()
# Créer un DataFrame pandas à partir des données et des noms des colonnes
data_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names) # features
data_iris["target"] = iris.target
# Afficher les 10 premières lignes des données iris
print(data_iris.head(20))

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                 5.1               3.5                1.4               0.2   
1                 4.9               3.0                1.4               0.2   
2                 4.7               3.2                1.3               0.2   
3                 4.6               3.1                1.5               0.2   
4                 5.0               3.6                1.4               0.2   
5                 5.4               3.9                1.7               0.4   
6                 4.6               3.4                1.4               0.3   
7                 5.0               3.4                1.5               0.2   
8                 4.4               2.9                1.4               0.2   
9                 4.9               3.1                1.5               0.1   
10                5.4               3.7                1.5               0.2   
11                4.8               3.4 

# Cleaning

In [3]:
# Voir les valeurs possible de target 
print(data_iris['target'].unique())
# Ajouter le nom des fleurs à partir de la colonne target
data_iris['flower_name'] = data_iris['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

[0 1 2]


In [4]:
# Duplicated 

## Recherche des éventuelles duplicated
duplicates = data_iris[data_iris.duplicated()]
print("Doublons :")
print(duplicates)

## Suppression des duplicated
data_iris = data_iris.drop_duplicates(keep='first')

Doublons :
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
142                5.8               2.7                5.1               1.9   

     target flower_name  
142       2   virginica  


In [5]:
# Recherche d'éventuel valeurs manquantes selon les colonnes
missing_values = data_iris.isnull().sum()
print("Valeurs manquantes :")
print(missing_values)

Valeurs manquantes :
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
flower_name          0
dtype: int64


# Pre-processing

In [6]:
X = data_iris.drop(['target', 'flower_name'], axis=1) # feature
y = data_iris['target'] # target

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

# Modeling

### Logistic Regression

In [9]:
## Logitistic Regression (penalty 'l1')

#### Define model parameter
logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear') # Logistic regression with penalty 'l1' 

#### Fitting
logreg_l1.fit(X_train_scaled, y_train)

### Prediction 
y_pred_logreg_l1 = logreg_l1.predict(X_test_scaled)

### Hyperparameter turning

In [10]:
## Logitistic Regression (penalty 'elasticnet')

#### Définir la fonction objective
def objective(trial):
    l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)  # Exploration de l'hyperparamètre l1_ratio entre 0 et 1
    
    # Créer le modèle de régression logistique avec penalty='elasticnet'
    model = LogisticRegression(penalty='elasticnet', l1_ratio=l1_ratio, solver='saga')
    
    # Calculer le score de validation croisée (utilisation de la précision pour l'exemple)
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy').mean()
    return score

#### Créer un objet Optuna pour l'optimisation
study = optuna.create_study(direction='maximize')  # On cherche à maximiser la précision

#### Exécuter l'optimisation
study.optimize(objective, n_trials=100)

#### Afficher les meilleurs hyperparamètres et le score associé
print("Meilleurs hyperparamètres:", study.best_params)
print("Meilleur score de précision:", study.best_value)

[I 2024-05-08 21:28:51,529] A new study created in memory with name: no-name-a097e1a0-07df-4e80-adaa-e96311e283a2
  l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)  # Exploration de l'hyperparamètre l1_ratio entre 0 et 1
[I 2024-05-08 21:28:51,550] Trial 0 finished with value: 0.9579710144927537 and parameters: {'l1_ratio': 0.5141360170651099}. Best is trial 0 with value: 0.9579710144927537.
  l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)  # Exploration de l'hyperparamètre l1_ratio entre 0 et 1
[I 2024-05-08 21:28:51,568] Trial 1 finished with value: 0.9496376811594203 and parameters: {'l1_ratio': 0.8553979522554006}. Best is trial 0 with value: 0.9579710144927537.
  l1_ratio = trial.suggest_uniform('l1_ratio', 0, 1)  # Exploration de l'hyperparamètre l1_ratio entre 0 et 1
[I 2024-05-08 21:28:51,585] Trial 2 finished with value: 0.9496376811594203 and parameters: {'l1_ratio': 0.8771624823352718}. Best is trial 0 with value: 0.9579710144927537.
  l1_ratio = trial.suggest_uniform

Meilleurs hyperparamètres: {'l1_ratio': 0.5141360170651099}
Meilleur score de précision: 0.9579710144927537


In [11]:
#### Define model parameter
logreg_elastic_net = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=study.best_params['l1_ratio']) # Logistic regression with penalty 'elacticnet'

#### Fitting
logreg_elastic_net.fit(X_train_scaled, y_train)

### Prediction 
y_pred_logreg_elastic_net = logreg_elastic_net.predict(X_test_scaled)