# Étude des données WINE

Dans ce notebook, vous trouverez les prévisions réalisées pour les données sur le vin. 

Voici les étapes de la procédure : 
- importer les librairies
- importer les données 
- nettoyage des données
    - traitement des doublons 
    - vérification de la présence ou non de valeurs manquantes
    - vérification de l'influence des valeurs aberrantes
- analyse descriptive
    - étude des corrélations 
    - étude de la répartition des classes de vin
- Division des données en deux échantillons + normalisation 
- Modélisation avec les classifiers 
- Prévisions

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# preprocessing
from sklearn.preprocessing import StandardScaler

# modelling
from sklearn.datasets import load_wine # Wine dataset loader
from sklearn.model_selection import train_test_split # split into two samples: training and test
from sklearn.model_selection import cross_val_score # cross-validated scoring (repeated training/validation splits)
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.ensemble import RandomForestClassifier # Random Forest classifier
from sklearn.svm import SVC, LinearSVC # support vector classifiers
from sklearn.neighbors import KNeighborsClassifier # k-nearest neighbours
from sklearn.naive_bayes import GaussianNB # Gaussian naive Bayes
from sklearn.linear_model import Perceptron # Perceptron
from sklearn.linear_model import SGDClassifier # linear classifier trained with SGD
from sklearn.tree import DecisionTreeClassifier # decision tree

# evaluation and tuning
from sklearn.metrics import accuracy_score
import optuna

# visualisation
import matplotlib.pyplot as plt  

# joblib — presumably used further down to persist trained models (TODO confirm)
import joblib

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the scikit-learn wine dataset; the returned object exposes
# .data (feature matrix), .feature_names and .target (class labels).
wine = load_wine()
# Assemble a single DataFrame: feature columns first, then the class label as "target".
data_wine = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)
# Preview the first 20 rows of the assembled frame.
data_wine.head(20)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
5,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0,0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0,0
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0,0
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0,0
9,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0,0


In [10]:
# Non-null count per column, followed by the (rows, columns) shape of the frame.
print(data_wine.count(), data_wine.shape, sep="\n")

alcohol                         178
malic_acid                      178
ash                             178
alcalinity_of_ash               178
magnesium                       178
total_phenols                   178
flavanoids                      178
nonflavanoid_phenols            178
proanthocyanins                 178
color_intensity                 178
hue                             178
od280/od315_of_diluted_wines    178
proline                         178
target                          178
dtype: int64
(178, 14)


In [8]:
# Per-column mean of the features; the "target" label column is left out.
data_wine.drop(columns=["target"]).mean(numeric_only=True)

alcohol                          13.000618
malic_acid                        2.336348
ash                               2.366517
alcalinity_of_ash                19.494944
magnesium                        99.741573
total_phenols                     2.295112
flavanoids                        2.029270
nonflavanoid_phenols              0.361854
proanthocyanins                   1.590899
color_intensity                   5.058090
hue                               0.957449
od280/od315_of_diluted_wines      2.611685
proline                         746.893258
dtype: float64

In [9]:
# Descriptive statistics: first quartile, median and third quartile of each
# feature column (the "target" label column is excluded).
(
    data_wine
    .drop(columns=["target"])
    .quantile(q=[0.25, 0.5, 0.75], numeric_only=True)
)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0.1,11.933,1.247,2.0,16.0,85.0,1.471,0.607,0.217,0.854,2.549,0.61,1.58,406.7
0.25,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
0.5,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
0.75,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
0.9,14.1,3.983,2.7,24.0,118.0,3.044,3.233,0.53,2.305,8.53,1.233,3.456,1261.5
