# Ressources

In [199]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline


# Fonctions
## Selection numérique

In [200]:
def df_filtre_numerical(df:pd.DataFrame):
    """Identifie dans un DataFrame les colonnes numériques.

    Parameters
    -----------
    df: :class:`DataFrame`
            Le DataFrame
            
    Return
    ----------
    Liste des colonnes numériques
    
    Exemple
    ----------
    numerical_columns = df_filtre_numerical(df)
    """
    numerical_columns = df.select_dtypes(['int64', 'float64']).columns

    return numerical_columns

## Standardisation

In [201]:
def df_scaling_numeric(df:pd.DataFrame, numerical_columns:list):
    """Standardise et normalise les données numériques avec un standardScaler

    Parameters
    -----------
    df: :class:`DataFrame`
            Le DataFrame
    numerical_columns: :class:`Liste`
            Les colonnes à standardiser
            
    Return
    ----------
    Dataframe avec les colonnes standardisées
    
    Exemple
    ----------
    df[numerical_columns] = df_scaling_numeric(df, numerical_columns)
    """
    ss = StandardScaler()
    df[numerical_columns] = ss.fit_transform(df[numerical_columns])
    
    # exemple : df[numerical_columns] = df_scaling_numeric(df, numerical_columns)

    return df[numerical_columns]

# Dataset

In [202]:
iris = pd.read_csv("iris.csv")


In [203]:
iris

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Variables numérique

In [204]:
var_numerique = df_filtre_numerical(iris) 
var_numerique

Index(['sepallength', 'sepalwidth', 'petallength', 'petalwidth'], dtype='object')

In [205]:
iris_pipeline = iris.copy() # pour entrainement avec pipeline

### Standardisation

In [206]:
iris[var_numerique] = df_scaling_numeric(iris, var_numerique)

In [207]:
iris

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,-0.900681,1.032057,-1.341272,-1.312977,Iris-setosa
1,-1.143017,-0.124958,-1.341272,-1.312977,Iris-setosa
2,-1.385353,0.337848,-1.398138,-1.312977,Iris-setosa
3,-1.506521,0.106445,-1.284407,-1.312977,Iris-setosa
4,-1.021849,1.263460,-1.341272,-1.312977,Iris-setosa
...,...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956,Iris-virginica
146,0.553333,-1.281972,0.705893,0.922064,Iris-virginica
147,0.795669,-0.124958,0.819624,1.053537,Iris-virginica
148,0.432165,0.800654,0.933356,1.447956,Iris-virginica


## Variables target

In [208]:
target = iris['class']

# Prédiction

## Séparation dataset

In [209]:
X_train, X_test, y_train, y_test = train_test_split(iris.drop(['class'], axis=1),
                                                    target,
                                                    test_size=0.33,
                                                    random_state=0) 

<center><img src="scikit-learn choisir algo.png" width="700" height="400" /></center>

On va tester un SDG classifier

### Entrainement sans pipeline

In [210]:
SDG = SGDClassifier(max_iter=1000,  tol=1e-3)
model = SDG.fit(X_train, y_train)


#### Score

In [211]:
score = model.score(X_test, y_test)

score

0.94

#### Cross validation

In [212]:
cross_validation = cross_val_score(SDG, X_test, y_test, cv=5).mean()
cross_validation

0.86

#### Prediction

In [213]:
iris['predict'] = model.predict(iris.drop(['class'], axis=1))
iris

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class,predict
0,-0.900681,1.032057,-1.341272,-1.312977,Iris-setosa,Iris-setosa
1,-1.143017,-0.124958,-1.341272,-1.312977,Iris-setosa,Iris-setosa
2,-1.385353,0.337848,-1.398138,-1.312977,Iris-setosa,Iris-setosa
3,-1.506521,0.106445,-1.284407,-1.312977,Iris-setosa,Iris-setosa
4,-1.021849,1.263460,-1.341272,-1.312977,Iris-setosa,Iris-setosa
...,...,...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956,Iris-virginica,Iris-virginica
146,0.553333,-1.281972,0.705893,0.922064,Iris-virginica,Iris-virginica
147,0.795669,-0.124958,0.819624,1.053537,Iris-virginica,Iris-virginica
148,0.432165,0.800654,0.933356,1.447956,Iris-virginica,Iris-virginica


### Entrainement pipeline

#### Séparation

In [214]:
X_train, X_test, y_train, y_test = train_test_split(iris_pipeline.drop(['class'], axis=1),
                                                    target,
                                                    test_size=0.33,
                                                    random_state=0) 

In [215]:
# Always scale the input. The most convenient way is to use a pipeline.

clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3))

clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier', SGDClassifier())])

In [216]:
# dans le cas de traitement différent par colonne :

# from sklearn.compose import make_column_transformer

# transformer = make_column_transformer((StandardScaler(), ['sepallength', 'sepalwidth']),
#                                       (SimpleImputer(strategy= 'most_frequent'), [...]))

# clf = make_pipeline(transformer,
#                     SGDClassifier(max_iter=1000, tol=1e-3))

# clf.fit(X_train, y_train)



In [217]:
# encore plus avancé :

# numerical_features = ['sepallength', 'sepalwidth']
# categorical_features = [...]


# numerical_pipeline = make_pipeline(SimpleImputer(),
#                                    StandardScaler())
# categorical_pipeline = make_pipeline(SimpleImputer(strategy = 'most_frequent'),
#                                    OneHotEncoder())


# preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
#                         (categorical_pipeline, categorical_features))


# clf = make_pipeline(preprocessor, SDGClassifier())

# clf.fit(X_trai, y_train)

In [218]:
# encore plus avancé :

from sklearn.compose import make_column_selector

# numerical_features = make_column_selector(dtype_include=np.number)
# categorical_features = make_column_selector(dtype_exclude=np.number)


# numerical_pipeline = make_pipeline(SimpleImputer(),
#                                    StandardScaler())
# categorical_pipeline = make_pipeline(SimpleImputer(strategy = 'most_frequent'),
#                                    OneHotEncoder())


# preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
#                         (categorical_pipeline, categorical_features))


# clf = make_pipeline(preprocessor, SDGClassifier())

# clf.fit(X_train, y_train)

#### Score

In [219]:
clf.score(X_test, y_test)

0.96

#### Cross validation

In [220]:
cross_validation = cross_val_score(clf, X_test, y_test, cv=5).mean()
cross_validation

0.9400000000000001

In [221]:
iris_pipeline['predict'] = clf.predict(iris_pipeline.drop(['class'], axis=1))
iris_pipeline

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class,predict
0,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica,Iris-virginica
