In [114]:
from sklearn.datasets import load_wine

# Load the wine dataset; as_frame=True asks for pandas objects, which is why
# later cells can call .head() on the features and the target prints as a Series.
wine = load_wine(as_frame=True)

In [116]:
# Show the dataset's bundled description text (attributes, classes, summary stats).
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

In [118]:
# List the fields available on the loaded dataset object.
print(wine.keys())


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [120]:
# Names of the 13 predictive feature columns.
print(wine.feature_names)


['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [122]:
# Names of the three target classes.
print(wine.target_names)


['class_0' 'class_1' 'class_2']


In [124]:
# Peek at the first five targets to confirm they are stored as integers.
print(wine.target[:5])  # Numerical class labels


0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32


In [144]:
from sklearn.model_selection import train_test_split

# Feature matrix and target vector used by every modelling step below.
X = wine.data
y = wine.target

# Hold out a test set for the final evaluation only. test_size is left at the
# scikit-learn default (25% of the samples) and random_state pins the shuffle
# so the split, and every score derived from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [146]:
# Preview the training features; the row labels are the original dataset
# indices, now in shuffled order after the split.
X_train.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
100,12.08,2.08,1.7,17.5,97.0,2.23,2.17,0.26,1.4,3.3,1.27,2.96,710.0
122,12.42,4.43,2.73,26.5,102.0,2.2,2.13,0.43,1.71,2.08,0.92,3.12,365.0
154,12.58,1.29,2.1,20.0,103.0,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640.0
51,13.83,1.65,2.6,17.2,94.0,2.45,2.99,0.22,2.29,5.6,1.24,3.37,1265.0


In [148]:
# Preview the matching training labels (same row labels as X_train).
y_train.head(10)

2      0
100    1
122    1
154    2
51     0
76     1
56     0
26     0
153    2
138    2
Name: target, dtype: int32

In [150]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

# Baseline model: standardize the features, then fit a linear SVM.
# make_pipeline names each step after its lowercased class, so the classifier
# is addressable as "linearsvc" (used later for the "linearsvc__C" parameter).
clf = make_pipeline(
    StandardScaler(),
    LinearSVC(
        dual=False,          # solve the primal problem
        C=1,
        max_iter=1_000_000,  # generous cap on solver iterations
        random_state=42,
    ),
)
clf.fit(X_train, y_train)


In [152]:
from sklearn.model_selection import cross_val_score



# Average the per-fold cross-validation scores, computed on the training data
# only so the held-out test set stays untouched.
cross_val_score(clf, X_train, y_train).mean()

0.9774928774928775

In [154]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

# Sample C log-uniformly across six orders of magnitude (1e-3 .. 1e3).
# "linearsvc" is the step name make_pipeline derives from the LinearSVC class.
param_distrib = {
    "linearsvc__C": loguniform(1e-3, 1e3),
}

rnd_search = RandomizedSearchCV(
    clf,                     # the scaler + LinearSVC pipeline
    param_distributions=param_distrib,
    n_iter=100,              # try 100 random values of C
    cv=5,                    # 5-fold cross-validation
    scoring="accuracy",      # metric to optimize
    random_state=42,         # makes the sampled C values reproducible
    n_jobs=-1,               # use all available cores
)

rnd_search.fit(X_train, y_train)

# Display the pipeline refit with the best C found by the search.
rnd_search.best_estimator_


In [156]:
# Report the mean cross-validated accuracy of the best C found by the search.
print("Best CV score:", rnd_search.best_score_)

Best CV score: 0.9849002849002849


In [159]:
# Final check on the held-out test set: score() evaluates the refit best
# estimator using the search's scoring metric (accuracy).
rnd_search.score(X_test, y_test)

0.9777777777777777