### Importing the Required Libraries

In [2]:
import pandas as pd
import numpy as np

### Importing Dataset

In [6]:
# Location of the red-wine quality CSV, relative to the notebook's working directory.
csv_path = "Desktop/winequality-red.csv"

# sep=';' names the field delimiter so that pandas parses the columns properly.
dataset = pd.read_csv(csv_path, sep=';')

# Preview the first rows as a sanity check of the parsed columns.
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### Data Preprocessing

In [7]:
# The first 11 columns are used as features; the column at position 11
# ("quality" in the preview above) is the target.
n_features = 11
x = dataset.iloc[:, :n_features].values
y = dataset.iloc[:, n_features].values

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Every sample is kept for training: the model is assessed with cross-validation
# further down, so no separate hold-out split is made.  Recent scikit-learn
# releases reject train_test_split(test_size=0), so the rows are shuffled
# explicitly (same seed as before) and the test arrays are created as empty
# placeholders with the matching column layout.
x_train, y_train = shuffle(x, y, random_state=0)
x_test, y_test = x[:0], y[:0]

In [10]:
# Inspect the test-split features; the array is expected to be empty because
# every sample was assigned to the training set.
x_test

array([], shape=(0, 11), dtype=float64)

###  Scaling the Data

In [11]:
from sklearn.preprocessing import StandardScaler
# Standardise the feature columns using statistics learned from the training data.
# NOTE(review): the scaler is fitted on all training rows before cross-validation,
# so the validation rows of every fold contribute to the scaling statistics.
# Wrapping the scaler and the model in a Pipeline would avoid that.
feature_scaler = StandardScaler()
feature_scaler.fit(x_train)
x_train = feature_scaler.transform(x_train)

# The matching transform of the test split is left disabled
# (presumably because x_test is empty here -- confirm):
# x_test = feature_scaler.transform(x_test)


### Training and Cross Validation 

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Baseline random forest: 300 trees, with a fixed seed so repeated runs match.
classifier = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
)

In [17]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of the baseline classifier (one score per fold).
all_accuracies = cross_val_score(
    classifier,
    x_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
print(all_accuracies)

[0.71428571 0.68535826 0.70716511 0.68238994 0.68454259]


In [18]:
# Average accuracy over the cross-validation folds.
print(np.mean(all_accuracies))

0.6947483205258805


In [19]:
# Spread of the per-fold accuracies; a small value means the folds agree.
print(np.std(all_accuracies))

0.013273630675169781


### A machine learning model has two types of parameters. The first type are the parameters that the model learns from the data during training, while the second type are the hyperparameters that we pass to the model before training.

### Instead of choosing hyperparameter values by trial and error, a better approach is to use an algorithm that automatically finds the best values for a particular model. Grid Search is one such algorithm: it evaluates every combination from a user-supplied grid of candidate values.

### The Grid Search algorithm can be very slow, owing to the potentially huge number of combinations to test. Cross-validation increases the execution time further, because every combination is trained and evaluated once per fold.

### Grid Search with Scikit-Learn

In [21]:
from sklearn.model_selection import GridSearchCV
# Candidate hyperparameter values for the search:
# 5 forest sizes x 2 split criteria x 2 bootstrap settings = 20 combinations.
grid_param = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    criterion=['gini', 'entropy'],
    bootstrap=[True, False],
)

In [22]:
# Exhaustive search over grid_param, scoring every combination by
# 5-fold cross-validated accuracy.  n_jobs=-1 runs the fits in parallel
# on all available processors.
gd_sr = GridSearchCV(
    classifier,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [24]:
# Run the search on the scaled training data.  Kept as the last expression
# so the fitted search object is displayed as the cell output.
gd_sr.fit(X=x_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 300, 500, 800, 1000], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
# Hyperparameter combination that the grid search selected as best
# (the search above is scored by accuracy).
best_parameters = gd_sr.best_params_
print(best_parameters)

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 1000}


In [26]:
# Score achieved by the selected combination; compare it with the baseline
# cross-validation mean printed earlier.
best_result = gd_sr.best_score_
print(best_result)

0.6979362101313321


### To identify the best hyperparameters for a model, we can use the Grid Search algorithm; comparing the best scores of several tuned models also helps to identify the best algorithm.