# k-folds Cross Validation

In [27]:
# Loading our dataset
#
wine = pd.read_csv("winequality-red.csv")
wine.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [25]:
from sklearn.model_selection import KFold

X = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',\
          'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',\
          'pH', 'sulphates', 'alcohol']].values
y = wine['quality'].values

folds = KFold(n_splits=10)

# Get number of folds
#
print('we are using ' +str(folds.get_n_splits(X)) + ' folds')

# Keeping track of the RMSE
RMSES = []
count = 1

for train_index, test_index in folds.split(X):
  print('\nTraining model ' + str(count))

  # set up the train and test based on the split determined by KFold
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  # Fit our model
  regressor = LinearRegression()
  regressor.fit(X_train, y_train)

  # Predict and Check accuracy of the model
  y_pred = regressor.predict(X_test)

  rmse_value = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
  RMSES.append(rmse_value)

  print('Model ' + str(count) + ' Root Mean Squared Error:',rmse_value)
  count = count + 1

we are using 10 folds

Training model 1
Model 1 Root Mean Squared Error: 0.6862481195221853

Training model 2
Model 2 Root Mean Squared Error: 0.6324703434524646

Training model 3
Model 3 Root Mean Squared Error: 0.6850468988873284

Training model 4
Model 4 Root Mean Squared Error: 0.6570566965861605

Training model 5
Model 5 Root Mean Squared Error: 0.6137235391805054

Training model 6
Model 6 Root Mean Squared Error: 0.7236010030500887

Training model 7
Model 7 Root Mean Squared Error: 0.632819730557344

Training model 8
Model 8 Root Mean Squared Error: 0.6553239884798194

Training model 9
Model 9 Root Mean Squared Error: 0.6014941518907087

Training model 10
Model 10 Root Mean Squared Error: 0.698513075772105


# Leave One Out Cross Validation

In [28]:
from sklearn.model_selection import LeaveOneOut

X = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',\
          'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',\
          'pH', 'sulphates', 'alcohol']].values
y = wine['quality'].values

folds = LeaveOneOut()

# Get number of folds
#
print('we are using ' +str(folds.get_n_splits(X)) + ' folds')

# Keeping track of the RMSE
RMSES = []
count = 0

for train_index, test_index in folds.split(X):

   # set up the train and test based on the split determined by KFold
   X_train, X_test = X[train_index], X[test_index]
   y_train, y_test = y[train_index], y[test_index]

   # Fit our model
   regressor = LinearRegression()
   regressor.fit(X_train, y_train)

   # Predict and Check accuracy of the model
   y_pred = regressor.predict(X_test)

   rmse_value = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
   RMSES.append(rmse_value)

   count += 1

print('trained ' +str(count) + ' models\n')
print(f"Mean RMSES: {np.mean(RMSES)}")

we are using 1599 folds
trained 1599 models

Mean RMSES: 0.5046835224228031
