# **Cross-Validation**

**By Philip Blumin and Paul Cucchiara**

# **Library Imports**

In [None]:
import pandas as pd
import numpy as np
import heapq
from numpy import random
from sklearn.neighbors import KNeighborsClassifier
import scipy.stats
from sklearn import metrics

# **Method 1 (Wrong method)**

In the wrong method, we generated random data and thresholded every entry to 0 or 1. We then selected features based on their correlation with the labels, using all of the samples. After the 100 best features were selected, a new feature matrix was built from them and split into 5 folds of 10 samples each. Lastly, a K-nearest-neighbor classifier was evaluated on each fold, and the whole procedure was repeated for 50 different datasets. The average accuracy over all folds and datasets is printed below.

In [None]:
#Wrong Method
#
# Demonstrates the *incorrect* way to cross-validate: the 100 "best" features
# are picked by their correlation with the labels using EVERY sample (including
# the samples that are later held out), and only afterwards is 5-fold
# cross-validation run on the reduced feature matrix. Because the labels are
# pure noise, any accuracy far above 50% is an artifact of that leakage.

badclassifier = KNeighborsClassifier(n_neighbors = 1, metric = 'minkowski', p = 2)

numTrials = 50       # independent random datasets
numSamples = 50      # rows per dataset
numFeatures = 5000   # feature columns (one extra column holds the label)
numSelected = 100    # features kept after correlation screening
numFolds = 5
foldSize = numSamples // numFolds

#--------Creating data--------------#

tests = []
for times in range(numTrials):
  # Uniform draws thresholded at 0.5 -> a matrix of 0.0 / 1.0 values.
  allData = (np.random.random((numSamples, numFeatures + 1)) > 0.5).astype(float)

  featureMatrix = allData[:, :-1] # input
  labelMatrix = allData[:, -1]    # output (last column), already 1-D

  wrongModelAccuracy = []

  #--------Getting 100 best features--------------#

  # Pearson correlation of every feature column with the labels, computed in
  # one vectorised pass instead of one pandas Series per column.
  centeredFeatures = featureMatrix - featureMatrix.mean(axis=0)
  centeredLabels = labelMatrix - labelMatrix.mean()
  with np.errstate(divide='ignore', invalid='ignore'):
    correlationvals = (centeredFeatures.T @ centeredLabels) / np.sqrt(
        (centeredFeatures ** 2).sum(axis=0) * (centeredLabels ** 2).sum())
  # A constant column has an undefined (NaN) correlation; rank it last so it
  # can never be selected and never disturbs the ordering.
  correlationvals = np.where(np.isnan(correlationvals), -np.inf, correlationvals)

  # Indices of the largest correlations; the stable sort keeps the earliest
  # column on ties.
  bestFeatures = np.argsort(-correlationvals, kind='stable')[:numSelected]
  newFeatures = featureMatrix[:, bestFeatures]

  #--------K-folds--------------#

  for fold in range(numFolds):
    # Rows [fold*foldSize, (fold+1)*foldSize) are held out; the rest train.
    heldOut = np.zeros(numSamples, dtype=bool)
    heldOut[fold * foldSize:(fold + 1) * foldSize] = True

    # fit() replaces any previous training data, so the four training folds
    # must be passed together in a single call.
    badclassifier.fit(newFeatures[~heldOut], labelMatrix[~heldOut])
    ypred = badclassifier.predict(newFeatures[heldOut])
    wrongModelAccuracy.append(metrics.accuracy_score(labelMatrix[heldOut], ypred))
  tests.append(wrongModelAccuracy)


print("Average Accuracy: ")
# Mean over every (dataset, fold) pair.
print(np.mean(tests))

Average Accuracy: 
0.9612


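As we can see, the error for the k-folding is a minuscule 4% or so. This is far lower than the 50% error that purely random labels should give us, which tells us that the estimate is biased: the features were selected using the labels of all samples, including the ones later held out, so the models appear far better than they really are.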

# **Method 2 (correct way)**

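The process for the correct method is identical to that of the incorrect method, with two exceptions: the whole feature matrix is used (no label-based feature selection is performed before folding), and the classifier uses 5 neighbors instead of 1.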


In [None]:
# Correct Kfolding
#
# 5-fold cross-validation of a 5-nearest-neighbour classifier on random binary
# data, using the full feature matrix. No label-based feature selection is done
# before folding, so no label information from the held-out rows reaches the
# model that is evaluated on them.

classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
tests = []

numTrials = 50       # independent random datasets
numSamples = 50      # rows per dataset; only 50 rows were ever folded, so only 50 are generated
numFeatures = 5000   # feature columns (one extra column holds the label)
numFolds = 5
foldSize = numSamples // numFolds

#--------Creating data--------------#

for k in range(numTrials):
  # Uniform draws thresholded at 0.5 -> a matrix of 0.0 / 1.0 values.
  allData = (np.random.random((numSamples, numFeatures + 1)) > 0.5).astype(float)

  #--------Splitting data-------------#

  featureMatrix = allData[:, :-1] # input
  labelMatrix = allData[:, -1]    # output (last column), already 1-D

  #--------k-folds--------------#

  modelAccuracy = []
  for fold in range(numFolds):
    # Rows [fold*foldSize, (fold+1)*foldSize) are held out; the rest train.
    heldOut = np.zeros(numSamples, dtype=bool)
    heldOut[fold * foldSize:(fold + 1) * foldSize] = True

    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    # fit() replaces any previous training data, so the four training folds
    # must be passed together in a single call.
    classifier.fit(featureMatrix[~heldOut], labelMatrix[~heldOut])
    ypred = classifier.predict(featureMatrix[heldOut])
    modelAccuracy.append(metrics.accuracy_score(labelMatrix[heldOut], ypred))
  tests.append(modelAccuracy)

print("Average Accuracy: ")
# Mean over every (dataset, fold) pair.
print(np.mean(tests))


Average Accuracy: 
0.4988


Using the correct approach for k-folding, all features are used to form models using K-nearest neighbors. The average accuracy of all of the models is the expected 50%, indicating that the cross-validation estimate is unbiased: on purely random labels, no classifier can do better than chance.