<a href="https://colab.research.google.com/github/Sahil2004/cs-practicals/blob/main/6th%20sem/%5B01%5D%20Data%20Mining%20and%20Analysis/%5B12%5D%20Practical%208.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Fold Cross Validation

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('/content/CO22358_lung_cancer.csv')
X = dataset.iloc[:, 0:-1].values
y = dataset.iloc[:, -1].values

## Handling the missing values

In [3]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy = 'mean', missing_values = np.nan)
imputer=imputer.fit(X[:,4:])
X[:,4:]=imputer.transform(X[:,4:])

## Encoding the non-numerical values

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Convert all categorical columns except the last one using LabelEncoder
labelencoder_X = LabelEncoder()
for col in range(X.shape[1] - 1):  # All columns except the last
    X[:, col] = labelencoder_X.fit_transform(X[:, col])

# Convert the last column ('YES'/'NO') to 0/1
X[:, -1] = np.where(X[:, -1] == 'YES', 1, 0).astype(int)

# Optionally, apply OneHotEncoder to the first few columns if needed (skip the last one)
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), list(range(X.shape[1] - 1)))],
    remainder='passthrough'
)
X = ct.fit_transform(X)

# Encode target variable y
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

## Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X)

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## Training the Kernel SVM model on the Training set

In [7]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [8]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [9]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 0 10]
 [ 0 68]]


## Applying k-Fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train,
cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 86.58 %
Standard Deviation: 1.33 %


## Applying k-Fold Cross Validation with Shuffle Split

In [11]:
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train,
cv = cv)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 88.62 %
Standard Deviation: 2.58 %


## Repeated K-Fold

In [12]:
from sklearn.model_selection import RepeatedKFold

rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = rkf)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 87.24 %
Standard Deviation: 2.12 %


## Leave One Out k-fold

In [13]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = loo)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 86.58 %
Standard Deviation: 34.09 %


## Stratified k-fold

In [14]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)

accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = skf)

print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 87.88 %
Standard Deviation: 2.21 %
