In [1]:
from google.colab import files
uploaded = files.upload()

Saving kidney_disease.csv to kidney_disease.csv


**K-fold** **Cross** **Validation**

In [7]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Load dataset
file_path = "kidney_disease.csv"
df = pd.read_csv(file_path)

# Drop ID column if exists
if "id" in df.columns:
    df = df.drop(columns=["id"])

# Handle missing values
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna(df[col].mode()[0])  # Fill categorical with mode
    else:
        df[col] = df[col].fillna(df[col].mean())     # Fill numeric with mean

# Encode categorical variables
for col in df.columns:
    if df[col].dtype == "object":
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop("classification", axis=1)
y = df["classification"]

# K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create pipeline: scaling + logistic regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Cross-validation
scores = cross_val_score(pipeline, X, y, cv=kf, scoring='accuracy')

print("Fold scores:", scores)
print(f"Average accuracy: {scores.mean():.2f}")

Fold scores: [0.9875 1.     0.975  0.9875 0.9625]
Average accuracy: 0.98
