In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic binary classification problem: 1000 samples, 10 features
# (8 informative + 2 redundant), with class weights of 0.9 / 0.1 so the
# positive class is a small minority.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Hold out 25% of the samples. stratify=y keeps the minority-class share the
# same in the train and test parts; a purely random split gives no such
# guarantee when one class is this rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [3]:
from collections import Counter

# Tally the number of samples per class label to show how imbalanced y is.
Counter(y)

Counter({np.int64(0): 897, np.int64(1): 103})

In [5]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Print the class counts of each fold's test part. Only the test labels are
# needed for that, so index y directly rather than rebinding X_train / X_test /
# y_train / y_test -- doing so would silently replace the hold-out split made
# earlier with whatever the last fold happened to contain.
for train_idx, test_idx in kf.split(X, y):
    print(Counter(y[test_idx]))

Counter({np.int64(0): 177, np.int64(1): 23})
Counter({np.int64(0): 179, np.int64(1): 21})
Counter({np.int64(0): 183, np.int64(1): 17})
Counter({np.int64(0): 181, np.int64(1): 19})
Counter({np.int64(0): 177, np.int64(1): 23})


In [8]:
# StratifiedKFold keeps the class ratio nearly the same in every fold, unlike plain KFold
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Print the class counts of each stratified fold's test part. Only the test
# labels are needed, so index y directly rather than rebinding X_train /
# X_test / y_train / y_test -- doing so would silently replace the hold-out
# split made earlier with whatever the last fold happened to contain.
for train_idx, test_idx in skf.split(X, y):
    print(Counter(y[test_idx]))

Counter({np.int64(0): 180, np.int64(1): 20})
Counter({np.int64(0): 180, np.int64(1): 20})
Counter({np.int64(0): 179, np.int64(1): 21})
Counter({np.int64(0): 179, np.int64(1): 21})
Counter({np.int64(0): 179, np.int64(1): 21})


In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Score a default logistic regression on each stratified fold, then show the
# mean of the per-fold scores.
score_lr = cross_val_score(
    estimator=LogisticRegression(),
    X=X,
    y=y,
    cv=skf,
)
score_lr.mean()

np.float64(0.9019999999999999)

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Score a decision tree on each stratified fold and show the mean score.
# The tree is seeded like every other stochastic step in this notebook so the
# reported number is the same on every run.
score_dt = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=skf)
np.average(score_dt)

np.float64(0.891)

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Score a random forest on each stratified fold and show the mean score.
# The forest is seeded like every other stochastic step in this notebook so
# the reported number is the same on every run.
score_rf = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=skf)
np.average(score_rf)

np.float64(0.913)