In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 1. Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Create pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC())
])

# 3. Define parameter grid
param_grid = {
    'pca__n_components': [2, 3],
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

# 4. GridSearchCV
grid = GridSearchCV(pipe, param_grid)
grid.fit(X_train, y_train)

# 5. Results
print("Best parameters found:", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))


Best parameters found: {'classifier__C': 0.1, 'classifier__kernel': 'linear', 'pca__n_components': 3}
Best cross-validation score: 0.96
Test set score: 1.00


In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import randint, uniform
import numpy as np

# Load dataset
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers and their param grids
models_params = {
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {
            'classifier__n_estimators': randint(50, 200),
            'classifier__max_depth': randint(3, 20)
        }
    ),
    "LogisticRegression": (
        LogisticRegression(max_iter=1000),
        {
            'classifier__C': uniform(0.01, 10),
            'classifier__solver': ['liblinear', 'lbfgs']
        }
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            'classifier__n_neighbors': randint(3, 15)
        }
    ),
    "Perceptron": (
        Perceptron(max_iter=1000, random_state=42),
        {
            'classifier__penalty': [None, 'l1', 'l2'],
            'classifier__alpha': uniform(0.0001, 0.1)
        }
    )
}

# Cross-validation folds
cv_folds = [3, 5, 7]

# Loop through models and cross-validations
for model_name, (clf, param_dist) in models_params.items():
    for cv in cv_folds:
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('classifier', clf)
        ])

        random_search = RandomizedSearchCV(
            pipe,
            param_distributions=param_dist,
            n_iter=5,  # you can increase if param space is large
            cv=cv,
            random_state=42,
            n_jobs=-1
        )

        random_search.fit(X_train, y_train)

        print(f"\n🔍 Model: {model_name}, CV: {cv}-fold")
        print("Best Parameters:", random_search.best_params_)
        print(f"Training Score: {random_search.best_score_:.2f}")
        print(f"Test Score: {random_search.score(X_test, y_test):.2f}")



🔍 Model: RandomForest, CV: 3-fold
Best Parameters: {'classifier__max_depth': 9, 'classifier__n_estimators': 142}
Training Score: 0.88
Test Score: 0.90

🔍 Model: RandomForest, CV: 5-fold
Best Parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 70}
Training Score: 0.91
Test Score: 0.90

🔍 Model: RandomForest, CV: 7-fold
Best Parameters: {'classifier__max_depth': 9, 'classifier__n_estimators': 142}
Training Score: 0.89
Test Score: 0.90

🔍 Model: LogisticRegression, CV: 3-fold
Best Parameters: {'classifier__C': np.float64(1.844347898661638), 'classifier__solver': 'lbfgs'}
Training Score: 0.91
Test Score: 0.90

🔍 Model: LogisticRegression, CV: 5-fold
Best Parameters: {'classifier__C': np.float64(1.844347898661638), 'classifier__solver': 'lbfgs'}
Training Score: 0.92
Test Score: 0.90

🔍 Model: LogisticRegression, CV: 7-fold
Best Parameters: {'classifier__C': np.float64(1.844347898661638), 'classifier__solver': 'lbfgs'}
Training Score: 0.92
Test Score: 0.90

🔍 Model: KNN, C

Check for 3 fold, 5 fold and 7 fold cross validation

Replace classifier, SVC with RandomForestClassifier and LogisticRegression, Perceptron, knn .

Update the param_grid accordingly (e.g., for RandomForestClassifier, use n_estimators, max_depth, etc.)

Also replace Gridsearch with randomnsearch function.

Relplace with with your own csv dataset using code below:

In [3]:
from google.colab import files
uploaded = files.upload()


Saving CC_GENERAL.csv to CC_GENERAL.csv


In [4]:
import pandas as pd
data = pd.read_csv("CC_GENERAL.csv")

In [5]:
data.head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


In [6]:
print(data[data.columns[-1]].value_counts())  # Look at last column


TENURE
12    7584
11     365
10     236
6      204
8      196
7      190
9      175
Name: count, dtype: int64


In [7]:
import pandas as pd

data = pd.read_csv("CC_GENERAL.csv")
X = data.drop("TENURE", axis=1)
y = data["TENURE"]


In [8]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd


In [9]:
# Step 1: Convert 'Yes'/'No' to 1/0 for the target
data['TENURE'] = data['TENURE'].map({'Yes': 1, 'No': 0})

# Step 2: One-hot encode categorical features
data_encoded = pd.get_dummies(data.drop('TENURE', axis=1))

# Step 3: Final feature and target split
X = data_encoded
y = data['TENURE']


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier

# Load your dataset
data = pd.read_csv("CC_GENERAL.csv")  # Adjust path if needed

# Remove rows with NaN in BALANCE (used for target creation)
data = data.dropna(subset=["BALANCE"])

# Create target column
median_balance = data["BALANCE"].median()
data["HighBalance"] = (data["BALANCE"] > median_balance).astype(int)

# Prepare feature matrix and fill remaining NaNs
X = data.drop(["CUST_ID", "HighBalance"], axis=1)
X = X.fillna(X.mean(numeric_only=True))

# Target
y = data["HighBalance"]

# Sanity check (optional)
print("X nulls:", X.isnull().sum().sum())
print("y nulls:", y.isnull().sum())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers and their param grids
models = {
    "RandomForest": {
        "classifier": RandomForestClassifier(),
        "param_grid": {
            "classifier__n_estimators": [50, 100, 150],
            "classifier__max_depth": [3, 5, 10, None]
        }
    },
    "LogisticRegression": {
        "classifier": LogisticRegression(max_iter=1000),
        "param_grid": {
            "classifier__C": [0.1, 1, 10],
            "classifier__solver": ['lbfgs', 'liblinear']
        }
    },
    "KNN": {
        "classifier": KNeighborsClassifier(),
        "param_grid": {
            "classifier__n_neighbors": [3, 5, 7, 9]
        }
    },
    "Perceptron": {
        "classifier": Perceptron(),
        "param_grid": {
            "classifier__penalty": [None, 'l2', 'l1', 'elasticnet'],
            "classifier__alpha": [0.0001, 0.001, 0.01]
        }
    }
}

# Try 3, 5, and 7-fold cross-validation
cv_values = [3, 5, 7]

# Loop through models and CV folds
for model_name, model_info in models.items():
    for cv in cv_values:
        print(f"\n🔍 Model: {model_name}, CV: {cv}-fold")

        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=3)),  # Optional PCA
            ('classifier', model_info['classifier'])
        ])

        search = RandomizedSearchCV(
            pipe,
            model_info['param_grid'],
            cv=cv,
            n_iter=5,
            random_state=42,
            verbose=1,
            n_jobs=-1
        )

        search.fit(X_train, y_train)

        print("Best Parameters:", search.best_params_)
        print(f"Training Score: {search.best_score_:.2f}")
        print(f"Test Score: {search.score(X_test, y_test):.2f}")


X nulls: 0
y nulls: 0

🔍 Model: RandomForest, CV: 3-fold
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'classifier__n_estimators': 150, 'classifier__max_depth': 10}
Training Score: 0.88
Test Score: 0.88

🔍 Model: RandomForest, CV: 5-fold
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'classifier__n_estimators': 150, 'classifier__max_depth': 10}
Training Score: 0.88
Test Score: 0.88

🔍 Model: RandomForest, CV: 7-fold
Fitting 7 folds for each of 5 candidates, totalling 35 fits
Best Parameters: {'classifier__n_estimators': 150, 'classifier__max_depth': 10}
Training Score: 0.88
Test Score: 0.88

🔍 Model: LogisticRegression, CV: 3-fold
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'classifier__solver': 'liblinear', 'classifier__C': 0.1}
Training Score: 0.84
Test Score: 0.83

🔍 Model: LogisticRegression, CV: 5-fold
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'class



Best Parameters: {'classifier__n_neighbors': 9}
Training Score: 0.87
Test Score: 0.87

🔍 Model: KNN, CV: 5-fold
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Parameters: {'classifier__n_neighbors': 9}
Training Score: 0.87
Test Score: 0.87

🔍 Model: KNN, CV: 7-fold
Fitting 7 folds for each of 4 candidates, totalling 28 fits




Best Parameters: {'classifier__n_neighbors': 9}
Training Score: 0.87
Test Score: 0.87

🔍 Model: Perceptron, CV: 3-fold
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Parameters: {'classifier__penalty': 'l1', 'classifier__alpha': 0.01}
Training Score: 0.81
Test Score: 0.79

🔍 Model: Perceptron, CV: 5-fold
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters: {'classifier__penalty': None, 'classifier__alpha': 0.0001}
Training Score: 0.81
Test Score: 0.81

🔍 Model: Perceptron, CV: 7-fold
Fitting 7 folds for each of 5 candidates, totalling 35 fits
Best Parameters: {'classifier__penalty': 'l1', 'classifier__alpha': 0.01}
Training Score: 0.83
Test Score: 0.79
