Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Import models here
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Without cross validation
1. Train Test Split
```python
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify = Y, random_state=3)
print(X.shape, X_train.shape, X_test.shape)
```
`stratify = Y` keeps the class proportions of `Y` the same in the training and test sets. Without it, a random split can leave the classes unevenly represented.
2. Create List of models
```python
# list of models
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]
```
3. Create the compare function to compare accuracy 
```python
def compare_models_train_test():
    """Fit each estimator in ``models`` on the training split and print its test accuracy.

    Reads the module-level names ``models``, ``X_train``, ``Y_train``,
    ``X_test`` and ``Y_test``. Prints one line per estimator and returns nothing.
    """
    for estimator in models:
        # learn from the training split only
        estimator.fit(X_train, Y_train)

        # score the predictions made on the held-out split
        held_out_accuracy = accuracy_score(Y_test, estimator.predict(X_test))

        print('Accuracy score of the ', estimator, ' = ', held_out_accuracy)
compare_models_train_test()
```

In [2]:
# Hold out 20% of the rows as a test set (test_size=0.2). stratify = Y asks the
# splitter to keep Y's class proportions in both parts, and random_state=3 makes
# the shuffle repeatable.
# NOTE(review): X and Y are not defined in this section (the recorded output is
# a NameError) - presumably the feature matrix and labels from an earlier cell.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify = Y, random_state=3)
# Sanity check: shapes of the full data, the training part and the test part
print(X.shape, X_train.shape, X_test.shape)

# list of models: the candidate classifiers compared by compare_models_train_test()
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

def compare_models_train_test():
    """Fit each estimator in ``models`` on the training split and print its test accuracy.

    Reads the module-level names ``models``, ``X_train``, ``Y_train``,
    ``X_test`` and ``Y_test``. Prints one line per estimator and returns nothing.
    """
    for estimator in models:
        # learn from the training split only
        estimator.fit(X_train, Y_train)

        # score the predictions made on the held-out split
        held_out_accuracy = accuracy_score(Y_test, estimator.predict(X_test))

        print('Accuracy score of the ', estimator, ' = ', held_out_accuracy)

compare_models_train_test()

NameError: name 'X' is not defined

## Cross Validation
### Cross Val Score
Use cross-validation score when you want a single performance metric that represents the average performance of your model across multiple splits of the data. Useful for a quick evaluation of the model's generalization performance.

1. Create List of Models
2. Compare Models by cross validation
    - We pass the model, X, Y, and the number of folds (`cv`)

In [None]:
# list of models: fresh, unfitted candidates scored by compare_models_cross_validation()
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

def compare_models_cross_validation():
    """Print the 5-fold cross-validation scores and mean accuracy (%) of each model.

    Reads the module-level names ``models``, ``X`` and ``Y``. For every
    estimator it prints the per-fold scores, the mean score as a percentage
    rounded to two decimals, and a separator line. Returns nothing.
    """
    for estimator in models:
        fold_scores = cross_val_score(estimator, X, Y, cv=5)

        # average the fold scores, scale to percent, keep two decimals
        mean_percent = round(sum(fold_scores) / len(fold_scores) * 100, 2)

        print('Cross Validation accuracies for ', estimator, '=  ', fold_scores)
        print('Accuracy % of the ', estimator, mean_percent)
        print('----------------------------------------------')

compare_models_cross_validation()

### K Fold
- Same idea as `cross_val_score`, but here we write the per-fold training and testing loop ourselves.

In [None]:
# KFold produces the train/test index arrays for each fold
from sklearn.model_selection import KFold

# list of models to compare
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

# Helper: positional row selection for NumPy arrays and pandas objects alike
def _select_rows(data, positions):
    """Return the rows of ``data`` at the integer ``positions``.

    pandas objects are indexed through ``.iloc`` because plain
    ``data[positions]`` is label-based there (column labels for a DataFrame,
    index labels for a Series). Anything else is indexed directly.
    """
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


# Function to compare models using K-Fold Cross-Validation
def compare_models_cross_validation(X, y, n_splits=5):
    """Print the average K-Fold accuracy of every estimator in ``models``.

    Args:
        X: Feature rows; a NumPy array or a pandas DataFrame.
        y: Labels aligned with ``X``; a NumPy array or a pandas Series.
        n_splits: Number of folds (default 5).

    Reads the module-level ``models`` list. Each estimator is re-fitted on
    every fold's training rows and scored on that fold's test rows. Prints the
    estimator class name and its mean accuracy; returns nothing.
    """
    # Shuffled folds with a fixed seed so every model sees the same splits
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for model in models:
        print(f"\n{model.__class__.__name__}:")

        accuracies = []

        for train_index, test_index in kf.split(X):
            # kf.split yields integer positions, so select rows positionally
            X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
            y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)

            # Re-fit the same estimator object on this fold's training rows
            model.fit(X_train, y_train)

            # Score the predictions on this fold's held-out rows
            y_pred = model.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))

        # Mean accuracy across folds
        avg_accuracy = sum(accuracies) / len(accuracies)
        print(f"Average Accuracy: {avg_accuracy:.4f}")

# Example usage: the training labels produced by train_test_split are named Y_train
compare_models_cross_validation(X_train, Y_train)

### Stratified K Fold
Better for imbalanced datasets: each fold keeps the class proportions of the full dataset.

In [None]:
from sklearn.linear_model import LogisticRegression

# List of models: fresh, unfitted candidates scored by compare_models_stratified_kfold()
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

# Helper: positional row selection for NumPy arrays and pandas objects alike
def _select_rows(data, positions):
    """Return the rows of ``data`` at the integer ``positions``.

    pandas objects are indexed through ``.iloc`` because plain
    ``data[positions]`` is label-based there (column labels for a DataFrame,
    index labels for a Series). Anything else is indexed directly.
    """
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return data[positions]


# Function to compare models using Stratified K-Fold Cross-Validation
def compare_models_stratified_kfold(X, y, n_splits=5):
    """Print the average Stratified K-Fold accuracy of every estimator in ``models``.

    Args:
        X: Feature rows; a NumPy array or a pandas DataFrame.
        y: Class labels aligned with ``X``; a NumPy array or a pandas Series.
            They are also passed to the splitter to build the folds.
        n_splits: Number of folds (default 5).

    Reads the module-level ``models`` list. Each estimator is re-fitted on
    every fold's training rows and scored on that fold's test rows. Prints the
    estimator class name and its mean accuracy; returns nothing.
    """
    # Imported here because no other import in this file provides StratifiedKFold
    from sklearn.model_selection import StratifiedKFold

    # Shuffled stratified folds with a fixed seed so every model sees the same splits
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for model in models:
        print(f"\n{model.__class__.__name__}:")

        accuracies = []

        for train_index, test_index in skf.split(X, y):
            # skf.split yields integer positions, so select rows positionally
            X_train, X_test = _select_rows(X, train_index), _select_rows(X, test_index)
            y_train, y_test = _select_rows(y, train_index), _select_rows(y, test_index)

            # Re-fit the same estimator object on this fold's training rows
            model.fit(X_train, y_train)

            # Score the predictions on this fold's held-out rows
            y_pred = model.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))

        # Mean accuracy across folds
        avg_accuracy = sum(accuracies) / len(accuracies)
        print(f"Average Accuracy: {avg_accuracy:.4f}")

# Example usage: the training labels produced by train_test_split are named Y_train
compare_models_stratified_kfold(X_train, Y_train)