In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Walk the read-only input tree. The per-file listing is switched off to keep
    # the cell output short; re-enable the print below to see every path.
    for filename in filenames:
        # print(os.path.join(dirname, filename))
        continue

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install joblib

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import time
import joblib
import cv2

In [None]:
import os
import cv2
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from skimage.feature import hog

In [None]:
# Class sub-folder names; a name's position in this list is the integer label of its images.
classes = ["Cyst", "Normal", "Stone", "Tumor"]
# Root folder that holds one sub-folder of images per class.
dataset_dir = "/kaggle/input/ct-kidney-dataset-normal-cyst-tumor-and-stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone/CT-KIDNEY-DATASET-Normal-Cyst-Tumor-Stone"

In [None]:
# Function to load images and extract features
def load_images_and_features(dataset_dir, classes, image_size=(128, 128)):
    """Read every image under each class folder and return its HOG descriptor.

    Parameters
    ----------
    dataset_dir : str
        Directory containing one sub-directory per entry of ``classes``.
    classes : sequence of str
        Sub-directory names. The index of a name in this sequence is the
        integer label assigned to every image found in that sub-directory.
    image_size : tuple of int, optional
        Size passed to ``cv2.resize`` before feature extraction.
        Defaults to ``(128, 128)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``: one HOG descriptor and one integer label per
        image that could be read. Both arrays are empty when no image is found.

    Notes
    -----
    Files that OpenCV cannot decode are reported and skipped instead of
    aborting the whole load.
    """
    features = []
    labels = []
    for label, class_name in enumerate(classes):
        class_dir = os.path.join(dataset_dir, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the sample
        # order (and therefore any seeded train/test split) identical between runs.
        for filename in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable/non-image file by returning None.
                print(f"Skipping unreadable image: {img_path}")
                continue
            img = cv2.resize(img, image_size)  # Resize for consistency

            # Only the descriptor is needed, so the (costly) HOG visualisation
            # image is not requested.
            descriptor = hog(img, orientations=8, pixels_per_cell=(16, 16),
                             cells_per_block=(1, 1))

            features.append(descriptor)
            labels.append(label)

    return np.array(features), np.array(labels)

In [None]:
# Build the feature matrix X and the integer label vector y for the whole dataset
X, y = load_images_and_features(dataset_dir=dataset_dir, classes=classes)

In [None]:
# Hold out 10% of the samples for testing. The split is seeded so it is repeatable and
# stratified on the labels so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Baseline classifiers, keyed by the display name used in reports and output file names.
# Every estimator that accepts a seed gets the same random_state (the value also used for
# the train/test split) so repeated runs produce the same models and metrics.
# KNeighborsClassifier and GaussianNB take no seed.
RANDOM_STATE = 42
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "SVM": SVC(kernel='linear', probability=True, random_state=RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, random_state=RANDOM_STATE)
}

In [None]:
# Column layout of the results table / CSV export
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score", "Training Time (s)"]
# Integer labels in class order; passed to confusion_matrix so the matrix always has one
# row/column per class, even if a class is missing from both y_test and the predictions.
label_ids = list(range(len(classes)))
# One dict per model; the DataFrame is built once after the loop instead of being
# re-concatenated on every iteration.
result_rows = []

# Train, evaluate each classifier, and save results
for name, clf in classifiers.items():
    start_time = time.time()
    clf.fit(X_train, y_train)
    training_time = time.time() - start_time

    print(f"Total Training time for {name}: ", training_time)

    y_pred = clf.predict(X_test)

    # Save the trained model
    model_filename = f"{name.replace(' ', '_')}_model.joblib"
    joblib.dump(clf, model_filename)

    # Calculate metrics (precision/recall/F1 are the unweighted macro averages)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_avg = report['macro avg']
    precision = macro_avg['precision']
    recall = macro_avg['recall']
    f1 = macro_avg['f1-score']

    result_rows.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Training Time (s)": training_time,
    })

    # Print report
    print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Training Time: {training_time:.2f} seconds")

    # Confusion Matrix, written to disk rather than displayed
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_title(f'Confusion Matrix for {name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    fig.savefig(f"confusion_matrix_{name}.png")
    plt.close(fig)  # release the figure so the loop does not accumulate open figures

results_df = pd.DataFrame(result_rows, columns=result_columns)

# Save results to CSV (starts the file afresh for this run)
results_df.to_csv('model_evaluation_results.csv', index=False)

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyperparameter grids for different classifiers, keyed by the same display names as
# the `classifiers` dict so each grid can be looked up with param_grids[name].
#
# 'auto' is intentionally absent from the max_features lists of the tree ensembles:
# it was removed in scikit-learn 1.3 (candidates using it fail to fit) and, for
# classifiers, it was only an alias of 'sqrt'.
param_grids = {
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "SVM": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    "Naive Bayes": {
        # Four values spaced logarithmically from 1 down to 1e-9
        'var_smoothing': np.logspace(0, -9, num=4)
    },
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30, 40],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    "Logistic Regression": {
        'C': [0.1, 1, 10, 100],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [100, 200, 300]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },
    "Extra Trees": {
        'n_estimators': [50, 100, 200],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
}


In [None]:
# Column layout of the results table / CSV export
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score", "Training Time (s)"]
# Integer labels in class order; passed to confusion_matrix so the matrix always has one
# row/column per class, even if a class is missing from both y_test and the predictions.
label_ids = list(range(len(classes)))
# One dict per tuned model; the DataFrame is built once after the loop.
tuned_rows = []

# Train, evaluate each classifier with hyperparameter tuning, and save results
for name, clf in classifiers.items():
    tuned_name = f"{name}_Hyperparameter_Tuning"
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grids[name], cv=3, n_jobs=-1, verbose=2)

    # The recorded time covers the whole grid search, not a single fit
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    best_clf = grid_search.best_estimator_
    training_time = time.time() - start_time

    print(f"Total Training time for {name}_Hyperparameter_Tuning: {training_time:.2f} seconds")
    print(f"Best parameters for {name}: {grid_search.best_params_}")

    y_pred = best_clf.predict(X_test)

    # Save the trained model
    model_filename = f"{name.replace(' ', '_')}_model_Hyperparameter_Tuning.joblib"
    joblib.dump(best_clf, model_filename)

    # Calculate metrics (precision/recall/F1 are the unweighted macro averages)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    macro_avg = report['macro avg']
    precision = macro_avg['precision']
    recall = macro_avg['recall']
    f1 = macro_avg['f1-score']

    tuned_rows.append({
        "Model": tuned_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "Training Time (s)": training_time,
    })

    # Print report
    print(f"{name}_Hyperparameter_Tuning - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}, Training Time: {training_time:.2f} seconds")

    # Confusion Matrix; the title carries the tuning suffix so the saved image can be
    # told apart from the baseline plot of the same model.
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_title(f'Confusion Matrix for {tuned_name}')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    fig.savefig(f"confusion_matrix_{name}_Hyperparameter_Tuning.png")
    plt.close(fig)

results_df = pd.DataFrame(tuned_rows, columns=result_columns)

# Save results to CSV. The baseline evaluation writes to the same file, so the tuned rows
# are appended to what is already there instead of overwriting it. Rows for the same model
# names left over from an earlier run of this cell are replaced, which keeps re-runs
# from piling up duplicates.
csv_file = 'model_evaluation_results.csv'
frames_to_write = [results_df]
if os.path.exists(csv_file):
    previous_results = pd.read_csv(csv_file)
    previous_results = previous_results[~previous_results["Model"].isin(results_df["Model"])]
    if not previous_results.empty:
        frames_to_write.insert(0, previous_results)
pd.concat(frames_to_write, ignore_index=True).to_csv(csv_file, index=False)
    

In [None]:
# from sklearn.ensemble import VotingClassifier
# import joblib

# # List of your model names
# model_names = ["Random_Forest", "SVM", "K-Nearest_Neighbors", "Naive_Bayes", "Decision_Tree", "Logistic_Regression", "Gradient_Boosting", "AdaBoost", "Extra_Trees"]

# # Load the trained models
# classifiers = {name: joblib.load(f"{name}_model.joblib") for name in model_names}

# # Create ensemble classifiers
# classifiers_for_voting = [(name, clf) for name, clf in classifiers.items()]

# # Hard Voting Classifier
# hard_voting_clf = VotingClassifier(estimators=classifiers_for_voting, voting='hard')
# hard_voting_clf.fit(X_train, y_train)
# y_pred_hard = hard_voting_clf.predict(X_test)

# # Evaluate Hard Voting Classifier
# hard_accuracy = accuracy_score(y_test, y_pred_hard)
# print("Hard Voting Classifier Accuracy:", hard_accuracy)

# # Soft Voting Classifier
# # Note: Ensure all classifiers support predict_proba for soft voting
# soft_voting_clf = VotingClassifier(estimators=classifiers_for_voting, voting='soft')
# soft_voting_clf.fit(X_train, y_train)
# y_pred_soft = soft_voting_clf.predict(X_test)

# # Evaluate Soft Voting Classifier
# soft_accuracy = accuracy_score(y_test, y_pred_soft)
# print("Soft Voting Classifier Accuracy:", soft_accuracy)


In [None]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Load the trained baseline models saved earlier as "<name>_model.joblib".
# NOTE: joblib.load unpickles arbitrary objects - only load files written by this notebook.
model_names = ["Random_Forest", "SVM", "K-Nearest_Neighbors", "Naive_Bayes", "Decision_Tree", "Logistic_Regression", "Gradient_Boosting", "AdaBoost", "Extra_Trees"]

# (name, estimator) pairs for the ensembles. The loaded models are kept in their own
# list rather than rebinding the global `classifiers` dict: that dict is keyed by the
# space-separated display names that `param_grids` uses, and replacing it with these
# underscore names would break a re-run of the tuning cell.
classifiers_for_voting = [(name, joblib.load(f"{name}_model.joblib")) for name in model_names]

# Column layout of the results table / CSV export
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score", "Training Time (s)"]
voting_rows = []
voting_models = {}
voting_predictions = {}

# Hard voting takes the majority class label; soft voting averages predicted probabilities.
# Both ensembles are fitted and scored the same way, so they share one loop.
for display_name, voting_mode in (("Hard Voting", "hard"), ("Soft Voting", "soft")):
    ensemble = VotingClassifier(estimators=classifiers_for_voting, voting=voting_mode)

    start_time = time.time()
    ensemble.fit(X_train, y_train)
    training_time = time.time() - start_time

    predictions = ensemble.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)

    voting_rows.append({
        "Model": display_name,
        "Accuracy": report['accuracy'],
        "Precision": report['macro avg']['precision'],
        "Recall": report['macro avg']['recall'],
        "F1-Score": report['macro avg']['f1-score'],
        "Training Time (s)": training_time,
    })
    voting_models[voting_mode] = ensemble
    voting_predictions[voting_mode] = predictions

# Keep the individual names available for later cells
hard_voting_clf, soft_voting_clf = voting_models["hard"], voting_models["soft"]
y_pred_hard, y_pred_soft = voting_predictions["hard"], voting_predictions["soft"]

# DataFrame with one row per voting ensemble
results_df = pd.DataFrame(voting_rows, columns=result_columns)

# Save results to CSV
results_df.to_csv('voting_classifier_results.csv', index=False)

# Optionally, print the results
print(results_df)


In [None]:
# Append the current results table to the shared metrics CSV, creating the file
# if it does not exist yet.
csv_file = 'model_evaluation_results.csv'
if os.path.exists(csv_file):
    previous_results = pd.read_csv(csv_file)
    merged_results = pd.concat([previous_results, results_df])
    merged_results.to_csv(csv_file, index=False)
else:
    results_df.to_csv(csv_file, index=False)
    

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

# Define base learners (seeded where the estimator accepts a seed)
base_learners = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('svc', SVC(probability=True, random_state=42))
]

# Define meta-learner; seeded like the base learners so the stacked model is repeatable
meta_learner = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Stacking Classifier: the meta-learner is trained on 5-fold cross-validated
# predictions of the base learners
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)
start_time = time.time()
stacking_clf.fit(X_train, y_train)
training_time = time.time() - start_time

# Make predictions and evaluate the model
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Stacking Model Accuracy: {accuracy:.4f}')

# Store results of the stacking model (precision/recall/F1 are macro averages)
stacking_report = classification_report(y_test, y_pred, output_dict=True)
stacking_df = pd.DataFrame({
    "Model": ["Stacking"],
    "Accuracy": [stacking_report['accuracy']],
    "Precision": [stacking_report['macro avg']['precision']],
    "Recall": [stacking_report['macro avg']['recall']],
    "F1-Score": [stacking_report['macro avg']['f1-score']],
    "Training Time (s)": [training_time]
})

# Save metrics to CSV file: append to the shared results file, or create it if missing
csv_file = 'model_evaluation_results.csv'
if not os.path.exists(csv_file):
    stacking_df.to_csv(csv_file, index=False)
else:
    pd.concat([pd.read_csv(csv_file), stacking_df]).to_csv(csv_file, index=False)
    