<a href="https://colab.research.google.com/github/PrakashDSdeveloper/Adenocarcinoma_Cancer_Image_Classification_using-MLops-and-DVC/blob/main/other%20ways%20to%20improve.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install shap


Collecting shap
  Downloading shap-0.44.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (535 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m535.7/535.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.44.1 slicer-0.0.7


In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold

In [8]:
# Load dataset
# The CSV location is kept in one named constant so it is easy to spot and change.
DATA_PATH = '/content/creditcard.csv'
data = pd.read_csv(DATA_PATH)

In [10]:
# Count the missing values in every column (isna is the same check as isnull).
# NOTE(review): the recorded output shows exactly one missing value in each of V5..Class,
# which looks like a single truncated row — confirm the CSV was uploaded completely.
data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        1
V6        1
V7        1
V8        1
V9        1
V10       1
V11       1
V12       1
V13       1
V14       1
V15       1
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [11]:
# Discard every row that has at least one missing value; `data` is rebound to the cleaned frame.
data = data.dropna()

In [12]:
# Target label: the 'Class' column.
# NOTE(review): 'Class' contained a missing value at load time, so its dtype is probably
# float rather than int — confirm whether an integer cast is wanted downstream.
y = data['Class']
# Predictors: every remaining column.
X = data.drop(columns='Class')

In [None]:
# Hyperparameter Tuning
# Random Forest
# Candidate values per hyperparameter: 3 * 4 * 3 * 3 = 108 combinations, each fitted once per fold.
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
# An integer `cv` yields unshuffled stratified folds. Build the splitter explicitly so the
# search uses the same shuffled, seeded 5-fold protocol as the cross-validation cell below.
cv_rf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=cv_rf, scoring='f1', n_jobs=-1)
grid_search_rf.fit(X, y)
# Winning combination, reused later to build the evaluated Random Forest.
best_params_rf = grid_search_rf.best_params_

In [None]:
# Gradient Boosting
# Candidate values per hyperparameter: 3 * 3 * 3 = 27 combinations, each fitted once per fold.
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 10]
}
gb = GradientBoostingClassifier(random_state=42)
# An integer `cv` yields unshuffled stratified folds. Build the splitter explicitly so the
# search uses the same shuffled, seeded 5-fold protocol as the cross-validation cell below.
cv_gb = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search_gb = GridSearchCV(gb, param_grid_gb, cv=cv_gb, scoring='f1', n_jobs=-1)
grid_search_gb.fit(X, y)
# Winning combination, reused later to build the evaluated Gradient Boosting model.
best_params_gb = grid_search_gb.best_params_

In [None]:
# Cross-Validation
# Shared splitter for the evaluation cell: five stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Sampling Techniques
# Create the four oversamplers up front, all with the same seed.
smote = SMOTE(random_state=42)                       # SMOTE
adasyn = ADASYN(random_state=42)                     # ADASYN
borderline_smote = BorderlineSMOTE(random_state=42)  # Borderline-SMOTE
ros = RandomOverSampler(random_state=42)             # Random Oversampling

# Apply each sampler to the full (X, y); every call yields a new (features, labels) pair.
# NOTE(review): these sets are built from the entire dataset, so cross-validating on them
# scores folds that contain resampled rows — confirm that is intended.
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)
X_resampled_borderline_smote, y_resampled_borderline_smote = borderline_smote.fit_resample(X, y)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X, y)

In [None]:
# Model Evaluation Metrics
# Each oversampler sits inside an imblearn Pipeline and is cross-validated on the raw (X, y).
# The pipeline resamples only the training part of every split, so each validation fold holds
# real, untouched rows. Cross-validating data that was oversampled *before* splitting puts
# synthetic or duplicated minority rows on both sides of the split and inflates the F1-score.
# cross_val_score fits clones, so the estimator objects defined here stay unfitted.

# Random Forest with best parameters
rf_clf = RandomForestClassifier(**best_params_rf, random_state=42)
rf_smote_pipeline = Pipeline([('sampler', SMOTE(random_state=42)), ('model', rf_clf)])
scores_rf = cross_val_score(rf_smote_pipeline, X, y, cv=skf, scoring='f1')
print("Random Forest - SMOTE:")
print("Cross-Validation F1-score:", scores_rf.mean())

# Gradient Boosting with best parameters
gb_clf = GradientBoostingClassifier(**best_params_gb, random_state=42)
gb_smote_pipeline = Pipeline([('sampler', SMOTE(random_state=42)), ('model', gb_clf)])
scores_gb = cross_val_score(gb_smote_pipeline, X, y, cv=skf, scoring='f1')
print("Gradient Boosting - SMOTE:")
print("Cross-Validation F1-score:", scores_gb.mean())


# Ensemble Methods - Balanced Random Forest
brf_clf = BalancedRandomForestClassifier(random_state=42)
brf_ros_pipeline = Pipeline([('sampler', RandomOverSampler(random_state=42)), ('model', brf_clf)])
scores_brf = cross_val_score(brf_ros_pipeline, X, y, cv=skf, scoring='f1')
print("Balanced Random Forest - Random Oversampling:")
print("Cross-Validation F1-score:", scores_brf.mean())

# Ensemble Methods - EasyEnsemble
# No external sampler here: the classifier is scored directly on the original data.
ee_clf = EasyEnsembleClassifier(random_state=42)
scores_ee = cross_val_score(ee_clf, X, y, cv=skf, scoring='f1')
print("EasyEnsemble:")
print("Cross-Validation F1-score:", scores_ee.mean())

In [None]:

# Model Interpretability - SHAP
# cross_val_score only fits clones, so rf_clf has not been trained yet.
# TreeExplainer needs a fitted forest, so fit it before explaining.
rf_clf.fit(X_resampled_smote, y_resampled_smote)

# Explaining every resampled row is very slow for a forest, so a seeded random sample keeps
# the cell tractable and reproducible. Raise SHAP_SAMPLE_SIZE for a smoother estimate.
# Assumes X_resampled_smote is a DataFrame, as the sampler was given one — verify if X changes type.
SHAP_SAMPLE_SIZE = 1000
X_explain = X_resampled_smote.sample(
    n=min(SHAP_SAMPLE_SIZE, len(X_resampled_smote)),
    random_state=42,
)

explainer = shap.TreeExplainer(rf_clf)
shap_values = explainer.shap_values(X_explain)

# Depending on the shap release, a classifier yields either a list with one array per class
# or a single (rows, features, classes) array. Keep only the positive class (label 1) either way.
positive_idx = list(rf_clf.classes_).index(1)
if isinstance(shap_values, list):
    shap_values_positive = shap_values[positive_idx]
elif np.ndim(shap_values) == 3:
    shap_values_positive = shap_values[:, :, positive_idx]
else:
    shap_values_positive = shap_values

# Plot SHAP summary plot
shap.summary_plot(shap_values_positive, X_explain, plot_type="bar")