In [16]:
import pandas as pd

data = pd.read_csv('./breast-cancer.csv')


print(data)
print(data.info())


           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compa

In [17]:
# agar id colum coumn exit karta hai to remove kardo
if 'id' in data.columns:
    data.drop(columns=['id'], inplace=True)

# 'diagnosis' column ko  binary binary mein convert: M = 1, B = 0
data['diagnosis'] = data['diagnosis'].apply(lambda x: 1 if x == 'M' else 0)

# checking remainsg missing value
missing_values = data.isnull().sum()

missing_values


diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Spliting data into features and target
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((455, 30), (114, 30), (455,), (114,))

In [19]:
#KNN DATA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score


knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

knn_results = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, knn_model.predict_proba(X_test)[:, 1]),
    'cross_val_score': cross_val_score(knn_model, X, y, cv=5).mean()
}

knn_results_df = pd.DataFrame([knn_results], index=['KNN'])
print(knn_results_df)




     accuracy  precision    recall  f1_score   roc_auc  cross_val_score
KNN  0.947368   0.957447  0.918367    0.9375  0.985871         0.927946


In [20]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score

decision_tree_model = DecisionTreeClassifier(random_state=70)
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)

decision_tree_results = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, decision_tree_model.predict_proba(X_test)[:, 1]),
    'cross_val_score': cross_val_score(decision_tree_model, X, y, cv=5).mean()
}

decision_tree_results_df = pd.DataFrame([decision_tree_results], index=['Decision Tree'])
print(decision_tree_results_df)




               accuracy  precision    recall  f1_score   roc_auc  \
Decision Tree  0.938596       0.92  0.938776  0.929293  0.938619   

               cross_val_score  
Decision Tree         0.919143  


In [21]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score

random_forest_model = RandomForestClassifier(random_state=70)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

random_forest_results = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, random_forest_model.predict_proba(X_test)[:, 1]),
    'cross_val_score': cross_val_score(random_forest_model, X, y, cv=5).mean()
}

random_forest_results_df = pd.DataFrame([random_forest_results], index=['Random Forest'])
print(random_forest_results_df)




               accuracy  precision    recall  f1_score   roc_auc  \
Random Forest   0.95614   0.958333  0.938776  0.948454  0.993878   

               cross_val_score  
Random Forest         0.963127  


In [22]:
#Cascading Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score
import numpy as np


knn_model = KNeighborsClassifier()


knn_model.fit(X_train, y_train)
knn_train_preds = knn_model.predict(X_train)
knn_test_preds = knn_model.predict(X_test)

# KNN predecition ko as a feature add kiya hai
X_train_with_knn = np.hstack((X_train, knn_train_preds.reshape(-1, 1)))
X_test_with_knn = np.hstack((X_test, knn_test_preds.reshape(-1, 1)))

# Random Forest model training
random_forest_model = RandomForestClassifier(random_state=70)
random_forest_model.fit(X_train_with_knn, y_train)

# Making predictions with Random Forest model
y_pred = random_forest_model.predict(X_test_with_knn)

cascading_results = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred),
    'recall': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, random_forest_model.predict_proba(X_test_with_knn)[:, 1]),
    'cross_val_score': cross_val_score(random_forest_model, np.hstack((X, knn_model.predict(X).reshape(-1, 1))), y, cv=5).mean()
}

cascading_results_df = pd.DataFrame([cascading_results], index=['Cascading Classifier'])
print(cascading_results_df)




                      accuracy  precision    recall  f1_score   roc_auc  \
Cascading Classifier  0.947368   0.957447  0.918367    0.9375  0.987755   

                      cross_val_score  
Cascading Classifier         0.952569  
