## Author: Swaroop Srisailam

In [61]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [62]:
df = pd.DataFrame(load_breast_cancer()['data'], 
 columns=load_breast_cancer()['feature_names']) 
df['y'] = load_breast_cancer()['target']

In [63]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,y
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [64]:
df.shape

(569, 31)

### Checking if there any missing values in the data

In [65]:
missing_values = df.isnull().sum()
if missing_values.any():
    print("Some columns have missing values.")
    print(missing_values)
else:
    print("No columns have missing values.")

No columns have missing values.


### checking if there are any catogorical columns

In [93]:
data_types = df.dtypes

categorical_columns = data_types[data_types == 'object'].index.tolist()

if categorical_columns:
    print("Categorical columns found:")
    print(categorical_columns)
else:
    print("No categorical columns found.")


No categorical columns found.


### checking if there is any class imbalance in the data

In [66]:
class_distribution = df['y'].value_counts()

# Check if there is a class imbalance
if len(class_distribution) == 2:
    class_0_count = class_distribution[0]
    class_1_count = class_distribution[1]
    total_samples = len(df)

    # Calculate the percentage of each class
    class_0_percentage = (class_0_count / total_samples) * 100
    class_1_percentage = (class_1_count / total_samples) * 100

    print("Class 0 count:", class_0_count)
    print("Class 1 count:", class_1_count)

Class 0 count: 212
Class 1 count: 357


In [67]:
X = df.drop('y', axis=1) 
y = df['y']

### oversampling of the data to avoid class imbalance

In [69]:
smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X, y)

class_distribution = pd.Series(y_resampled).value_counts()
print("Class distribution after SMOTE:")
print(class_distribution)

Class distribution after SMOTE:
y
0    357
1    357
Name: count, dtype: int64


In [70]:
X_resampled.shape

(714, 30)

### Feature Scaling

In [100]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X_resampled)

### Spliting the data

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42)

## Decision Tree

In [102]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)
decision_tree_accuracy = decision_tree.score(X_test, y_test)

## Random Forest

In [103]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)
random_forest_accuracy = random_forest.score(X_test, y_test)

## Gradient Boosting Method

In [104]:
from sklearn.ensemble import GradientBoostingClassifier

gradient_boosting = GradientBoostingClassifier(random_state=42)
gradient_boosting.fit(X_train, y_train)
gradient_boosting_accuracy = gradient_boosting.score(X_test, y_test)

## K-Nearest Neighbour

In [105]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
knn_accuracy = knn.score(X_test, y_test)

In [109]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 50, 100],
    'min_samples_leaf': [10, 20, 50]
}

grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_decision_tree = grid_search.best_estimator_
best_decision_tree_accuracy = best_decision_tree.score(X_test, y_test)

In [111]:
results = pd.DataFrame({
    'Model': ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'K-Nearest Neighbors (KNN)', 'Decision Tree(fine tuned)'],
    'Accuracy': [decision_tree_accuracy, random_forest_accuracy, gradient_boosting_accuracy, knn_accuracy, best_decision_tree_accuracy]
})

print(results)

                       Model  Accuracy
0              Decision Tree  0.923077
1              Random Forest  0.972028
2          Gradient Boosting  0.972028
3  K-Nearest Neighbors (KNN)  0.937063
4  Decision Tree(fine tuned)  0.944056
