# IMPORT THE NECCESESARY LIBRARIES

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import BaggingClassifier

# READ THE DATASET

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/KK_ASSIGNMENT/breast-cancer-wisconsin.data', header = None)

In [None]:
dataset.columns = ['Id', 'Clump_thickness', 'Uniformity_cell_size', 'Uniformity_cell_shape', 'Marginal_adhesion', 'Single_e_cell_size', 'Bare_nuclei', 'Bland_chromatin', 'Normal_nucleoli', 'Mitoses', 'Class']

In [None]:
dataset.head()

Unnamed: 0,Id,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# CHECKING FOR NULL VALUE

In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Id                     699 non-null    int64 
 1   Clump_thickness        699 non-null    int64 
 2   Uniformity_cell_size   699 non-null    int64 
 3   Uniformity_cell_shape  699 non-null    int64 
 4   Marginal_adhesion      699 non-null    int64 
 5   Single_e_cell_size     699 non-null    int64 
 6   Bare_nuclei            699 non-null    object
 7   Bland_chromatin        699 non-null    int64 
 8   Normal_nucleoli        699 non-null    int64 
 9   Mitoses                699 non-null    int64 
 10  Class                  699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [None]:
for i in dataset.columns:
    print(i, dataset[i].isna().all())

Id False
Clump_thickness False
Uniformity_cell_size False
Uniformity_cell_shape False
Marginal_adhesion False
Single_e_cell_size False
Bare_nuclei False
Bland_chromatin False
Normal_nucleoli False
Mitoses False
Class False


#CHECK BARE_NUCLEI COLUMN
#Berdasarkan jurnal acuan terdapat missing value di kolom Bare_Nuclei yang setelah dicek berisi value '?'

In [None]:
dataset['Bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [None]:
data_fresh = dataset.loc[dataset["Bare_nuclei"] != '?']

In [None]:
data_fresh['Bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '5', '8', '6'], dtype=object)

In [None]:
data_fresh['Bare_nuclei']=data_fresh['Bare_nuclei'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fresh['Bare_nuclei']=data_fresh['Bare_nuclei'].astype('int64')


In [None]:
data_fresh.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Id                     683 non-null    int64
 1   Clump_thickness        683 non-null    int64
 2   Uniformity_cell_size   683 non-null    int64
 3   Uniformity_cell_shape  683 non-null    int64
 4   Marginal_adhesion      683 non-null    int64
 5   Single_e_cell_size     683 non-null    int64
 6   Bare_nuclei            683 non-null    int64
 7   Bland_chromatin        683 non-null    int64
 8   Normal_nucleoli        683 non-null    int64
 9   Mitoses                683 non-null    int64
 10  Class                  683 non-null    int64
dtypes: int64(11)
memory usage: 64.0 KB


# SPLIT THE DATA INTO X_train, X_test, y_train, y_test

In [None]:
X = data_fresh.iloc[:, 1:-1]

In [None]:
X

Unnamed: 0,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bare_nuclei,Bland_chromatin,Normal_nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [None]:
new_X = X.values

In [None]:
new_X

array([[ 5,  1,  1, ...,  3,  1,  1],
       [ 5,  4,  4, ...,  3,  2,  1],
       [ 3,  1,  1, ...,  3,  1,  1],
       ...,
       [ 5, 10, 10, ...,  8, 10,  2],
       [ 4,  8,  6, ..., 10,  6,  1],
       [ 4,  8,  8, ..., 10,  4,  1]])

In [None]:
y = data_fresh.iloc[:,-1]

In [None]:
new_y=y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(new_X,new_y, test_size=0.3, random_state=123)

In [None]:
X_train

array([[ 5, 10, 10, ...,  8, 10,  2],
       [ 3,  1,  1, ...,  2,  1,  1],
       [ 6,  3,  2, ...,  4,  1,  1],
       ...,
       [ 3,  1,  1, ...,  3,  1,  1],
       [ 5,  3,  4, ...,  3,  1,  1],
       [ 3,  1,  1, ...,  1,  1,  1]])

In [None]:
model_1 = DecisionTreeClassifier(criterion='entropy')

# Particle Swarm Optimization Implementation

In [None]:
!pip install pyswarms


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pyswarms as ps

# Define objective function

def f_per_particle(m, alpha):
    
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float (default is 0.5)
        Constant weight for trading-off classifier performance
        and number of features

    Returns
    -------
    numpy.ndarray
        Computed objective function
    """
    total_features = X_train.shape[1]
    
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_train_subset = X_train
        X_test_subset = X_test
    else:
        X_train_subset = X_train[:,m==1]
        X_test_subset = X_test[:,m==1]
        
    # Perform classification and store performance in P
    
    model = model_1
    
    model.fit(X_train_subset, y_train)
    y_pred = model.predict(X_test_subset)
    
    P = accuracy_score(y_pred, y_test)
    
    # Compute for the objective function
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_train_subset.shape[1] / total_features)))

    return j

In [None]:
def f(x, alpha=0.92):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = x.shape[0]
    j = [f_per_particle(x[i], alpha) for i in range(n_particles)]
    return np.array(j)

In [None]:
# Initialize swarm, arbitrary
options = {'c1': 0.7, 'c2': 0.7, 'w':0.72, 'k': 50, 'p':2}

# Call instance of PSO
dimensions = 9 # dimensions should be the number of features
optimizer = ps.discrete.BinaryPSO(n_particles=50, dimensions=dimensions, options=options)
# Perform optimization
cost, pos = optimizer.optimize(f, iters=100, verbose=2)

2022-12-14 13:31:30,000 - pyswarms.discrete.binary - INFO - Optimize for 100 iters with {'c1': 0.7, 'c2': 0.7, 'w': 0.72, 'k': 50, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|100/100, best_cost=0.0224
2022-12-14 13:31:35,769 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.022352303523035244, best pos: [1 1 1 1 1 0 1 1 1]


In [None]:
optimizer.reset()

# Featue Selected by Particle Swarm Optimization

In [None]:
columns_just_pso = list()
print("Columns selected with just PSO:\n")

for x, y in zip(X.columns, pos):
    
    if y == 1:
        print(x)
        columns_just_pso.append(x)

Columns selected with just PSO:

Clump_thickness
Uniformity_cell_size
Uniformity_cell_shape
Marginal_adhesion
Single_e_cell_size
Bland_chromatin
Normal_nucleoli
Mitoses


# Make and split the data use the features that PSO selected

In [None]:
X_pso = data_fresh[['Clump_thickness','Uniformity_cell_size','Uniformity_cell_shape','Marginal_adhesion','Single_e_cell_size','Bland_chromatin','Normal_nucleoli','Mitoses']]

In [None]:
X_pso

Unnamed: 0,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bland_chromatin,Normal_nucleoli,Mitoses
0,5,1,1,1,2,3,1,1
1,5,4,4,5,7,3,2,1
2,3,1,1,1,2,3,1,1
3,6,8,8,1,3,3,7,1
4,4,1,1,3,2,3,1,1
...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,1,1,1
695,2,1,1,1,2,1,1,1
696,5,10,10,3,7,8,10,2
697,4,8,6,4,3,10,6,1


In [None]:
y_pso = data_fresh[["Class"]]

In [None]:
y_pso

Unnamed: 0,Class
0,2
1,2
2,2
3,2
4,2
...,...
694,2
695,2
696,4
697,4


In [None]:
X_train_pso, X_test_pso, y_train_pso, y_test_pso = train_test_split(X_pso, y_pso, test_size=0.2, random_state=1)

In [None]:
X_train_pso.head()

Unnamed: 0,Clump_thickness,Uniformity_cell_size,Uniformity_cell_shape,Marginal_adhesion,Single_e_cell_size,Bland_chromatin,Normal_nucleoli,Mitoses
575,5,1,2,1,2,3,1,1
440,10,4,3,10,4,10,1,1
123,5,3,5,1,8,5,3,1
51,5,3,3,4,2,3,4,1
318,1,1,1,1,5,3,1,1


# CHECK THE ACCURACY FOR DECISION TREE

In [None]:
model_1.fit(X_train_pso, y_train_pso)

DecisionTreeClassifier(criterion='entropy')

In [None]:
Y_pred = model_1.predict(X_test_pso) 

In [None]:
score_model_1 = accuracy_score(y_test_pso, Y_pred) 
recall_model_1 = recall_score(y_test_pso, Y_pred,average='macro')
precision_model_1 = precision_score(y_test_pso, Y_pred,average='macro')
f1_score_model_1 = f1_score(y_test_pso, Y_pred,average='macro')
print(f"\naccuracy value ={score_model_1:.2%}")
print(f"\nrecall value ={recall_model_1:.2%}")
print(f"\nprecision value ={precision_model_1:.2%}")
print(f"\nf1_score value ={f1_score_model_1:.2%}")



accuracy value =96.35%

recall value =95.70%

precision value =96.18%

f1_score value =95.93%


In [None]:
confusion_matrix(y_test_pso, Y_pred)

array([[88,  2],
       [ 3, 44]])

# CHECK THE ACCURACY FOR BAGGING DECISION TREE

In [None]:
bgclassifier = BaggingClassifier(base_estimator=model_1, n_estimators=100,
                                 random_state=3)

In [None]:
bgclassifier.fit(X_train_pso, y_train_pso)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy'),
                  n_estimators=100, random_state=3)

In [None]:
Y_pred_2 = bgclassifier.predict(X_test_pso) 
score_model_2 = accuracy_score(y_test_pso, Y_pred_2)
recall_model_2 = recall_score(y_test_pso, Y_pred_2,average='macro')
precision_model_2 = precision_score(y_test_pso, Y_pred_2,average='macro')
f1_score_model_2 = f1_score(y_test_pso, Y_pred_2,average='macro') 
print(f"\naccuracy value ={score_model_2:.2%}")
print(f"\nrecall value ={recall_model_2:.2%}")
print(f"\nprecision value ={precision_model_2:.2%}")
print(f"\nf1_score value ={f1_score_model_2:.2%}")


accuracy value =97.81%

recall value =96.81%

precision value =98.39%

f1_score value =97.53%


In [None]:
confusion_matrix(y_test_pso, Y_pred_2)

array([[90,  0],
       [ 3, 44]])

# RESULT TEST FOR THE ACCURACY, RECALL, PRECISION, F1 SCORE

In [None]:
data_test = {'Name Algorithm': [ ' DECISION TREE ', 'BAGGING CLASSIFIER'],
        'ACCURACY': [score_model_1,score_model_2 ],
        'RECALL' : [recall_model_1,recall_model_2],
        'PRECISION' : [precision_model_1,precision_model_2],
        'F1 SCORE' : [f1_score_model_1,f1_score_model_2]}

In [None]:
RESULT_TEST =pd.DataFrame(data_test)
pd.options.display.float_format = '{:,.2%}'.format

In [None]:
RESULT_TEST

Unnamed: 0,Name Algorithm,ACCURACY,RECALL,PRECISION,F1 SCORE
0,DECISION TREE,96.35%,95.70%,96.18%,95.93%
1,BAGGING CLASSIFIER,97.81%,96.81%,98.39%,97.53%


In [None]:
data_test = {'Name Algorithm': [ 'DECISION TREE','DECISION TREE+ PSO ','BANGGING  CLASSIFIER','BAGGING CLASSIFIER+PSO'],
        'ACCURACY': [0.9463,0.9635,0.9707,0.9781]}     

In [None]:
RESULT_TEST =pd.DataFrame(data_test)
pd.options.display.float_format = '{:,.2%}'.format

In [None]:
RESULT_TEST

Unnamed: 0,Name Algorithm,ACCURACY
0,DECISION TREE,94.63%
1,DECISION TREE+ PSO,96.35%
2,BANGGING CLASSIFIER,97.07%
3,BAGGING CLASSIFIER+PSO,97.81%
