Kelompok 6 (Breast Cancer)

1. Muhammad Rasyid A (5311420053)
2. Muhammad Zidane A (5311421064)
3. Ricko Chandra Saputra (5311421075)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score

In [3]:
class NB:
    def prediksi(self, X):
        prediksi = [self._prediksi(i) for i in X]
        return np.array(prediksi)

    def _prediksi(self, x):
        posteriors = []
        for idx, nama_kelas in enumerate(self.kelas):
            prior = np.log(self.prior[idx])
            posterior = sum(np.log(self.fungsi_derivatif(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)

        return self.kelas[np.argmax(posteriors)]

    def fungsi_derivatif(self, kelas_indeks, x):
        mean = self.mean[kelas_indeks]
        var = self.var[kelas_indeks]
        pembilang = np.exp(-(x - mean) ** 2 / (2 * var))
        penyebut = np.sqrt(2 * np.pi * var)
        return pembilang / penyebut

    def fit(self, X, Y):
        n_baris, n_kolom = X.shape
        self.kelas = np.unique(Y)  # Add this line to initialize the 'kelas' attribute
        jumlah_kelas = len(self.kelas)

        self.mean = np.zeros((jumlah_kelas, n_kolom), dtype=np.float64)
        self.var = np.zeros((jumlah_kelas, n_kolom), dtype=np.float64)
        self.prior = np.zeros(jumlah_kelas, dtype=np.float64)

        for idx, nama_kelas in enumerate(self.kelas):
            X_kelas = X[Y == nama_kelas]
            self.mean[idx, :] = X_kelas.mean(axis=0)
            self.var[idx, :] = X_kelas.var(axis=0)
            self.prior[idx] = X_kelas.shape[0] / n_baris

.

In [4]:

class FeatureSelectionGA:
    def __init__(self, model, feature_names, population_size=100, generations=50, crossover_prob=0.8, mutation_prob=0.5):
        self.model = model
        self.feature_names = feature_names
        self.population_size = population_size
        self.generations = generations
        self.crossover_prob = crossover_prob
        self.mutation_prob = mutation_prob

    def _fitness(self, X_train, X_test, y_train, y_test, selected_features):
        model = self.model
        model.fit(X_train[:, selected_features], y_train)
        y_pred = model.prediksi(X_test[:, selected_features])
        return accuracy_score(y_test, y_pred)

    def _initialize_population(self, num_features):
        return np.random.choice([0, 1], size=(self.population_size, num_features))

    def _crossover(self, parents):
        children = []
        for i in range(0, len(parents), 2):
            parent1, parent2 = parents[i], parents[i + 1]
            if np.random.rand() < self.crossover_prob:
                crossover_points = np.sort(np.random.choice(len(parent1), 2, replace=False))
                crossover_point1, crossover_point2 = crossover_points
                child1 = np.concatenate((parent1[:crossover_point1], parent2[crossover_point1:crossover_point2], parent1[crossover_point2:]))
                child2 = np.concatenate((parent2[:crossover_point1], parent1[crossover_point1:crossover_point2], parent2[crossover_point2:]))
            else:
                child1, child2 = parent1.copy(), parent2.copy()
            children.extend([child1, child2])
        return np.array(children)

    def _mutation(self, children):
        mutation_mask = (np.random.rand(*children.shape) < self.mutation_prob).astype(int)
        return (children + mutation_mask) % 2

    def _select_features(self, X_train, X_test, y_train, y_test, population):
        fitness_values = []
        for features in population:
            fitness_values.append(self._fitness(X_train, X_test, y_train, y_test, np.where(features == 1)[0]))
        return np.array(fitness_values)

    def _roulette_wheel_selection(self, fitness_values):
        total_fitness = np.sum(fitness_values)
        probabilities = fitness_values / total_fitness
        selected_indices = np.random.choice(len(fitness_values), size=self.population_size // 2, p=probabilities)
        return selected_indices

    def optimize(self, X_train, X_test, y_train, y_test):
        num_features = X_train.shape[1]
        population = self._initialize_population(num_features)

        for generation in tqdm(range(self.generations)):
            fitness_values = self._select_features(X_train, X_test, y_train, y_test, population)

            # Roulette wheel selection
            selected_indices = self._roulette_wheel_selection(fitness_values)
            parents = population[selected_indices]

            children = self._crossover(parents)
            children = self._mutation(children)

            population = np.vstack([parents, children])

        best_features = np.where(population[0] == 1)[0]
        selected_feature_names = [self.feature_names[i] for i in best_features]
        return best_features, selected_feature_names



In [5]:
# Load your dataset
df = pd.read_csv('breast_cancer.csv')
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values
feature_names = df.columns[2:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [6]:
display(df.head(10))

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,
5,843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,
6,844359,1,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,
7,84458202,1,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,
8,844981,1,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,
9,84501001,1,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,


In [7]:
df.shape

(569, 33)

In [8]:
df.isna().sum()

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [9]:
df = df.dropna(axis=1)

In [10]:
df.shape

(569, 32)

In [11]:
df['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

.

In [12]:
# Define Naive Bayes model
nb_model = NB()

In [13]:
# Call the fit method to initialize the 'kelas' attribute
nb_model.fit(X_train, y_train)

In [40]:
# Use genetic algorithm for feature selection
feature_selector = FeatureSelectionGA(model=nb_model, feature_names=feature_names)
best_features, selected_feature_names = feature_selector.optimize(X_train, X_test, y_train, y_test)

print("\n")
print("Best Features:", best_features)
print("Selected Feature Names:", selected_feature_names)
print("Number of Best Features:", len(best_features))
print("\n")

100%|██████████| 50/50 [01:06<00:00,  1.33s/it]



Best Features: [ 0  2  4  5  6  7  8 10 12 13 15 16 17 21 23 27 29]
Selected Feature Names: ['radius_mean', 'perimeter_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'radius_se', 'perimeter_se', 'area_se', 'compactness_se', 'concavity_se', 'concave points_se', 'texture_worst', 'area_worst', 'concave points_worst', 'fractal_dimension_worst']
Number of Best Features: 17







In [41]:
# Train Naive Bayes with the best features
nb_model.fit(X_train[:, best_features], y_train)
y_pred = nb_model.prediksi(X_test[:, best_features])
accuracy = accuracy_score(y_test, y_pred)
print(f"With feature selection (Genetic Algoritm), accuracy: {accuracy * 100:.2f}%")


With feature selection (Genetic Algoritm), accuracy: 92.98%


In [42]:
# Train Naive Bayes without feature selection
nb_model_no_fs = NB()
nb_model_no_fs.fit(X_train, y_train)
y_pred_no_fs = nb_model_no_fs.prediksi(X_test)
accuracy_no_fs = accuracy_score(y_test, y_pred_no_fs)
print(f"Without feature selection (Genetic Algoritm), accuracy: {accuracy_no_fs * 100:.2f}%")

Without feature selection (Genetic Algoritm), accuracy: 65.61%


In [43]:
nb_model_with_fs = NB()
nb_model_with_fs.fit(X_train[:, best_features], y_train)

In [44]:
def generate_random_data(num_features):
    return np.random.rand(1, num_features)

In [45]:
num_features = X_train.shape[1]
X_random = generate_random_data(num_features)

print("Random Data:")
print(X_random)
print("Number of Random Data:", len(best_features))

Random Data:
[[0.03729059 0.99807285 0.80903275 0.21845864 0.49481359 0.95834884
  0.79854034 0.18135706 0.73244213 0.63582403 0.19122378 0.16810647
  0.17840463 0.78944188 0.86246393 0.86933712 0.88775004 0.77599801
  0.46924776 0.98140771 0.42781935 0.465088   0.72371511 0.95820603
  0.25931495 0.31502972 0.90743254 0.79437305 0.36355494 0.85636232
  0.25543788]]
Number of Random Data: 17


In [46]:
# Predict the diagnosis using the trained Naive Bayes model with feature selection
diagnosis_prediction = nb_model_with_fs.prediksi(X_random[:, best_features])

  posterior = sum(np.log(self.fungsi_derivatif(idx, x)))


In [47]:
# Map the prediction back to the original labels (if needed)
predicted_diagnosis_label = 'M' if diagnosis_prediction[0] == 1 else 'B'

print("\n")
print("Random Data for Prediction:")
print(X_random)
print("Predicted Diagnosis:", predicted_diagnosis_label)
print("\n")



Random Data for Prediction:
[[0.03729059 0.99807285 0.80903275 0.21845864 0.49481359 0.95834884
  0.79854034 0.18135706 0.73244213 0.63582403 0.19122378 0.16810647
  0.17840463 0.78944188 0.86246393 0.86933712 0.88775004 0.77599801
  0.46924776 0.98140771 0.42781935 0.465088   0.72371511 0.95820603
  0.25931495 0.31502972 0.90743254 0.79437305 0.36355494 0.85636232
  0.25543788]]
Predicted Diagnosis: B


