# Import Library

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import r2_score, f1_score, roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix, classification_report, mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Read Data

In [None]:
# Load the dataset from a CSV path that is relative to the working directory.
data = pd.read_csv("Data/Breast_Cancer_Classification.csv")
# Bare expression: rendered as the cell output when run in a notebook.
data

# Simple Data Cleaning

In [None]:
# Remove the "id" and "Unnamed: 32" columns; drop() raises KeyError if either
# is missing, e.g. when this cell is executed a second time.
data = data.drop(columns=["id", "Unnamed: 32"])
data

#### Drop 2 kolom yang tidak dipakai untuk pemodelan: `id` (hanya nomor identitas) dan `Unnamed: 32` (kolom tanpa nama)

In [None]:
# pop() removes "diagnosis" from the frame in place and returns it; assigning
# it back appends it as the last column.
data_out = data.pop("diagnosis")
data["diagnosis"] = data_out
data

#### Pindahkan kolom label/target (`diagnosis`) ke posisi paling akhir agar memudahkan iterasi `for` atas kolom fitur maupun pemisahan data x dan y.

# EDA

In [None]:
# Show the column labels of the cleaned frame.
data.columns

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

In [None]:
# Number of non-missing entries in each column.
data.count()

In [None]:
# Column totals for the numeric columns only. Without numeric_only=True any
# object (string) column, such as the not-yet-encoded label, is "summed" by
# concatenating all of its values into one long string.
data.sum(numeric_only=True)

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# (number of rows, number of columns).
data.shape

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# One histogram per non-object column; the last column (the label, moved to
# the end earlier) is left out.
for column in data.columns[:-1]:
    if data[column].dtype == "object":
        continue
    sns.histplot(data=data, x=column)
    plt.show()

In [None]:
# One box plot per non-object column; the last column (the label, moved to
# the end earlier) is left out.
for column in data.columns[:-1]:
    if data[column].dtype == "object":
        continue
    sns.boxplot(data=data, x=column)
    plt.show()

In [None]:
# Scatter each non-object column against the "diagnosis" label; the last
# column (the label itself) is left out of the x-axis candidates.
for column in data.columns[:-1]:
    if data[column].dtype == "object":
        continue
    sns.scatterplot(data=data, x=column, y="diagnosis")
    plt.show()

In [None]:
# Print skewness and kurtosis for every non-object column except the last one.
# NOTE: pandas' kurt() uses Fisher's definition, so a normal distribution
# scores 0.0 (not 3).
for column in data.columns[:-1]:
    if data[column].dtype == "object":
        continue
    print (f"{column}: Skewness: {data[column].skew()}, Kurtosis: {data[column].kurt()}")

#### Hasil EDA

1. Hampir semua fitur yang ada berdistribusi tidak normal karena hasil nilai skewness dan gambar menunjukkan bahwa mereka cenderung mengarah ke "Right Skewed"

2. Kurtosis mengukur seberapa berat ekor distribusi, yaitu seberapa sering muncul nilai yang jauh dari mean. Nilai kurtosis antar fitur sangat bervariasi: sebagian fitur memiliki kurtosis rendah (ekor ringan, data menyebar lebih merata), sedangkan fitur dengan kurtosis tinggi memiliki ekor berat yang mengindikasikan adanya outliers.

- Nilai Skewness yang bagus -> Mendekati 0 artinya Distribusi Normal

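- Nilai Kurtosis yang bagus -> Mendekati 0 artinya distribusi normal, karena `kurt()` pada pandas mengembalikan *excess kurtosis* (definisi Fisher, yaitu kurtosis dikurangi 3). Pada definisi Pearson (tanpa dikurangi 3), nilai untuk distribusi normal adalah 3.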

Hal ini menyebabkan beberapa hal
- Fill NA dari data numeric akan menggunakan Median -> Karena data tidak berdistribusi secara normal, jadi lebih baik menggunakan nilai Median dibandingkan Mean (rata-rata)
- Fill NA dari data object akan menggunakan Modus (mode)

# Data Preprocessing

In [None]:
# Number of missing values in each column.
data.isna().sum()

#### Anggap saja dataset ini memiliki NA, sehingga langkah preprocessing untuk missing values tetap dilanjutkan.

In [None]:
# Preview the first rows again before preprocessing.
data.head()

In [None]:
# Fraction of missing entries per column: the mean of the boolean NA mask
# equals (number of NA) / (number of rows).
missing_values = data.isna().mean()
missing_values

### Drop Kolom dengan NA Lebih dari 50%

In [None]:
# Discard every column in which more than half of the values are missing.
mostly_missing = [name for name, ratio in missing_values.items() if ratio > 0.5]
data = data.drop(columns=mostly_missing)

### Fill NA

In [None]:
# Impute remaining gaps: most frequent value for object columns, median for
# everything else. Columns without missing values are left untouched.
for column in data.columns:
    series = data[column]
    if not series.isna().any():
        continue
    fill_value = series.mode()[0] if series.dtype == "object" else series.median()
    data[column] = series.fillna(fill_value)

### Encoding (Categorical -> Numerical Data Types)

In [None]:
# A single LabelEncoder is re-fitted for each object column, so after the loop
# it only remembers the last column; `encoded` keeps the integer code ->
# original class mapping for every encoded column.
encoder = LabelEncoder()
encoded = {}

object_columns = [name for name in data.columns if data[name].dtype == "object"]
for column in object_columns:
    data[column] = encoder.fit_transform(data[column])
    encoded[column] = dict(enumerate(encoder.classes_))

In [None]:
# Show the integer code -> original class mapping recorded per encoded column.
encoded

### Analisis Korelasi Fitur

In [None]:
# Correlation heatmap of the feature columns only (the label is the last
# column and is excluded). `data[:-1]` would slice ROWS, dropping the last
# sample while keeping the label, so the columns are selected explicitly.
feature_corr = data[data.columns[:-1]].corr()

# Enlarged figure and two-decimal annotations so the cells stay readable when
# there are many features.
plt.figure(figsize=(20, 16))
sns.heatmap(data=feature_corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# sns.pairplot(data=data, hue="diagnosis")

### Splitting Data

In [None]:
# Features are every column but the last; the label is the last column.
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [None]:
# Display the feature matrix.
x

In [None]:
# Display the label vector.
y

In [None]:
# 80/20 hold-out split. A fixed random_state makes every metric below
# reproducible between runs, and stratify=y keeps the class proportions the
# same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def correlation(data_df, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``data_df`` is printed, then every
    column whose absolute correlation with any column to its left is strictly
    greater than ``threshold`` is collected.

    Args:
        data_df: DataFrame whose columns are compared pairwise.
        threshold: Absolute correlation above which the later column of a
            pair is reported.

    Returns:
        set: Labels of the columns to be considered redundant.
    """
    corr_matrix = data_df.corr()
    print (corr_matrix)

    labels = corr_matrix.columns
    # Boolean mask of strong pairs, restricted to the strict upper triangle so
    # that each pair (i, j) with i < j is inspected once and the diagonal
    # (self-correlation) is ignored.
    strong_pairs = np.triu(np.abs(corr_matrix.to_numpy()) > threshold, k=1)
    # A column is flagged when any earlier row marks it as strongly correlated.
    flagged_positions = np.flatnonzero(strong_pairs.any(axis=0))

    return {labels[position] for position in flagged_positions}

In [None]:
# Find redundant features on the training part only, using an absolute
# correlation threshold of 0.9, then report which and how many were flagged.
corr_col = correlation(x_train, 0.9)
print (corr_col)
print (len(corr_col))

In [None]:
# Remove the flagged columns from both parts so train and test keep the same
# feature set.
x_train, x_test = (frame.drop(columns=corr_col) for frame in (x_train, x_test))

### Scaling

In [None]:
# Two unfitted scalers so the models can be compared under both scalings.
scaler_x_minmax = MinMaxScaler()
scaler_x_standard = StandardScaler()

In [None]:
def Scaling (x1, x2, scaler_x):
    """Fit ``scaler_x`` on ``x1`` and apply the same scaling to ``x2``.

    Args:
        x1: Data the scaler is fitted on and then transformed.
        x2: Data transformed with the parameters learned from ``x1``.
        scaler_x: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        tuple: ``(scaled x1, scaled x2, scaler_x)``; the scaler is the same
        object that was passed in, now fitted.
    """
    # Fitting must happen before x2 is transformed.
    fitted_part = scaler_x.fit_transform(x1)
    held_out_part = scaler_x.transform(x2)
    return fitted_part, held_out_part, scaler_x

In [None]:
# MinMax-scaled copies: the scaler is fitted on x_train and reused on x_test.
x_train_1, x_test_1, scaler_x_minmax = Scaling(x_train, x_test, scaler_x_minmax)

In [None]:
# Standard-scaled copies: the scaler is fitted on x_train and reused on x_test.
x_train_2, x_test_2, scaler_x_standard = Scaling(x_train, x_test, scaler_x_standard)

### Prediction Scaling (MinMaxScaler)

In [None]:
# Gaussian Naive Bayes trained on the MinMax-scaled features.
model = GaussianNB()
model.fit(X=x_train_1, y=y_train)

# Hard class predictions, plus the predicted probability of the second class
# (model.classes_[1]) used as the ranking score for ROC AUC.
# NOTE(review): assumes a binary target - confirm.
y_pred = model.predict(x_test_1)
y_score = model.predict_proba(x_test_1)[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
# ROC AUC is computed from scores, not from thresholded labels.
roc = roc_auc_score(y_test, y_score)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Rows are the actual classes, columns the predicted classes.
conf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print (f"MSE {mse}")
print (f"RMSE {rmse}")
print (f"ROC Score {roc}")

print (f"Accuracy {accuracy}")
print (f"Precision {precision}")
print (f"Recall {recall}")

print (f"Confusion Matrix {conf}")
print (f"Classification Report {report}")


In [None]:
# Logistic Regression trained on the MinMax-scaled features.
model = LogisticRegression()
model.fit(X=x_train_1, y=y_train)

# Hard class predictions, plus the predicted probability of the second class
# (model.classes_[1]) used as the ranking score for ROC AUC.
# NOTE(review): assumes a binary target - confirm.
y_pred = model.predict(x_test_1)
y_score = model.predict_proba(x_test_1)[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
# ROC AUC is computed from scores, not from thresholded labels.
roc = roc_auc_score(y_test, y_score)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Rows are the actual classes, columns the predicted classes.
conf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print (f"MSE {mse}")
print (f"RMSE {rmse}")
print (f"ROC Score {roc}")

print (f"Accuracy {accuracy}")
print (f"Precision {precision}")
print (f"Recall {recall}")

print (f"Confusion Matrix {conf}")
print (f"Classification Report {report}")


### Prediction Scaling (Standard Scaling)

In [None]:
# Gaussian Naive Bayes trained on the standard-scaled features.
model = GaussianNB()
model.fit(X=x_train_2, y=y_train)

# Hard class predictions, plus the predicted probability of the second class
# (model.classes_[1]) used as the ranking score for ROC AUC.
# NOTE(review): assumes a binary target - confirm.
y_pred = model.predict(x_test_2)
y_score = model.predict_proba(x_test_2)[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall and transposes the confusion matrix.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
# ROC AUC is computed from scores, not from thresholded labels.
roc = roc_auc_score(y_test, y_score)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Rows are the actual classes, columns the predicted classes.
conf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print (f"MSE {mse}")
print (f"RMSE {rmse}")
print (f"ROC Score {roc}")

print (f"Accuracy {accuracy}")
print (f"Precision {precision}")
print (f"Recall {recall}")

print (f"Confusion Matrix {conf}")
print (f"Classification Report {report}")


In [None]:
# Logistic Regression trained on the standard-scaled features.
model = LogisticRegression()
model.fit(X=x_train_2, y=y_train)

# Hard class predictions, plus the predicted probability of the second class
# (model.classes_[1]) used as the ranking score for ROC AUC.
# NOTE(review): assumes a binary target - confirm.
y_pred = model.predict(x_test_2)
y_score = model.predict_proba(x_test_2)[:, 1]

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# swaps precision with recall.
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
# ROC AUC is computed from scores, not from thresholded labels.
roc = roc_auc_score(y_test, y_score)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Rows are the actual classes, columns the predicted classes.
conf = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print (f"MSE {mse}")
print (f"RMSE {rmse}")
print (f"ROC Score {roc}")

print (f"Accuracy {accuracy}")
print (f"Precision {precision}")
print (f"Recall {recall}")

print (f"Confusion Matrix {conf}")
print (f"Classification Report {report}")


### Kesimpulan Classification

Karena nilai Accuracy dari Logistic Regression > daripada Naive Bayes Model, maka kesimpulan yang bisa diperoleh adalah:

1. Fitur-fitur pada dataset ini kemungkinan besar saling dependent satu sama lain. Naive Bayes memakai asumsi "naive", yaitu setiap fitur dianggap independent terhadap fitur lainnya jika kelasnya diketahui. Akurasi Logistic Regression yang lebih tinggi mengindikasikan bahwa asumsi tersebut tidak terpenuhi; hal ini juga sejalan dengan hasil analisis korelasi, di mana sejumlah fitur berkorelasi sangat tinggi (> 0.9) sehingga harus dibuang.

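2. Data tidak berdistribusi normal dan penyebarannya tidak merata (lihat nilai skewness pada bagian EDA). Hal ini melanggar asumsi Gaussian Naive Bayes bahwa setiap fitur berdistribusi normal di dalam tiap kelas, sehingga dapat menjadi salah satu penyebab akurasi Logistic Regression lebih tinggi.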

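3. Dataset kemungkinan sudah lumayan stabil dan tidak terlalu banyak noise. Naive Bayes cenderung lebih tahan terhadap noise karena modelnya sederhana akibat asumsi Naive tersebut, sehingga jika noise-nya banyak, akurasi Naive Bayes bisa saja lebih tinggi. Namun ini hanya indikasi, bukan bukti, karena perbandingan akurasi saja tidak mengukur noise secara langsung.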

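4. Kelas pada dataset ini kemungkinan dapat dipisahkan secara (hampir) linear, karena Logistic Regression adalah model linear dan tetap memberikan akurasi yang tinggi. Selain itu, fitur pada dataset ini lebih banyak yang bersifat numeric dibandingkan categorical, sedangkan Naive Bayes sering kali lebih cocok untuk Multi Classification dengan cukup banyak kolom yang bersifat categorical.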

# Predicted Values vs  Actual Values

In [None]:
# 1. Compare the predicted values with the actual values
plt.figure(figsize=(8, 6))

# One point per test sample: actual label on x, predicted label on y
plt.scatter(x=y_test, y=y_pred, alpha=0.6, color='blue', label='Data Points')

# Reference diagonal on which a prediction equals the actual value
diagonal = [0, 1]
plt.plot(diagonal, diagonal, linewidth=2, color='red', label="Ideal Line (y = x)")

plt.title('Predicted Values vs Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()

In [None]:
# Number of test samples whose predicted label differs from the actual one.
mismatches = y_test != y_pred
errors = mismatches.sum()
print(f'Jumlah kesalahan prediksi: {errors} dari {len(y_test)} sampel')

In [None]:
# Confusion matrix of the most recently fitted model: rows are actual
# classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title('Confusion Matrix')
plt.show()