# Classification (Naive Bayes)

Credit: Kalvin

#Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#Data Preparation

In [None]:
data = pd.read_csv("Breast_Cancer_Classification.csv")
data

In [None]:
data_out = data.pop("diagnosis")
data["diagnosis"] = data_out

In [None]:
data = data.drop(columns = ["id", "Unnamed: 32"])

In [None]:
data.columns

In [None]:
# Swap every space in the column labels for an underscore, then show the result.
temp_columns = [label.replace(" ", "_") for label in data.columns]
data.columns = temp_columns
print(data.columns)

#EDA

In [None]:
data.dtypes

In [None]:
data.shape

In [None]:
data["diagnosis"].value_counts()

In [None]:
data.describe()

In [None]:
data["diagnosis"].value_counts()

In [None]:
# for column in data.columns[:-1] :
#   sns.histplot(data = data, x = column, hue = "diagnosis")
#   plt.show()

Kategori M dan B memiliki perbedaan yang cukup signifikan. Hal ini dapat diamati pada:

- radius_mean: M antara 10-15, B antara 11-25
- texture_mean: M antara 10-20, B antara 15-34
- perimeter_mean: M antara 40-100, B antara 80-180
- dan lain-lain

Perbedaan ini mempermudah mesin/model dalam mengklasifikasikan data yang diberikan.


In [None]:
# for column in data.columns[:-1] :
#   sns.boxplot(data = data, x = column, hue = "diagnosis")
#   plt.show()

radius_se, area_se, smoothness_se, dan concavity_se terlihat memiliki outlier yang cukup parah.

In [None]:
# for i in range(len(data.columns) - 1) :
#   for j in range(i + 1, len(data.columns) - 1) :
#     if(data[data.columns[i]].dtype != "object" and data[data.columns[j]].dtype != "object") :
#       sns.scatterplot(data = data, x = data.columns[i], y = data.columns[j], hue = "diagnosis")
#       plt.show()

- radius_mean & perimeter_mean: linear (semakin besar radius_mean, semakin besar pula perimeter_mean).
- radius_mean & area_mean: linear (semakin besar radius_mean, semakin besar pula area_mean).

Selain itu, dapat kita lihat bahwa distribusi/penyebaran antara kategori M dan B cukup merata dan cluster-nya terlihat jelas.

In [None]:
# sns.pairplot(data = data, hue = "diagnosis")
# plt.show()

In [None]:
# for column in data.columns[:-1] :
#   if(data[column].dtype != "object") :
#     print(f"column : {column}, skewness : {data[column].skew()}, kurtosis : {data[column].kurt()}")

#Feature Engineering

In [None]:
missing_percentage = data.isna().sum() / len(data) * 100
missing_percentage

In [None]:
# Drop every column whose share of missing values is above 50 %.
# The labels are collected first so the frame is rebuilt by a single drop call
# instead of once per matching column, and only `columns=` is passed because
# it already names the axis being dropped from.
sparse_columns = [key for key, value in missing_percentage.items() if value > 50.0]
data = data.drop(columns = sparse_columns)

In [None]:
data.isna().sum()

In [None]:
# Label-encode every text column in place and keep a code -> label lookup
# per column so the integer codes can be read back later.
encoder = LabelEncoder()
encoded = {}
for column in data.columns :
  # Detect text through pandas' dtype helpers rather than comparing against the
  # literal "object", so columns held in a dedicated string dtype are encoded too.
  is_text = pd.api.types.is_object_dtype(data[column]) or pd.api.types.is_string_dtype(data[column])
  if(is_text) :
    data[column] = encoder.fit_transform(data[column])
    # The same encoder is refitted for each column, so its classes_ are copied
    # into the lookup right away.
    encoded[column] = dict(enumerate(encoder.classes_))
for key, value in encoded.items() :
  print(f"key : {key}, value : {value}")

In [None]:
# Features are every column except the last one; the label is the last column.
x = data[data.columns[:-1]]
y = data[data.columns[-1]]

# Hold out 20 % of the rows for testing. A fixed seed makes the split (and so
# the reported metrics) repeatable across runs, and stratifying on y keeps the
# class proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
def correlation(data_df, threshold) :
  """Return the names of columns that are strongly correlated with an earlier column.

  The pairwise correlation matrix of ``data_df`` is computed and printed. A
  column is reported when the absolute correlation between it and any column
  positioned before it is strictly greater than ``threshold``.

  Parameters:
    data_df: frame whose columns are compared pairwise.
    threshold: absolute correlation above which the later column is reported.

  Returns:
    A set of column labels.
  """
  corr_matrix = data_df.corr()
  print(corr_matrix)
  labels = corr_matrix.columns
  # Keep only the strict upper triangle so each pair (earlier, later) is looked
  # at once and the diagonal is ignored.
  too_strong = np.triu(np.abs(corr_matrix.to_numpy()) > threshold, k = 1)
  # A column is flagged when any earlier row exceeds the threshold against it.
  flagged = too_strong.any(axis = 0)
  return {label for label, hit in zip(labels, flagged) if hit}

In [None]:
corr_col = correlation(x_train, threshold = 0.9)
print(corr_col)
print(len(corr_col))

In [None]:
x_train = x_train.drop(columns = corr_col, axis = 1)
x_test = x_test.drop(columns = corr_col, axis = 1)

In [None]:
# y_train = np.reshape(y_train, (-1, 1))
# y_test = np.reshape(y_test, (-1, 1))

In [None]:
scaler_x_minmax = MinMaxScaler()
scaler_x_standard = StandardScaler()

In [None]:
def scaling(x1, x2, scaler_x) :
  """Fit ``scaler_x`` on ``x1`` and apply it to both ``x1`` and ``x2``.

  Parameters:
    x1: data the scaler is fitted on and transformed with.
    x2: data transformed with the already-fitted scaler (it is not used for fitting).
    scaler_x: object exposing ``fit_transform`` and ``transform``.

  Returns:
    A tuple ``(scaled_x1, scaled_x2, scaler_x)``.
  """
  fitted_part = scaler_x.fit_transform(x1)
  # x2 only goes through transform, so it reuses what was fitted on x1.
  held_out_part = scaler_x.transform(x2)
  return fitted_part, held_out_part, scaler_x

In [None]:
x_train1, x_test1, scaler_x_minmax = scaling(x_train, x_test, scaler_x_minmax)

In [None]:
x_train2, x_test2, scaler_x_standard = scaling(x_train, x_test, scaler_x_standard)

#Modeling

In [None]:
# Logistic regression on the min-max scaled features.
model = LogisticRegression() #LogisticRegression with MinMaxScaler()
history = model.fit(x_train1, y_train)

# Hard class predictions plus the probability of the second class (column 1).
y_pred = model.predict(x_test1)
y_pred_prob = model.predict_proba(x_test1)[:, 1]

# One summary line; ROC AUC is computed from the probabilities, the rest from the labels.
print(
  f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
  f"recall_score : {recall_score(y_test, y_pred)}, "
  f"precision_score : {precision_score(y_test, y_pred)}, "
  f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Logistic regression on the standardised features.
model = LogisticRegression() #LogisticRegression with StandardScaler()
history = model.fit(x_train2, y_train)

# Hard class predictions plus the probability of the second class (column 1).
y_pred = model.predict(x_test2)
y_pred_prob = model.predict_proba(x_test2)[:, 1]

# One summary line; ROC AUC is computed from the probabilities, the rest from the labels.
print(
  f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
  f"recall_score : {recall_score(y_test, y_pred)}, "
  f"precision_score : {precision_score(y_test, y_pred)}, "
  f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Gaussian naive Bayes on the min-max scaled features.
model = GaussianNB() #GaussianNB with MinMaxScaler()
history = model.fit(x_train1, y_train)

# Hard class predictions plus the probability of the second class (column 1).
y_pred = model.predict(x_test1)
y_pred_prob = model.predict_proba(x_test1)[:, 1]

# One summary line; ROC AUC is computed from the probabilities, the rest from the labels.
print(
  f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
  f"recall_score : {recall_score(y_test, y_pred)}, "
  f"precision_score : {precision_score(y_test, y_pred)}, "
  f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Gaussian naive Bayes on the standardised features.
model = GaussianNB() #GaussianNB with StandardScaler()
history = model.fit(x_train2, y_train)

# Hard class predictions plus the probability of the second class (column 1).
y_pred = model.predict(x_test2)
y_pred_prob = model.predict_proba(x_test2)[:, 1]

# One summary line; ROC AUC is computed from the probabilities, the rest from the labels.
print(
  f"accuracy_score : {accuracy_score(y_test, y_pred)}, "
  f"recall_score : {recall_score(y_test, y_pred)}, "
  f"precision_score : {precision_score(y_test, y_pred)}, "
  f"roc_auc_score : {roc_auc_score(y_test, y_pred_prob)}"
)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', label='Data Points')
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', lw=2, label="Ideal Line (y = x)")