In [1]:
# id3 scratch
import pandas as pd
import numpy as np

df = pd.read_csv("/content/breast-cancer-wisconsin-data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [2]:
df.dtypes

Unnamed: 0,0
id,int64
diagnosis,object
radius_mean,float64
texture_mean,float64
perimeter_mean,float64
area_mean,float64
smoothness_mean,float64
compactness_mean,float64
concavity_mean,float64
concave points_mean,float64


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
564,False
565,False
566,False
567,False


In [5]:
df.isnull().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [6]:
df['diagnosis']=df['diagnosis'].astype('category').cat.codes.astype('float64')

In [7]:
df.dtypes

Unnamed: 0,0
id,int64
diagnosis,float64
radius_mean,float64
texture_mean,float64
perimeter_mean,float64
area_mean,float64
smoothness_mean,float64
compactness_mean,float64
concavity_mean,float64
concave points_mean,float64


In [11]:
df['diagnosis'].unique()

array([1., 0.])

In [13]:
X = df.drop('diagnosis',axis=1)
y = df['diagnosis']

In [31]:
from math import log2
target = 'diagnosis'
def entropy(series):
  probs = series.value_counts(normalize=True)
  return -sum(p*log2(p) for p in probs if p>0)

total_entropy = entropy(df[target])
print(f"total ent: {total_entropy}")

def information_gain(df,attributes,target):
  total_entropy = entropy(df[target])
  weighted_entropy = 0
  values = df[attributes].unique()
  for val in values:
    subset = df[df[attributes]==val]
    weight = len(subset)/len(df)
    sub_ent = entropy(subset[target])
    weighted_entropy += sub_ent*weight
  ig = total_entropy - weighted_entropy
  return ig

attributes = [col for col in df.columns if col not in ['id','diagnosis']]
ig_results = {}


for attr in attributes:
  ig = information_gain(df,attr,target)
  ig_results[attr]=ig
  print(f"{attr}: {ig:.2f}")\

root = max(ig_results,key=ig_results.get)
print(f"root: {root}")



total ent: 0.9526351224018599
radius_mean: 0.86
texture_mean: 0.84
perimeter_mean: 0.93
area_mean: 0.93
smoothness_mean: 0.78
compactness_mean: 0.91
concavity_mean: 0.94
concave points_mean: 0.94
symmetry_mean: 0.74
fractal_dimension_mean: 0.84
radius_se: 0.93
texture_se: 0.86
perimeter_se: 0.93
area_se: 0.93
smoothness_se: 0.94
compactness_se: 0.92
concavity_se: 0.93
concave points_se: 0.86
symmetry_se: 0.82
fractal_dimension_se: 0.92
radius_worst: 0.90
texture_worst: 0.86
perimeter_worst: 0.90
area_worst: 0.94
smoothness_worst: 0.72
compactness_worst: 0.92
concavity_worst: 0.93
concave points_worst: 0.91
symmetry_worst: 0.85
fractal_dimension_worst: 0.89
root: concave points_mean


In [32]:
import math
max_ig = max(ig_results.values())
print("max_ig (full):", max_ig)
for k, v in ig_results.items():
    if math.isclose(v, max_ig, rel_tol=1e-12, abs_tol=1e-12):
        print(k, v)


max_ig (full): 0.9420903069361305
concave points_mean 0.9420903069361305


dt

In [48]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt = DecisionTreeClassifier(criterion='entropy',ccp_alpha=0.015)
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
#print(y_pred)
print(f"training acc: {dt.score(X_train,y_train)}")
print(f"testing acc: {accuracy_score(y_test,y_pred)}")

training acc: 0.989010989010989
testing acc: 0.956140350877193


In [52]:
dt = DecisionTreeClassifier(criterion='gini',ccp_alpha=0.015)
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
#print(y_pred)
print(f"training acc: {dt.score(X_train,y_train)}")
print(f"testing acc: {accuracy_score(y_test,y_pred)}")

training acc: 0.9582417582417583
testing acc: 0.9473684210526315


knn scratch

In [69]:
from sklearn.metrics import accuracy_score
from collections import Counter

def knn(X_train,y_train,X_test,k=3):
  preds = []
  for x in X_test:
    dist = np.sqrt(np.sum((X_train-x)**2,axis=1))
    k_idx = np.argsort(dist)[:k]
    k_labels = y_train[k_idx]
    preds.append(Counter(k_labels).most_common(1)[0][0])
  return np.array(preds)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

y_pred = knn(X_train,y_train,X_test,k=3)
y_train_pred = knn(X_train,y_train,X_train,k=3)

print(f"trainng acc: {accuracy_score(y_train,y_train_pred)}")
print(f"test acc: {accuracy_score(y_test,y_pred)}")

trainng acc: 0.9098901098901099
test acc: 0.7631578947368421


In [70]:
from sklearn.neighbors import KNeighborsClassifier
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(X_train,y_train)
y_pred=KNN.predict(X_test)
print(f"training acc: {KNN.score(X_train,y_train)}")
print(f"test acc: {accuracy_score(y_test,y_pred)}")

training acc: 0.9098901098901099
test acc: 0.7631578947368421


In [75]:
#validation techniques

#k-fold
from sklearn.model_selection import KFold
kf = KFold(n_splits=3,shuffle=True,random_state=1)
scores = []
for train_idx,test_idx in kf.split(X):
  X_train,X_test = X.iloc[train_idx],X.iloc[test_idx]
  y_train,y_test = y.iloc[train_idx],y.iloc[test_idx]
  pred = knn(np.array(X_train),np.array(y_train),np.array(X_test),k=3)
  acc = accuracy_score(np.array(y_test),pred)
  scores.append(acc)

  print(f"fold: {scores}")
print(f"fold mean: {np.mean(scores)}")

fold: [0.8157894736842105]
fold: [0.8157894736842105, 0.7894736842105263]
fold: [0.8157894736842105, 0.7894736842105263, 0.6984126984126984]
fold mean: 0.7678919521024783


In [86]:
from sklearn.model_selection import KFold,cross_val_score
from sklearn.neighbors import KNeighborsClassifier

kf = KFold(n_splits=3,shuffle=True,random_state=1)
knn = KNeighborsClassifier(n_neighbors=3)
scores = cross_val_score(knn,X,y,cv=kf)
print(scores.mean())

0.7678919521024783


In [77]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = []
for train_idx,test_idx in loo.split(X):
  X_train,X_test = X.iloc[train_idx],X.iloc[test_idx]
  y_train,y_test = y.iloc[train_idx],y.iloc[test_idx]
  pred = knn(np.array(X_train),np.array(y_train),np.array(X_test),k=3)
  acc = accuracy_score(np.array(y_test),pred)
  scores.append(acc)

print(f"fold: {scores}")
print(f"fold mean: {np.mean(scores)}")


fold: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0

In [83]:
from sklearn.model_selection import LeaveOneOut,cross_val_score
loo = LeaveOneOut()
model = KNeighborsClassifier(n_neighbors=3)
scores = cross_val_score(model,X,y,cv=loo)
print(scores.mean())

0.7855887521968365


ensemble learning

In [87]:
df.describe()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [100]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
x_train,x_val,y_train,y_val = train_test_split(x_train,y_train,test_size=0.3,random_state=0)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_val_scaled = scaler.transform(x_val)

In [105]:
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

ada = AdaBoostClassifier(random_state=1)
ada.fit(x_train_scaled,y_train)
ada_pred = ada.predict(x_test_scaled)
print(f"ada test acc: {accuracy_score(y_test,ada_pred):.2f}")
print(f"training acc: {ada.score(x_train_scaled,y_train)}")

xgb = XGBClassifier(random_state=1)
xgb.fit(x_train_scaled,y_train)
xgb_pred = xgb.predict(x_test_scaled)
print(f"\nxgb test acc: {accuracy_score(y_test,xgb_pred):.2f}")
print(f"xgb train acc: {xgb.score(x_train_scaled,y_train)}")

rf = RandomForestClassifier(random_state=1)
rf.fit(x_train_scaled,y_train)
rf_pred = rf.predict(x_test_scaled)
print(f"\nrf test acc: {accuracy_score(y_test,rf_pred):.2f}")
print(f"train acc: {rf.score(x_train_scaled,y_train)}")

ada test acc: 0.96
training acc: 1.0

xgb test acc: 0.98
xgb train acc: 1.0

rf test acc: 0.97
train acc: 1.0


In [106]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
X = df[['radius_mean','texture_mean']]
y = df['diagnosis']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)

estimators = [('DT',dt),('KNN',knn),('RF',rf),('XGB',xgb)]
model = VotingClassifier(estimators=estimators,voting='soft')
model.fit(X_train,y_train)
m_pred = model.predict(X_test)
print(f"test acc soft: {accuracy_score(y_test,m_pred):.2f}")
print(f"train acc soft: {model.score(X_train,y_train):.2f}")


estimators = [('DT',dt),('KNN',knn),('RF',rf),('XGB',xgb)]
model = VotingClassifier(estimators=estimators,voting='hard')
model.fit(X_train,y_train)
m_pred = model.predict(X_test)
print(f"\ntest acc hard: {accuracy_score(y_test,m_pred):.2f}")
print(f"train acc hard: {model.score(X_train,y_train):.2f}")


test acc soft: 0.92
train acc soft: 1.00

test acc hard: 0.92
train acc hard: 1.00
