In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

In [3]:
import pandas as pd
data = pd.read_csv('./breast-cancer-wisconsin.data')
data.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses','Class']
# data = np.array(data)
data

Unnamed: 0,Sample code,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...
693,776715,3,1,1,1,3,2,1,1,1,2
694,841769,2,1,1,1,2,1,1,1,1,2
695,888820,5,10,10,3,7,3,8,10,2,4
696,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# 取代空值
data = data.replace('?',np.NaN)
data['Bare Nuclei'].unique()
data.shape

(698, 11)

In [5]:

# 用 median() 取代 missing value
data['Bare Nuclei'].fillna(value=int(data['Bare Nuclei'].median()), inplace=True)
data['Bare Nuclei'] = data['Bare Nuclei'].astype("int")

In [6]:
# drop 
data.drop(['Sample code'], axis=1,inplace=True)
data

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,4,4,5,7,10,3,2,1,2
1,3,1,1,1,2,2,3,1,1,2
2,6,8,8,1,3,4,3,7,1,2
3,4,1,1,3,2,1,3,1,1,2
4,8,10,10,8,7,10,9,7,1,4
...,...,...,...,...,...,...,...,...,...,...
693,3,1,1,1,3,2,1,1,1,2
694,2,1,1,1,2,1,1,1,1,2
695,5,10,10,3,7,3,8,10,2,4
696,4,8,6,4,3,4,10,6,1,4


In [7]:
# 抓取資料
df_X = np.array(data.iloc[:,0:9])
df_y = np.array(data.iloc[:,-1])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, stratify=df_y)

In [9]:
# Need to separate training data
ix_2 = (y_train == 2)
train_X2 = X_train[ix_2,:]
ix_4 = (y_train == 4)
train_X4 = X_train[ix_4,:]

In [10]:

# initialize GMM here, you can assign # of mixture here
gmm2 = GaussianMixture(n_components=2)
gmm4 = GaussianMixture(n_components=2)

In [11]:
# Train the model with EM algorithm, one model per class
gmm2.fit(train_X2)
gmm4.fit(train_X4)

GaussianMixture(n_components=2)

In [12]:
# test model
s2 = gmm2.score_samples(X_test)
s4 = gmm4.score_samples(X_test)

In [13]:
# Pick the model with Maximum Likelihood
pred2 = np.maximum(s2, s4) #np.logical_and((s2> s4),)
pred4 = np.maximum(s4, s2) #np.logical_and((s4 > s2),)

In [14]:
ans2 = (y_test == 2)
ans4 = (y_test == 4)

In [15]:
acc2 = np.logical_and(pred2, ans2)
acc4 = np.logical_and(pred4, ans4)

In [16]:
acc = sum(acc2) + sum(acc4) 
acc = acc / len(y_test)
acc

1.0

In [17]:
print('Acc = %.2f' % acc)

Acc = 1.00


In [18]:
acc_tal=0
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, stratify=df_y)
    
    # Need to separate training data
    ix_2 = (y_train == 2)
    train_X2 = X_train[ix_2,:]
    ix_4 = (y_train == 4)
    train_X4 = X_train[ix_4,:]
    
    # initialize GMM here, you can assign # of mixture here
    gmm2 = GaussianMixture(n_components=2)
    gmm4 = GaussianMixture(n_components=2)
    
    # Train the model with EM algorithm, one model per class
    gmm2.fit(train_X2)
    gmm4.fit(train_X4)
    
    # test model
    s2 = gmm2.score_samples(X_test)
    s4 = gmm4.score_samples(X_test)
    
    # Pick the model with Maximum Likelihood
    pred2 = np.maximum(s2, s4) 
    pred4 = np.maximum(s4, s2)
    
    ans2 = (y_test == 2)
    ans4 = (y_test == 4)
    
    acc2 = np.logical_and(pred2, ans2)
    acc4 = np.logical_and(pred4, ans4)
    acc = sum(acc2) + sum(acc4) 
    acc = acc / len(y_test)
    acc_tal +=acc
    
print(acc_tal/10)

1.0
