In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics import f1_score, accuracy_score

In [2]:
warnings.filterwarnings('ignore')
data = pd.read_csv('data.csv', sep=',', header=0)
labels = (data['diagnosis'] == 'M')
data = data.drop(['id', 'Unnamed: 32', 'diagnosis'], 1)
train_data, test_data, y_train, y_test = train_test_split(data, labels, test_size=0.3)
y_train = y_train.astype(int)
y_test = y_test.astype(int)
print('Train_data.shape: {}'.format(train_data.shape))

Train_data.shape: (398, 30)


In [3]:
nbr = KNeighborsClassifier(n_neighbors=3)
neu = MLPClassifier(hidden_layer_sizes=(32, 8, ), activation='tanh', solver='adam')
sv = SVC(kernel='poly', degree=2, coef0=1, C=0.01, decision_function_shape='ovr', probability=True)
rat = RandomForestClassifier(n_estimators=50)
classifiers = {'KNN': nbr, 'MLP': neu, 'SVM': sv, 'RTr': rat}

km = KMeans(n_clusters=2)
gm = GaussianMixture(n_components=2)
bgm = BayesianGaussianMixture(n_components=2)
clusters = {'KMe': km, 'GM': gm, 'BGM': bgm}

estimator_ = SGDClassifier()
scale = StandardScaler()
rfe = RFECV(estimator_, cv=5)

In [4]:
Xtrain = scale.fit_transform(train_data)
Xtest = scale.transform(test_data)
X_train = rfe.fit_transform(Xtrain, y_train)
X_test = rfe.transform(Xtest)
print('RFECV: score = {0} and No. of features = {1}'.format(rfe.score(Xtrain, y_train), rfe.n_features_))
pca_components = 10

if rfe.n_features_ > pca_components:
    pca = PCA(n_components=pca_components)
    X_train = pca.fit_transform(X_train, y_train)
    X_test = pca.transform(X_test)
else:
    pca = PCA(n_components=rfe.n_features_)
    X_train = pca.fit_transform(X_train, y_train)
    X_test = pca.transform(X_test)


RFECV: score = 0.9723618090452262 and No. of features = 27


In [5]:
for name in classifiers:
    clf = classifiers[name]
    clf.fit(X_train, y_train)
    y_ = clf.predict(X_test)
    
    print('\n {0}: f_score = {1} accuracy = {2}'.format(name, f1_score(y_test, y_), 
                                                        accuracy_score(y_test, y_)))


 KNN: f_score = 0.9586776859504132 accuracy = 0.9707602339181286

 MLP: f_score = 0.9836065573770492 accuracy = 0.9883040935672515

 SVM: f_score = 0.9026548672566371 accuracy = 0.935672514619883

 RTr: f_score = 0.9206349206349206 accuracy = 0.9415204678362573


In [6]:
for name in clusters:
    clu = clusters[name]
    clu.fit(X_train)
    y_ = clu.predict(X_test)
    print('\n {0}: f_score = {1} accuracy = {2}'.format(name, f1_score(y_test, y_), 
                                                        accuracy_score(y_test, y_)))


 KMe: f_score = 0.8571428571428571 accuracy = 0.9064327485380117

 GM: f_score = 0.45454545454545453 accuracy = 0.6491228070175439

 BGM: f_score = 0.391304347826087 accuracy = 0.34502923976608185
