In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.linear_model import LogisticRegressionCV
from sklearn import preprocessing, metrics, svm
from sklearn.metrics import accuracy_score, classification_report
from tabpy.tabpy_tools.client import Client
import warnings
warnings.filterwarnings("ignore")

In [7]:
df = pd.read_csv('data_cleaned.csv')

In [8]:
df.head()

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concavepoints,symmetry,fractaldimension
0,842302,Malignant,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,Malignant,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,Malignant,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,Malignant,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,Malignant,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


## Data Preprocessing

In [9]:
df.drop('id',axis=1, inplace=True)

# Change the target variable 'diagnosis' to numeric
encoder = preprocessing.LabelEncoder()
df['diagnosis'] = encoder.fit_transform(df['diagnosis'])

In [12]:
dict(zip(encoder.classes_, encoder.fit_transform(encoder.classes_)))

{'Benign': 0, 'Malignant': 1}

In [13]:
X = np.array(df.drop('diagnosis', axis=1))
y = np.array(df['diagnosis'])

# Standardize the data
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

## SVM

In [22]:
# Define the parameter grid to use for tuning the Support Vector Machine
svm_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Choose the metrices for measuring the models
scoring_methods = ['f1','accuracy','precision', 'recall','roc_auc']

In [23]:
for score in scoring_methods:
    print(f"------Hyperparameter tuning for best {score}------")
    
    svmclf = GridSearchCV(svm.SVC(C=1), parameters, cv=10, scoring=score, n_jobs=-1)
    svmclf.fit(X, y)

    print("  %0.3f  for %r" % (svmclf.best_score_, svmclf.best_params_)) 
    print("\t")
    

------Hyperparameter tuning for best f1------
  0.912  for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
	
------Hyperparameter tuning for best accuracy------
  0.937  for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
	
------Hyperparameter tuning for best precision------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0.981  for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
	
------Hyperparameter tuning for best recall------
  0.897  for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
	
------Hyperparameter tuning for best roc_auc------
  0.986  for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
	


In [24]:
print("Classification report:")
y_pred = svmclf.predict(X)
print(classification_report(y, y_pred))

Classification report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       357
           1       0.98      0.91      0.94       212

    accuracy                           0.96       569
   macro avg       0.96      0.95      0.95       569
weighted avg       0.96      0.96      0.96       569



In [25]:
# Show the best model with optimal parameters
print("Best model:")
print(svmclf.best_estimator_)

Best model:
SVC(C=1000, gamma=0.001)


In [27]:
# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("Auc: %0.3f" % metrics.roc_auc_score(y, y_pred))

Accuracy: 0.958
Auc: 0.947


## Logistic Regression

In [32]:
# Utilize Logistic regression with 10-fold cross-validation
lrclf = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),penalty='l2',scoring='roc_auc',cv=10)
lrclf.fit(X, y)
y_pred = lrclf.predict(X)

In [33]:
print("Classification report:")
print(classification_report(y, y_pred))

Classification report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95       357
           1       0.96      0.87      0.91       212

    accuracy                           0.94       569
   macro avg       0.94      0.93      0.93       569
weighted avg       0.94      0.94      0.94       569



In [34]:
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("Auc: %0.3f" % metrics.roc_auc_score(y, y_pred))

Accuracy: 0.938
Auc: 0.925


Thus I still choose svm for prediction.

## Deploy to Tabpy Server

In [35]:
connection = Client('http://localhost:9004/')

In [36]:
def svmpredict(var1, var2, var3, var4, var5, var6, var7, var8, var9, var10):
    X = np.column_stack([var1, var2, var3, var4, var5, var6, var7, var8, var9, var10])
    X = scaler.transform(X)
    output = svmclf.predict(X)
    return encoder.inverse_transform(output).tolist()

In [37]:
connection.deploy('SVMpredict',
                  svmpredict,
                  'Use SVM classifier to predict the cancer type based on input values', override = True)