In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import seaborn as sns
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Read in data and explore

In [None]:
# Load the spreadsheet into a DataFrame.
data = pd.read_excel(r"/content/breast+cancer+coimbra.xlsx")

# Y is the 'Classification' column (the label); X is every remaining column.
Y = data['Classification']
X = data.drop(columns=['Classification'])

# Preview the first rows as the cell output.
data.head()

In [None]:
# Show per-column summary statistics for the loaded data as the cell output.
data.describe()

## 2. Feature analysis

### Look at correlations between features
Note the correlations between HOMA and Insulin, HOMA and Glucose, BMI and Leptin, and Insulin and Glucose. Glucose has the highest correlation with Classification.

In [None]:
# Pairwise correlation matrix over all columns, including the label.
cor = data.corr()

# Draw the annotated heatmap on an explicitly created 8x8 inch axes.
fig, ax = pl.subplots(figsize=(8, 8))
sns.heatmap(
    cor,
    annot=True,
    fmt='.2f',
    linewidths=1.5,
    cmap='RdBu',
    ax=ax,
)
pl.show()

### Principal Component Analysis
No apparent clustering

In [None]:
# PCA is driven by variance, so standardise every feature first; otherwise the
# columns with the largest raw scale dominate the leading components.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=5)
principalComponents = pca.fit_transform(X_scaled)

# The components become columns 0..4; append the class label (with a matching
# 0..n-1 index) so it can be used to colour the scatter plot.
X_pca = pd.concat(
    [pd.DataFrame(data=principalComponents), Y.reset_index(drop=True)],
    axis=1,
)

# Plot the first two components, coloured by class.
ax = X_pca.plot.scatter(x=0, y=1, c='Classification', colormap='RdBu')
ax.set(xlabel='PC 1', ylabel='PC 2', title='First two principal components')
pl.show()

## 3. Generate and test classification models

### Split data into train and test set

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=10,
)

### Scale the data

In [None]:
# Fit the scaler on the training rows only, then apply that same transformation
# to both splits so the test split does not influence the scaling parameters.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

### Logistic regression model

In [None]:
# Fit logistic regression on the scaled training split and predict the test split.
logistic_model = LogisticRegression(random_state=10)
logistic_model.fit(X_train, Y_train)
Y_pred = logistic_model.predict(X_test)

print('train score: ' + str(logistic_model.score(X_train, Y_train)))
print('test score:  ' + str(logistic_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='Logistic regression: confusion matrix')
pl.show()

### K-nearest neighbors model

In [None]:
# Fit a 5-nearest-neighbours classifier (Minkowski metric with p=2) on the
# scaled training split and predict the test split.
kn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
kn_model.fit(X_train, Y_train)
Y_pred = kn_model.predict(X_test)

print('train score: ' + str(kn_model.score(X_train, Y_train)))
print('test score:  ' + str(kn_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='K-nearest neighbors: confusion matrix')
pl.show()

### Linear SVM model

In [None]:
# Fit a linear-kernel support vector classifier on the scaled training split
# and predict the test split.
lsvc_model = SVC(kernel='linear', random_state=10)
lsvc_model.fit(X_train, Y_train)
Y_pred = lsvc_model.predict(X_test)

print('train score: ' + str(lsvc_model.score(X_train, Y_train)))
print('test score:  ' + str(lsvc_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='Linear SVM: confusion matrix')
pl.show()

### Naive Bayes model

In [None]:
# Fit Gaussian naive Bayes on the scaled training split and predict the test split.
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)
Y_pred = nb_model.predict(X_test)

print('train score: ' + str(nb_model.score(X_train, Y_train)))
print('test score:  ' + str(nb_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='Naive Bayes: confusion matrix')
pl.show()

### Decision tree model

In [None]:
# Fit an entropy-criterion decision tree on the scaled training split and
# predict the test split.
dt_model = DecisionTreeClassifier(criterion='entropy', random_state=0)
dt_model.fit(X_train, Y_train)
Y_pred = dt_model.predict(X_test)

print('train score: ' + str(dt_model.score(X_train, Y_train)))
print('test score:  ' + str(dt_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='Decision tree: confusion matrix')
pl.show()

In [None]:
# Rank the decision tree's feature importances, highest first.
importances = list(dt_model.feature_importances_)

# Take the names from X (the predictor columns the model was fitted on) rather
# than data.columns, which also contains the 'Classification' target and only
# lined up with the importances because zip() stops at the shorter sequence.
feature_list = list(X.columns)

feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('{:20} Importance: {}'.format(*pair))

In [None]:
# Fit the extra-trees ensemble in this cell so that the importances below do
# not depend on a model object left over from another kernel session.
# NOTE(review): these hyper-parameters mirror the random-forest cell's criterion
# and seed; the settings originally used for this model are unknown - confirm.
et_model = ExtraTreesClassifier(criterion='entropy', random_state=10)
et_model.fit(X_train, Y_train)

print('train score: ' + str(et_model.score(X_train, Y_train)))
print('test score:  ' + str(et_model.score(X_test, Y_test)))

# Rank the ensemble's feature importances, highest first. Names come from X
# (the predictor columns), not data.columns, which also holds the target.
importances = list(et_model.feature_importances_)
feature_list = list(X.columns)
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('{:20} Importance: {}'.format(*pair))

### Random forest model

In [None]:
# Fit a 5-tree, entropy-criterion random forest on the scaled training split
# and predict the test split.
forest_model = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=10)
forest_model.fit(X_train, Y_train)
Y_pred = forest_model.predict(X_test)

print('train score: ' + str(forest_model.score(X_train, Y_train)))
print('test score:  ' + str(forest_model.score(X_test, Y_test)))

# Compute the accuracy once and reuse it for the printout.
ac = accuracy_score(Y_test, Y_pred)
print("Acc:", ac)

# Rows are true classes, columns are predicted classes. The cells are integer
# counts, so they are annotated as integers.
# NOTE(review): the tick labels assume the smaller class label means "healthy"
# (confusion_matrix orders classes by sorted label value) - confirm.
cm = confusion_matrix(Y_test, Y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d',
                 xticklabels=['Healthy', 'Cancer'], yticklabels=['Healthy', 'Cancer'])
ax.set(xlabel='Predicted Class', ylabel='True Class',
       title='Random forest: confusion matrix')
pl.show()

In [None]:
# Rank the random forest's feature importances, highest first.
importances = list(forest_model.feature_importances_)

# Take the names from X (the predictor columns the model was fitted on) rather
# than data.columns, which also contains the 'Classification' target and only
# lined up with the importances because zip() stops at the shorter sequence.
feature_list = list(X.columns)

feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('{:20} Importance: {}'.format(*pair))

In [None]:
# Build the comparison table in this cell so that displaying it does not rely
# on a `models` object left over from another kernel session.
# NOTE(review): the table's intended contents are unknown; train/test accuracy
# per classifier is an assumption - confirm.
fitted_models = {
    'Logistic regression': logistic_model,
    'K-nearest neighbors': kn_model,
    'Linear SVM': lsvc_model,
    'Naive Bayes': nb_model,
    'Decision tree': dt_model,
    'Random forest': forest_model,
}
# The extra-trees model is included only if it has been fitted in this session.
if 'et_model' in globals():
    fitted_models['Extra trees'] = et_model

# One row per classifier, best test accuracy first.
models = (
    pd.DataFrame(
        [
            {
                'model': name,
                'train score': clf.score(X_train, Y_train),
                'test score': clf.score(X_test, Y_test),
            }
            for name, clf in fitted_models.items()
        ]
    )
    .sort_values('test score', ascending=False)
    .reset_index(drop=True)
)
models

The best performers are Logistic Regression, Linear SVM, and Extra Trees. As expected, Glucose is the feature with the highest importance for all the decision-tree-based models.

## 4. Conclusion
The best-performing model was Logistic Regression, with sensitivity and specificity of 83%. The top predictive features were Glucose, Age, Resistin, BMI, and Insulin. These results indicate that there is some relationship between obesity/metabolic dysregulation and breast cancer, and that this model may be used as a biomarker of breast cancer.