In [60]:
import pandas as pd
import numpy as np
import sklearn.model_selection as skm
from ISLP import confusion_table
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [102]:
#read the raw dataset from disk
df = pd.read_csv('dataQTM.csv')

#y: the diagnosis column, kept as a one-column DataFrame
y = df[['diagnosis']]

#X: every remaining column except the id and the empty 'Unnamed: 32' column, cast to float64
non_predictors = ['diagnosis', 'id', 'Unnamed: 32']
X = df.drop(columns=non_predictors).astype('float64')

In [103]:
#keep the 13 predictors that had the best results in best subset selection
X = X[[
    'concave points_worst', 'radius_worst', 'texture_worst', 'area_worst',
    'smoothness_se', 'symmetry_worst', 'compactness_se', 'radius_se',
    'fractal_dimension_worst', 'compactness_mean', 'concave points_mean',
    'concavity_worst', 'concavity_se',
]]

#split X and y: training size 70% and testing size 30% (random_state fixes the split)
X_train, X_test, y_train, y_test = skm.train_test_split(X, y, train_size=0.7, random_state=123)

#add the constant (intercept) column to the training predictors
X_train = sm.add_constant(X_train)

#encode the response as integers: B -> 0 and M -> 1
#  - assign() returns a new frame instead of writing a column into the frame
#    handed back by train_test_split, which can raise SettingWithCopyWarning
#  - map() + astype(int) make the integer dtype explicit; relying on replace()
#    to downcast an object column to int is deprecated in recent pandas
#  - the 0 -> 0 and 1 -> 1 entries keep this cell safe to re-run; any other
#    label becomes NaN and astype(int) then fails loudly
y_train = y_train.assign(
    diagnosis=y_train['diagnosis'].map({'B': 0, 'M': 1, 0: 0, 1: 1}).astype(int)
)

#build a logit model (GLM with a binomial family)
logit_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

In [96]:
!pip3 install ISLP
from ISLP.models import summarize

#fit the logit model and print a summary
fitted_model = logit_model.fit()
summarize(fitted_model)



Unnamed: 0,coef,std err,z,P>|z|
const,-38.0917,17.468,-2.181,0.029
concave points_worst,58.7847,37.554,1.565,0.117
radius_worst,-1.6923,1.935,-0.875,0.382
texture_worst,0.4415,0.131,3.38,0.001
area_worst,0.0335,0.021,1.606,0.108
smoothness_se,634.852,253.06,2.509,0.012
symmetry_worst,20.2219,12.461,1.623,0.105
compactness_se,-223.8461,104.148,-2.149,0.032
radius_se,17.3635,5.269,3.295,0.001
fractal_dimension_worst,32.3892,95.511,0.339,0.735


In [105]:
#add the constant (intercept) column to X_test
X_test = sm.add_constant(X_test)

#encode the response as integers: B -> 0 and M -> 1
#  - assign() returns a new frame instead of writing a column into the frame
#    handed back by train_test_split, which can raise SettingWithCopyWarning
#  - map() + astype(int) make the integer dtype explicit; relying on replace()
#    to downcast an object column to int is deprecated in recent pandas
#  - the 0 -> 0 and 1 -> 1 entries keep this cell safe to re-run; any other
#    label becomes NaN and astype(int) then fails loudly
y_test = y_test.assign(
    diagnosis=y_test['diagnosis'].map({'B': 0, 'M': 1, 0: 0, 1: 1}).astype(int)
)

In [107]:
#predicted values for the test set from the fitted logit model
y_pred = fitted_model.predict(X_test)

#assign class 1 when the predicted value is greater than 0.5, otherwise class 0
y_pred_class = y_pred.gt(0.5).astype(int)

#count the test observations whose predicted class differs from the true class
total = len(y_test)
is_misclassified = y_pred_class != y_test['diagnosis']
misclassified_logit = is_misclassified.sum()

#misclassification rate = misclassified observations / all test observations
logit_misclassification_rate = misclassified_logit / total
logit_misclassification_rate

0.017543859649122806

In [109]:
#turn the numeric classes back into labels (0 -> B, 1 -> M) for the confusion table
class_labels = {0: 'B', 1: 'M'}
y_pred_class = y_pred_class.replace(class_labels)
y_test = y_test.replace(class_labels)

In [110]:
#create a confusion table of predicted vs. true labels
#(y_test holds the true test labels; the previously referenced y_test_logit is
#never defined in this notebook and raised NameError on a fresh kernel)
confusion_table(y_pred_class, y_test.diagnosis)

Truth,B,M
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
B,102,2
M,1,66
