In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection as skm
from ISLP import confusion_table
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw data file into a DataFrame.
df = pd.read_csv('dataQTM.csv')

# Response: the 'diagnosis' column, kept as a one-column DataFrame.
y = df[['diagnosis']]

# Predictors: every column except the response, 'id', and the
# (reportedly empty) 'Unnamed: 32' column, cast to float64.
X = df.drop(columns=['diagnosis', 'id', 'Unnamed: 32']).astype('float64')

In [3]:
#use the predictors that had the best results in backward subset selection
#(12 columns are listed here; the intercept is added separately below)
selected_predictors = [
    'compactness_mean', 'concave points_mean', 'radius_se', 'smoothness_se',
    'concavity_se', 'concave points_se', 'radius_worst', 'texture_worst',
    'area_worst', 'concavity_worst', 'symmetry_worst',
    'fractal_dimension_worst',
]
X = X[selected_predictors]

#split X: training size 70% and testing size 30%
X_train, X_test, y_train, y_test = skm.train_test_split(X, y, train_size=0.7, random_state=123)

#add constant
X_train = sm.add_constant(X_train)

#change B to be 0 and M to be 1
#Build a new frame with assign() instead of writing into the frame returned
#by train_test_split (which can raise SettingWithCopyWarning), and cast to
#int64 explicitly rather than relying on replace() to downcast the object
#column. Any label other than B/M (or an existing 0/1) makes astype fail
#loudly instead of reaching the model.
diagnosis_codes = {'B': 0, 'M': 1}
y_train = y_train.assign(
    diagnosis=y_train['diagnosis']
    .map(lambda label: diagnosis_codes.get(label, label))
    .astype('int64')
)

#build a logit model
logit_model = sm.GLM(y_train, X_train, family=sm.families.Binomial())

In [4]:
!pip3 install ISLP
from ISLP.models import summarize

#fit the logit model and print a summary
fitted_model = logit_model.fit()
summarize(fitted_model)



Unnamed: 0,coef,std err,z,P>|z|
const,-30.4565,14.542,-2.094,0.036
compactness_mean,-49.7076,34.263,-1.451,0.147
concave points_mean,83.3484,58.551,1.424,0.155
radius_se,9.5727,3.816,2.508,0.012
smoothness_se,345.1333,264.126,1.307,0.191
concavity_se,-146.3706,75.458,-1.94,0.052
concave points_se,243.2589,269.317,0.903,0.366
radius_worst,-0.7451,1.623,-0.459,0.646
texture_worst,0.3329,0.091,3.669,0.0
area_worst,0.021,0.016,1.282,0.2


In [5]:
#add constant to X_test
X_test = sm.add_constant(X_test)

#change B to be 0 and M to be 1
#Build a new frame with assign() instead of writing into the frame returned
#by train_test_split (which can raise SettingWithCopyWarning), and cast to
#int64 explicitly rather than relying on replace() to downcast the object
#column. Values that are already 0/1 pass through unchanged, so re-running
#this cell is harmless; any other label makes astype fail loudly.
diagnosis_codes = {'B': 0, 'M': 1}
y_test = y_test.assign(
    diagnosis=y_test['diagnosis']
    .map(lambda label: diagnosis_codes.get(label, label))
    .astype('int64')
)

In [6]:
# Predicted values from the fitted logit model for the test rows.
y_pred = fitted_model.predict(X_test)

# Classify as 1 when the predicted value is above 0.5, otherwise 0.
y_pred_class = y_pred.gt(0.5).astype(int)

# Flag every test row whose predicted class differs from the recorded
# diagnosis, then count the flags.
is_misclassified = y_pred_class != y_test['diagnosis']
misclassified_logit = is_misclassified.sum()
total = y_test.shape[0]

# Misclassification rate = misclassified rows / all test rows.
logit_misclassification_rate = misclassified_logit / total
logit_misclassification_rate

0.005847953216374269

In [7]:
# Translate the numeric codes back into the B / M labels so the confusion
# table is displayed with the original class names.
code_to_label = {0: 'B', 1: 'M'}
y_test = y_test.replace(code_to_label)
y_pred_class = y_pred_class.replace(code_to_label)

In [9]:
# Cross-tabulate the predicted labels against the recorded diagnosis labels.
confusion_table(y_pred_class, y_test['diagnosis'])

Truth,B,M
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
B,103,1
M,0,67
