# **Logistic Regression in R on Wisconsin Breast cancer (diagnostic) UCI data**

## **Importing the dataset**

In [1]:
ds = read.csv('/content/data.csv')
cat("First three rows of dataset", "\n")
head(ds)

First three rows of dataset 


Unnamed: 0_level_0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,⋯,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst,X
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
1,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,⋯,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
2,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,⋯,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
3,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,⋯,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
4,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,⋯,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
5,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,⋯,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,
6,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,⋯,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,


## **Data preprocessing**

In [2]:
#find number of rows with missing values
sum(!complete.cases(ds$X))

In [3]:
ds = ds[-33]
head(ds)

Unnamed: 0_level_0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst
Unnamed: 0_level_1,<int>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
2,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
3,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
4,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
5,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
6,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [4]:
#find number of rows with missing values
sum(!complete.cases(ds))

In [5]:
# Omit any null rows
ds = na.omit(ds)

## **Encoding the target feature as factor**

In [6]:
ds$diagnosis = factor(ds$diagnosis,levels = c('B', 'M'), labels = c(0, 1))
head(ds)

Unnamed: 0_level_0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
2,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
3,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
4,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
5,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
6,843786,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


## **Splitting the dataset into the Train set and Test set**

In [7]:
ds = ds[-1]
head(ds)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave.points_mean,symmetry_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave.points_worst,symmetry_worst,fractal_dimension_worst
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,⋯,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
2,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,⋯,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
3,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,⋯,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
4,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,⋯,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
5,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,⋯,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
6,1,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,⋯,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [8]:
install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(ds$diagnosis, SplitRatio = 0.8)
head(split)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [9]:
train_set = subset(ds, split == TRUE)
test_set = subset(ds, split == FALSE)

In [10]:
print(head(train_set, 3))

  diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
1         1       17.99        10.38          122.8      1001         0.11840
2         1       20.57        17.77          132.9      1326         0.08474
3         1       19.69        21.25          130.0      1203         0.10960
  compactness_mean concavity_mean concave.points_mean symmetry_mean
1          0.27760         0.3001             0.14710        0.2419
2          0.07864         0.0869             0.07017        0.1812
3          0.15990         0.1974             0.12790        0.2069
  fractal_dimension_mean radius_se texture_se perimeter_se area_se
1                0.07871    1.0950     0.9053        8.589  153.40
2                0.05667    0.5435     0.7339        3.398   74.08
3                0.05999    0.7456     0.7869        4.585   94.03
  smoothness_se compactness_se concavity_se concave.points_se symmetry_se
1      0.006399        0.04904      0.05373           0.01587     0.03003


In [11]:
print(head(test_set, 3))

  diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
4         1       11.42        20.38          77.58     386.1          0.1425
5         1       20.29        14.34         135.10    1297.0          0.1003
8         1       13.71        20.83          90.20     577.9          0.1189
  compactness_mean concavity_mean concave.points_mean symmetry_mean
4           0.2839        0.24140             0.10520        0.2597
5           0.1328        0.19800             0.10430        0.1809
8           0.1645        0.09366             0.05985        0.2196
  fractal_dimension_mean radius_se texture_se perimeter_se area_se
4                0.09744    0.4956     1.1560        3.445   27.23
5                0.05883    0.7572     0.7813        5.438   94.44
8                0.07451    0.5835     1.3770        3.856   50.96
  smoothness_se compactness_se concavity_se concave.points_se symmetry_se
4      0.009110        0.07458      0.05661           0.01867     0.05963


## **Feature Scaling**

In [12]:
train_set[-1] = scale(train_set[-1])
test_set[-1] = scale(test_set[-1])

In [13]:
print(head(train_set, 2))

  diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean
1         1    1.092091   -2.0781202       1.268815 0.9880026       1.6684372
2         1    1.827381   -0.3731651       1.686555 1.9218924      -0.8419933
  compactness_mean concavity_mean concave.points_mean symmetry_mean
1        3.3779611    2.708037423           2.5608895   2.279832090
2       -0.4705914   -0.005159546           0.5643063   0.009980093
  fractal_dimension_mean radius_se texture_se perimeter_se   area_se
1              2.3320653 2.5981471 -0.5589606    2.9944034 2.6659287
2             -0.8571177 0.5205996 -0.8679125    0.2817483 0.7977976
  smoothness_se compactness_se concavity_se concave.points_se symmetry_se
1    -0.1802841      1.3387835    0.6963763         0.6723936   1.1354459
2    -0.5778782     -0.6762646   -0.4190520         0.2709684  -0.8088852
  fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
1           0.87629700     1.875426    -1.3817534       

In [14]:
print(head(test_set, 2))

  diagnosis radius_mean texture_mean perimeter_mean  area_mean smoothness_mean
4         1   -0.717954    0.3570746     -0.5533431 -0.7040368       2.7239964
5         1    1.747317   -1.0952258      1.7583918  1.7664138       0.1583678
  compactness_mean concavity_mean concave.points_mean symmetry_mean
4        3.0529650       1.739499            1.357524    2.57983265
5        0.4021003       1.224434            1.335021   -0.03938653
  fractal_dimension_mean radius_se texture_se perimeter_se    area_se
4              4.4434679 0.2824113 -0.1232779    0.2272455 -0.2421738
5             -0.6295769 1.0946534 -0.8156036    1.0509880  0.9514207
  smoothness_se compactness_se concavity_se concave.points_se symmetry_se
4     0.5130086      2.6281003     1.000576          1.057123   4.8230188
5     1.2636454     -0.1209556     1.011680          1.085876  -0.3348095
  fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
4              2.50347   -0.2570713     0.1676625 

## **Fitting Logistic Regression to the Train set**

In [15]:
train_set = train_set[-3]
test_set = test_set[-3]

In [16]:
#classifier = glm(formula = diagnosis ~ ., family=binomial("logit"), data = train_set)
classifier = glm(formula = diagnosis ~ ., family=binomial, data = train_set)

“glm.fit: algorithm did not converge”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”


## **Predicting the Test set results**

In [17]:
prob_pred = predict(classifier, type = 'response', newdata = test_set)
y_pred = ifelse(prob_pred > 0.5, 1, 0)

## **Generate the Confusion Matrix**

In [18]:
cm = table(y_pred > 0.5, test_set[, 1])
cm

       
         0  1
  FALSE 64  4
  TRUE   7 38

## **Evaluation Metrics**

In [19]:
 n = sum(cm) # number of instances
 nc = nrow(cm) # number of classes
 diag = diag(cm) # number of correctly classified instances per class 
 rowsums = apply(cm, 1, sum) # number of instances per class
 colsums = apply(cm, 2, sum) # number of predictions per class
 p = rowsums / n # distribution of instances over the actual classes
 q = colsums / n # distribution of instances over the predicted classes

In [20]:
accuracy = sum(diag) / n 
accuracy 

In [21]:
precision = diag / colsums 
recall = diag / rowsums 
f1 = 2 * precision * recall / (precision + recall) 
data.frame(precision, recall, f1) 

Unnamed: 0_level_0,precision,recall,f1
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
0,0.9014085,0.9411765,0.9208633
1,0.9047619,0.8444444,0.8735632


In [22]:
summary(classifier)


Call:
glm(formula = diagnosis ~ ., family = binomial, data = train_set)

Deviance Residuals: 
       Min          1Q      Median          3Q         Max  
-1.942e-04  -2.100e-08  -2.100e-08   2.100e-08   2.204e-04  

Coefficients:
                          Estimate Std. Error z value Pr(>|z|)
(Intercept)              1.093e+02  7.499e+04   0.001    0.999
radius_mean              9.050e+02  1.221e+06   0.001    0.999
perimeter_mean          -9.033e+02  9.997e+05  -0.001    0.999
area_mean               -3.460e+02  4.433e+05  -0.001    0.999
smoothness_mean          6.511e+01  3.452e+04   0.002    0.998
compactness_mean        -1.215e+02  4.663e+04  -0.003    0.998
concavity_mean          -8.803e+00  7.688e+04   0.000    1.000
concave.points_mean      1.966e+02  7.326e+04   0.003    0.998
symmetry_mean           -4.074e+01  2.009e+04  -0.002    0.998
fractal_dimension_mean   4.177e+00  2.944e+04   0.000    1.000
radius_se                2.391e+01  1.469e+05   0.000    1.000
texture_se  