In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# from sklearn.datasets import fetch_california_housing

In [2]:
# Load the breast-cancer dataset object; later cells read its `.data`,
# `.feature_names` and `.target` attributes.
dataset = load_breast_cancer()

In [3]:
# Build the feature table in one step, labelling each column with its feature name.
data = pd.DataFrame(dataset.data, columns=dataset.feature_names)
print("Shape of data:", data.shape)
data.head()

Shape of data: (569, 30)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## If a detailed description of this dataset is needed, uncomment the following cell

In [4]:
# print (dataset.DESCR)

## Let's run LogisticRegression with all the features of the dataset and see the results

## The feature table above does not contain our target (Y) vector.
## Let's load the dataset's class labels (0 = malignant, 1 = benign) as the Y vector

In [5]:
# Target labels as a one-column DataFrame named "malignant_benign".
y = pd.DataFrame(dataset.target, columns=["malignant_benign"])
y.head()

Unnamed: 0,malignant_benign
0,0
1,0
2,0
3,0
4,0


## Split the data into training and test sets, holding out 20% of the data for testing

In [6]:
# Hold out 20% of the rows as the test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=20,
)

X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
412,9.397,21.68,59.75,268.8,0.07969,0.06053,0.03735,0.005128,0.1274,0.06724,...,9.965,27.99,66.61,301.0,0.1086,0.1887,0.1868,0.02564,0.2376,0.09206
461,27.42,26.27,186.9,2501.0,0.1084,0.1988,0.3635,0.1689,0.2061,0.05623,...,36.04,31.37,251.2,4254.0,0.1357,0.4256,0.6833,0.2625,0.2641,0.07427
532,13.68,16.33,87.76,575.5,0.09277,0.07255,0.01752,0.0188,0.1631,0.06155,...,15.85,20.2,101.6,773.4,0.1264,0.1564,0.1206,0.08704,0.2806,0.07782
495,14.87,20.21,96.12,680.9,0.09587,0.08345,0.06824,0.04951,0.1487,0.05748,...,16.01,28.48,103.9,783.6,0.1216,0.1388,0.17,0.1017,0.2369,0.06599
13,15.85,23.95,103.7,782.7,0.08401,0.1002,0.09938,0.05364,0.1847,0.05338,...,16.84,27.66,112.0,876.5,0.1131,0.1924,0.2322,0.1119,0.2809,0.06287


## Using Logistic Regression without StandardScaler and Regularization 

In [7]:
# Unregularised logistic regression on the raw (unscaled) features.
#   penalty='none'    -> no regularisation term is added to the loss
#   multi_class='ovr' -> one-vs-rest scheme; other options are 'auto' and 'multinomial'
#   max_iter=10000    -> upper bound on the number of lbfgs iterations
#   n_jobs=-1         -> use all available processors
# NOTE(review): newer scikit-learn releases (>= 1.2) spell "no penalty" as
# penalty=None and later deprecate multi_class; adjust these if upgrading.

lr = LogisticRegression(penalty='none', multi_class='ovr', max_iter=10000, solver='lbfgs', n_jobs=-1)
# y_train is a single-column DataFrame; flatten it to a 1-D array so fit()
# receives an (n_samples,) target and does not emit a DataConversionWarning.
lr.fit(X_train, y_train.values.ravel())

  return f(*args, **kwargs)


LogisticRegression(max_iter=10000, multi_class='ovr', n_jobs=-1, penalty='none')

## Although max_iter is set to 10000, training can stop earlier: with the default tolerance tol = 0.0001 (1e-4), the solver stops as soon as its convergence criterion drops below 1e-4, as the iteration count printed below shows

In [8]:
# Training-set accuracy followed by the fitted parameters and solver iteration count.
print("Accuracy: ", lr.score(X_train, y_train))
print("Coefficients: ", lr.coef_)
print("Intercept:", lr.intercept_)
print("Number of iterations:", lr.n_iter_)

Accuracy:  0.9846153846153847
Coefficients:  [[ 2.09514582e+01 -2.81604607e-01 -1.58910686e+00 -7.99545664e-02
  -1.50151736e+01  3.89971338e+00 -2.88408445e+00 -3.28589430e+01
  -1.83646664e+00  4.06025080e+00 -2.95476156e+00  1.34780992e+00
   3.75297696e+00 -5.33626571e-01 -2.24889348e+00  2.86666444e+01
   5.00589529e+01 -2.71705018e+00  6.31374826e+00  4.72703832e+00
  -6.28925000e+00 -3.83643243e-01 -1.21400274e-01  3.74858956e-02
  -3.59808738e+01  2.28445428e+01 -1.50556687e+01 -6.06557478e+01
  -2.81692267e+01  3.84784634e+00]]
Intercept: [24.43947426]
Number of iterations: [7370]


In [9]:
# Accuracy on the held-out test split.
print("Accuracy with test dataset:", lr.score(X_test, y_test))

Accuracy with test dataset: 0.9649122807017544


## Since we have not used regularization (penalty='none'), we could probably get better or roughly the same accuracy in fewer iterations by using an l2 or l1 penalty