In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.datasets import load_breast_cancer

In [3]:
# Load the breast-cancer dataset that ships with scikit-learn.
# Its .data, .feature_names and .target attributes are used in the cells below.
breast_cancer = load_breast_cancer()

In [4]:
# Wrap the feature matrix in a DataFrame so columns carry the feature names.
df = pd.DataFrame(
    data=breast_cancer.data,
    columns=breast_cancer.feature_names,
)

In [5]:
# Preview the first rows to sanity-check column names and value scales.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

Unnamed: 0,0
mean radius,False
mean texture,False
mean perimeter,False
mean area,False
mean smoothness,False
mean compactness,False
mean concavity,False
mean concave points,False
mean symmetry,False
mean fractal dimension,False


In [7]:
from sklearn.model_selection import train_test_split

In [9]:
# Feature matrix and target vector exactly as provided by the dataset object.
# NOTE(review): per the scikit-learn docs this target is a binary class label;
# the regressors below treat it as a numeric response. Consider a classifier
# (e.g. LogisticRegression) if class prediction is the actual goal.
x, y = breast_cancer.data, breast_cancer.target

In [10]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=31,
)

In [11]:
# instantiate the lasso regression model

In [12]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression

In [13]:
# Lasso with default hyperparameters (alpha=1.0 is the scikit-learn default).
# NOTE(review): the features are on very different scales (df.head() shows
# mean area ~1e3 next to mean smoothness ~1e-1) and the L1 penalty is
# scale-sensitive; consider standardizing the features before fitting.
lasso = Lasso()

In [14]:
# Fit the default Lasso model on the training split.

lasso.fit(x_train, y_train)

In [15]:
# R^2 (coefficient of determination) of the default Lasso on the
# training split and on the held-out split.
train_score, test_score = (
    lasso.score(x_train, y_train),
    lasso.score(x_test, y_test),
)

In [16]:
# Training-split R^2 for the default Lasso (alpha=1.0).
train_score

0.5600974529893081

In [17]:
# Held-out R^2 for the default Lasso (alpha=1.0).
test_score

0.5832244618818156

In [18]:
# Learned coefficients, one per feature. Entries that are exactly zero are
# features the L1 penalty removed from the model.
lasso.coef_

array([-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -9.10366021e-05,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00,  2.28811188e-04, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -3.29832072e-03, -3.76623718e-04,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00])

In [19]:
# Count the features whose coefficient survived the L1 penalty (non-zero).
coef_used = (lasso.coef_ != 0).sum()

In [20]:
# Number of non-zero coefficients in the default Lasso fit.
coef_used

4

In [None]:
# With the default regularization strength (alpha = 1), only 4 of the 30 features have non-zero coefficients.
# Both the training and test R^2 scores are low (about 0.56 and 0.58), so the model is underfitting the breast-cancer dataset.
# To reduce this underfitting, we can decrease alpha (weaker regularization) and increase the maximum number of iterations.

In [21]:
# Instantiate the Lasso regression model with an alpha value of 0.01.

In [24]:
# Weaker regularization (alpha=0.01) with a one-million-iteration cap
# for the solver.
lasso1 = Lasso(alpha=0.01, max_iter=10 ** 6)

In [25]:
# Fit the alpha=0.01 Lasso on the same training split as the default model.
lasso1.fit(x_train, y_train)

In [26]:
# Coefficients of the alpha=0.01 Lasso; compare the zero pattern with lasso.coef_.
lasso1.coef_

array([ 0.00000000e+00,  2.35980042e-03, -8.82680614e-05,  3.77264318e-04,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -1.85527229e-02, -6.63898919e-04, -0.00000000e+00, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -1.14320024e-01, -1.72229367e-02, -8.65945993e-03,  7.85597901e-04,
       -0.00000000e+00, -0.00000000e+00, -8.77575428e-02, -0.00000000e+00,
       -0.00000000e+00, -0.00000000e+00])

In [27]:
# Count the non-zero coefficients kept by the alpha=0.01 Lasso.
coef1_used = (lasso1.coef_ != 0).sum()

In [28]:
# Number of non-zero coefficients in the alpha=0.01 Lasso fit.
coef1_used

10

In [29]:
# R^2 of the alpha=0.01 Lasso on the training and held-out splits.
train_score1, test_score1 = (
    lasso1.score(x_train, y_train),
    lasso1.score(x_test, y_test),
)

In [30]:
# Training-split R^2 for the alpha=0.01 Lasso.
train_score1

0.7037865778498826

In [31]:
# Held-out R^2 for the alpha=0.01 Lasso.
test_score1

0.6641831577726228

In [None]:
# With alpha = 0.01 and max_iter = 1,000,000, 10 of the 30 features have non-zero coefficients.
# Both the training and test scores increase compared with the default-alpha model.

In [32]:
# Ordinary least-squares baseline (no regularization) to compare against the Lasso fits.
lr = LinearRegression()

In [33]:
# Fit the baseline on the same training split used for the Lasso models.
lr.fit(x_train, y_train)

In [34]:
# Coefficients of the unregularized fit, one per feature.
lr.coef_

array([ 1.67324277e-01, -2.38350190e-03, -1.67262781e-02, -5.17432511e-04,
        2.42781026e+00,  4.11156312e+00, -1.42172340e+00, -3.13091564e+00,
       -2.71400589e-01,  1.19761170e+00, -5.41541779e-01, -2.03189135e-02,
        1.34001822e-02,  2.10709069e-03, -3.96495175e+00, -2.33378721e+00,
        4.45575182e+00, -1.45595817e+01, -1.61030139e+00,  1.42825079e+01,
       -1.88569322e-01, -7.48404826e-03,  5.43187478e-03,  8.95489127e-04,
       -2.93075605e+00, -1.77733846e-02, -4.26139986e-01,  7.40651232e-01,
       -4.52432737e-01, -5.31286802e+00])

In [35]:
# R^2 of the linear-regression baseline on the training and held-out splits.
train_score2, test_score2 = (
    lr.score(x_train, y_train),
    lr.score(x_test, y_test),
)

In [36]:
# Training-split R^2 for the linear-regression baseline.
train_score2

0.7842206194055069

In [37]:
# Held-out R^2 for the linear-regression baseline.
test_score2

0.7329325010888683

In [38]:
# Predict on the held-out split with the linear-regression baseline;
# these predictions feed the error metrics computed later.
y_pred = lr.predict(x_test)

In [39]:
# Show the raw predictions.
# NOTE(review): this dumps the whole array; y_pred[:5] would be enough for a preview.
# Values are continuous and fall outside [0, 1] (e.g. about -0.42 and 1.36 in the
# recorded output) because a linear regressor's output is unbounded.
y_pred

array([ 6.02348324e-01,  7.83236646e-01,  7.23492060e-01,  5.62372456e-01,
        1.07294801e+00,  8.76862637e-01,  1.06474588e+00, -1.40321001e-01,
        1.02588506e+00,  5.50475092e-01,  4.59494681e-02,  9.62380861e-01,
        4.68631018e-01,  8.67015174e-03,  1.36335882e+00,  4.41819490e-01,
        8.14941727e-01,  7.72358241e-01,  8.34017230e-01,  8.10647044e-01,
        1.00442731e+00,  7.13568075e-01,  8.39489836e-01,  1.01251729e+00,
        4.45323261e-01,  8.12867710e-01, -3.09705775e-01,  8.39290608e-01,
        8.59923009e-01,  2.17835908e-01,  9.41765329e-01,  1.81347696e-01,
        9.50688388e-01,  8.84807720e-01,  2.77736604e-01, -8.85709697e-03,
        1.13055032e+00, -1.54897721e-02, -4.21613758e-01,  1.05174818e+00,
        1.13692552e+00,  3.17591386e-01,  5.81374401e-01,  2.25887315e-01,
       -6.75209027e-02,  8.69120938e-01,  9.70615454e-01,  7.62926501e-01,
        3.90379395e-01,  6.89668192e-01,  7.14402562e-01, -1.02399647e-02,
        1.02964882e+00,  

In [40]:
from sklearn import metrics

In [41]:
# Evaluate the linear-regression predictions on the held-out split.
# MSE is computed once and reused for RMSE instead of calling the metric twice.
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)  # same quantity as lr.score(x_test, y_test)
mae = metrics.mean_absolute_error(y_test, y_pred)

print('Root Mean Squared Error : ', rmse)
print('Mean Squared Error : ', mse)
print('R2 Score : ', r2)
print('Mean Absolute Error : ', mae)

Root Mean Squared Error :  0.2484411340355297
Mean Squared Error :  0.06172299708086003
R2 Score :  0.7329325010888683
Mean Absolute Error :  0.1889518730894432
