IMPORTING DEPENDENCIES

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

IMPORTING DATASET

In [20]:
from sklearn.datasets import load_breast_cancer

In [21]:
data = load_breast_cancer()   #loading the dataset

In [24]:
X = pd.DataFrame(data.data, columns = data.feature_names)
y = pd.Series(data.target, name = "target")

EXPLORING THE DATASET

In [28]:
X.shape

(569, 30)

In [29]:
y.shape

(569,)

In [30]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [32]:
X.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


TRAINING

In [41]:
model = LinearRegression()    # model initialization

In [42]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)  #divide the dataset for testing-training

In [43]:
model.fit(X,y)  #trainign the model

TESTING

In [44]:
y_pred = model.predict(X_test)  #predictions

EVALUATION

In [45]:
from sklearn.metrics import r2_score,mean_squared_error

In [48]:
mse = mean_squared_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [50]:
print(mse,r2)  #r2 not meaningful for binary classification

0.05271193757404104 0.7756160036972691


UNDERSTANDING COEFFICIENTS

In [51]:
coefficients = pd.Series(model.coef_, index=X.columns)
coefficients.sort_values(ascending=False).head()


Unnamed: 0,0
fractal dimension error,7.14644
mean compactness,4.222035
concavity error,3.565468
mean radius,0.217772
perimeter error,0.02252


In [52]:
# i.e fractional dimension error impacts the target most, and perimeter has the least impact on the target

REGULARIZATION

In [61]:
# L1--> LASSO REGULARIZATION
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
y_pred_lasso = lasso.predict(X_test)
print("Lasso R2 score: ", r2_score(y_test,y_pred_lasso))

Lasso R2 score:  0.6893625330713165


In [59]:
# L2---> RIDGE REGRESSION
from sklearn.linear_model import Ridge
ridge = Ridge(alpha = 0.1)
ridge.fit(X_train,y_train)
y_pred_ridge = ridge.predict(X_test)
print("Ridge R2 score: ", r2_score(y_test,y_pred_ridge))

Ridge R2 score:  0.757018703246533


In [None]:
# high dimensional data ----> regression line cant be plotted