In [126]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets

df = datasets.load_breast_cancer(as_frame=True)

X = df.data
y = df.target

print(X.shape, y.shape)

(569, 30) (569,)


In [31]:
X1 = X.loc[:, X.columns.str.contains("mean")]
X2 = X.loc[:, ~X.columns.str.contains("mean|worst")]
X3 = X.loc[:, X.columns.str.contains("worst")]

In [None]:
for col in X.columns:
    corr = X[col].corr(y)
    print(f"{col:30s} : {corr:.3f}")

In [127]:
selected_columns = [col for col in X.columns if abs(X[col].corr(y)) > 0.2]
X = X.loc[:, selected_columns]

In [107]:
max_col = max(selected_columns, key=lambda c: abs(X[c].corr(y)))
X = X[max_col]

In [128]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [129]:
y = y.to_numpy().reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [130]:
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

pred = lin_reg.predict(X_test)

mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3} ({mse/np.mean(pred)*100:3.3}%)')

score = lin_reg.score(X_train,y_train)
print('Model determination: ', score)

Mean error: 0.251 (41.4%)
Model determination:  0.7730811579528589


In [131]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(StandardScaler(), PolynomialFeatures(2), LinearRegression())
pipeline.fit(X_train,y_train)

pred = pipeline.predict(X_test)

mse = np.sqrt(mean_squared_error(y_test,pred))
print(f'Mean error: {mse:3.3} ({mse/np.mean(pred)*100:3.3}%)')

score = pipeline.score(X_train,y_train)
print('Model determination: ', score)

Mean error: 1.66 (2.02e+02%)
Model determination:  0.9701253877724552
