In [1]:
from sklearn.datasets import load_breast_cancer

In [50]:
import pandas as pd
import numpy as np

In [41]:
# Load the breast-cancer dataset bundle; later cells read its `data`,
# `target`, `feature_names` and `target_names` attributes.
brc = load_breast_cancer()

In [42]:
# Show the names of the feature columns.
brc.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [43]:
# Show the class names in index order.
brc.target_names

array(['malignant', 'benign'], dtype='<U9')

In [46]:
# Wrap the raw arrays in DataFrames so that every column carries a label:
# features keep their dataset names, the class vector becomes column 'Target'.
X = pd.DataFrame(brc.data, columns=brc.feature_names)
y = pd.DataFrame({'Target': brc.target})

In [47]:
# Sanity check: features and target must have the same number of rows.
X.shape, y.shape

((569, 30), (569, 1))

In [48]:
# Class balance: number of samples per target value.
y.Target.value_counts()

1    357
0    212
Name: Target, dtype: int64

In [49]:
# Per-column dtype and non-null count (quick check for missing values).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [53]:
# 2x2 correlation matrix of two features; the off-diagonal entry is the
# correlation between 'mean perimeter' and 'mean radius'.
np.corrcoef(X['mean perimeter'],X['mean radius'])

array([[1.        , 0.99785528],
       [0.99785528, 1.        ]])

In [55]:
from sklearn.impute import SimpleImputer

In [56]:
# Toy 4x3 matrix with missing entries (NaN) used to demonstrate imputation.
mat = np.array([
    [7, 16, 31],
    [np.nan, np.nan, 66],
    [12, 5, np.nan],
    [98, np.nan, 92],
])

In [58]:
# Mean imputation: each NaN is replaced by the mean of the observed values
# in the same column of `mat`.
si = SimpleImputer(strategy='mean')
si.fit(mat).transform(mat)

array([[ 7. , 16. , 31. ],
       [39. , 10.5, 66. ],
       [12. ,  5. , 63. ],
       [98. , 10.5, 92. ]])

In [59]:
from sklearn.preprocessing import FunctionTransformer

In [60]:
# Small strictly-positive matrix used as input for the log transform below.
mat = np.array([
    [1, 1],
    [2, 3],
    [10, 100],
])

In [61]:
# Wrap np.log10 as a transformer and apply it element-wise to `mat`.
ft = FunctionTransformer(np.log10)
ft.fit(mat).transform(mat)

array([[0.        , 0.        ],
       [0.30103   , 0.47712125],
       [1.        , 2.        ]])

In [71]:
# Download the iris data file. It has no header row, so column names are
# supplied explicitly.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'label',
    ],
)
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [75]:
# Number of rows and columns in the iris frame.
df.shape

(150, 5)

In [77]:
# Separate the measurement columns from the class label.
# NOTE: this rebinds X and y, which previously held the breast-cancer data.
X = df.drop(columns='label')
y = df['label']

In [72]:
from sklearn.preprocessing import PolynomialFeatures

In [74]:
# Degree-2 polynomial feature expansion.
poly = PolynomialFeatures(degree=2)

In [79]:
# Fit the expansion on the iris measurements and generate the new features.
Z = poly.fit_transform(X)

In [81]:
# Compare the column count before and after the polynomial expansion.
X.shape, Z.shape

((150, 4), (150, 15))

In [87]:
from sklearn.datasets import fetch_california_housing
#from sklearn.preprocessing import 
from sklearn.feature_selection import SelectPercentile, mutual_info_regression

In [90]:
# Load California housing as (features, target).
# NOTE: this rebinds X and y, which previously held the iris data.
X, y = fetch_california_housing(return_X_y=True)

In [91]:
# Rows and feature count of the housing data.
X.shape

(20640, 8)

In [92]:
# Keep only the top 10% of features, ranked by their mutual information
# with the regression target.
X_new = SelectPercentile(
    score_func=mutual_info_regression,
    percentile=10,
).fit_transform(X, y)

In [93]:
# Confirm how many feature columns survived the selection.
X_new.shape

(20640, 1)

In [94]:
from sklearn.model_selection import train_test_split

In [98]:
# Hold out 20% of the housing data for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [99]:
# Size of the training split.
X_train.shape

(16512, 8)

In [100]:
from sklearn.metrics import r2_score
from sklearn.dummy import DummyRegressor

In [101]:
# Baseline model: ignores the features and always predicts the mean of the
# training targets.
dr = DummyRegressor(strategy='mean').fit(X_train, y_train)
y_pred = dr.predict(X_test)

In [103]:
# R^2 of the mean-only baseline on the test split (the reference point that
# real models should beat).
r2_score(y_test, y_pred)

-0.0014734336890012134

In [106]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

In [107]:
# Seed the regressor: SGD shuffles the training samples, so without a fixed
# random_state the fitted coefficients (and the MAE values reported in the
# following cells) change from run to run.
sgd = SGDRegressor(random_state=0)

# Scale every feature to [0, 1]. The scaler is fitted on the training split
# only and then applied unchanged to the test split, so no test information
# leaks into preprocessing.
mm = MinMaxScaler()
X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)

In [108]:
# Train the SGD regressor on the min-max scaled training features.
sgd.fit(X_train_mm, y_train)

In [113]:
# Mean absolute error on the training split (in-sample fit quality).
y_pred = sgd.predict(X_train_mm)
mean_absolute_error(y_true=y_train, y_pred=y_pred)

0.5622015005651495

In [114]:
# Mean absolute error on the held-out test split (generalization estimate).
y_pred = sgd.predict(X_test_mm)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.560373429893628

In [115]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

In [118]:
# 5-fold cross-validation of ordinary least squares on the full data set.
#
# Fold-local variable names are used on purpose: reusing X_train / X_test /
# y_train / y_test here would overwrite the hold-out split created earlier,
# and any earlier evaluation cell that is re-run afterwards would silently
# score against the last fold instead of the real test set.
kf = KFold(n_splits=5)
lr = LinearRegression()
for train_ind, test_ind in kf.split(X):
    X_fold_train, X_fold_test = X[train_ind], X[test_ind]
    y_fold_train, y_fold_test = y[train_ind], y[test_ind]
    lr.fit(X_fold_train, y_fold_train)
    # score() reports R^2 on the fold that was held out
    print(lr.score(X_fold_test, y_fold_test))

0.5486632333951393
0.46820690860543923
0.5507843423339036
0.5369870266519265
0.6605140591532079


In [119]:
from sklearn.datasets import load_diabetes
# Load the diabetes data as (features, target).
# NOTE: this rebinds X and y, which previously held the housing data.
X, y = load_diabetes(return_X_y=True)

In [120]:
# Rows and feature count of the diabetes data.
X.shape

(442, 10)

In [124]:
from sklearn.preprocessing import StandardScaler
# Hold out 20% of the diabetes data for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [127]:
# Standardize the features using statistics from the training split only,
# then apply the same transformation to the test split.
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

# Create the estimator here instead of relying on an `lr` object left over
# from another cell, so this cell does not depend on hidden kernel state.
lr = LinearRegression()
lr.fit(X_train_ss, y_train)
y_pred = lr.predict(X_test_ss)

In [128]:
# R^2 of the linear model on the held-out diabetes test split.
r2_score(y_test, y_pred)

0.5341962544929233

In [129]:
# Fitted coefficients, one per feature. The model was trained on
# standardized inputs, so the values are on a common scale.
lr.coef_

array([ -0.18524015, -10.74395358,  23.97068421,  15.44649018,
       -33.63629412,  19.07747445,   3.87751116,  10.51831194,
        34.15440847,   1.91362494])

In [134]:
# Reload the dataset bundle just to list its feature names
# (presumably in the same column order as X -- confirm against the docs).
load_diabetes().feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [136]:
from sklearn.linear_model import RidgeCV

In [140]:
# New 80/20 split of the diabetes data (different seed from the earlier one).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [141]:
# Learn the scaling statistics from the training split, then apply the
# identical transformation to both splits.
ss = StandardScaler().fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [142]:
# Ridge regression with built-in model selection: 20 candidate alphas spaced
# logarithmically between 1e-4 and 1, compared by 5-fold cross-validated
# (negated) mean absolute error.
clf = RidgeCV(
    alphas=np.logspace(-4, 0, num=20),
    cv=5,
    scoring='neg_mean_absolute_error',
)
clf.fit(X_train_ss, y_train)

RidgeCV(alphas=array([1.00000000e-04, 1.62377674e-04, 2.63665090e-04, 4.28133240e-04,
       6.95192796e-04, 1.12883789e-03, 1.83298071e-03, 2.97635144e-03,
       4.83293024e-03, 7.84759970e-03, 1.27427499e-02, 2.06913808e-02,
       3.35981829e-02, 5.45559478e-02, 8.85866790e-02, 1.43844989e-01,
       2.33572147e-01, 3.79269019e-01, 6.15848211e-01, 1.00000000e+00]),
        cv=5, scoring='neg_mean_absolute_error')

In [143]:
# Cross-validation score of the selected alpha; it is negative because the
# scoring metric is the negated mean absolute error.
clf.best_score_

-43.965198190285555