In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
data = pd.read_csv('data/TP_1_prostate_dataset.txt', sep='\t')

In [27]:
data.head()

Unnamed: 0,col,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
0,1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
1,2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
2,3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
3,4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
4,5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T


## Variables selection

In [28]:
X = data.iloc[:,1:-2]

In [29]:
y = data.iloc[:,-2]

## Variables standardization

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
X_std = StandardScaler().fit_transform(X)

## Split the dataset

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.25, random_state=0)

In [34]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((72, 8), (25, 8), (72,), (25,))

# Train baseline model for baseline metrics

In [35]:
from sklearn import dummy, metrics 

strategies = ['mean', 'median', 'quantile', 'constant']
quantile_value, constant_value = 0.2, 1
dummy_rmse = 10000.0

for strategy in strategies:
    dum_reg = dummy.DummyRegressor(strategy=strategy, constant=constant_value, quantile=quantile_value)
    dum_reg.fit(X_train, y_train)

    d_R2 = dum_reg.score(X_test, y_test)
    y_pred = dum_reg.predict(X_test)
    d_rmse = metrics.mean_squared_error(y_test, y_pred, squared=False)
    print(f"strategy: {strategy.ljust(8)} -> R2={d_R2:.3f} | RMSE={d_rmse:.3f}")
    
    if d_rmse < dummy_rmse:
        dummy_rmse = d_rmse

strategy: mean     -> R2=-0.069 | RMSE=1.133
strategy: median   -> R2=-0.089 | RMSE=1.144
strategy: quantile -> R2=-0.323 | RMSE=1.260
strategy: constant -> R2=-1.333 | RMSE=1.674



# SVM Regression

In [12]:
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [13]:
#clf = make_pipeline(StandardScaler(), LinearSVR(random_state=0, tol=1e-5))
#clf.fit(X_train, y_train)
clf = LinearSVR(random_state=0, tol=1e-5)
clf.fit(X_train, y_train)

LinearSVR(random_state=0, tol=1e-05)

In [14]:
#print(clf.named_steps['linearsvr'].coef_)
#print(clf.named_steps['linearsvr'].intercept_)
print(clf.coef_)
print(clf.intercept_)
print(clf.predict([[0, 0, 0, 0, 0, 0, 0, 0]]))
print(clf.score(X_test, y_test))

[ 0.63026892  0.26678746 -0.21064226  0.20398833  0.31403647 -0.20157513
  0.17497285  0.07111211]
[2.39662503]
[2.39662503]
0.6371863756082184


## SVM Regression with pipeline

In [15]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
X, y = make_classification(n_features=4, random_state=0)
clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [16]:
print(clf.named_steps['linearsvc'].coef_)
print(clf.named_steps['linearsvc'].intercept_)
print(clf.predict([[0, 0, 0, 0]]))

[[0.14144316 0.52678399 0.67978685 0.49307524]]
[0.16935939]
[1]


## SVM Classification

In [17]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
#X, y = make_classification(n_features=4, random_state=0)

X_std = StandardScaler().fit_transform(X)
clf = LinearSVC(random_state=0, tol=1e-5)
#clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_std, y)

LinearSVC(random_state=0, tol=1e-05)

In [18]:
print(clf.coef_)

print(clf.intercept_)
print(clf.predict([[0, 0, 0, 0]]))

[[0.14144316 0.52678399 0.67978685 0.49307524]]
[0.16935939]
[1]


In [19]:
print(f"accuracy: {clf.score(X_std, y)}")

0.93

## SVM Classification with pipeline

In [20]:
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
X, y = make_regression(n_features=4, random_state=0)

X_std = StandardScaler().fit_transform(X)
clf = LinearSVR(random_state=0, tol=1e-5)
#clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_std, y)

LinearSVR(random_state=0, tol=1e-05)

In [21]:
print(clf.coef_)

print(clf.intercept_)
print(clf.predict([[0, 1, 0, 0]]))

[18.58284419 27.02366194 44.35792369 64.52256231]
[-4.]
[23.02366194]


In [25]:
print(f"accuracy: {clf.score(X_std, y)}")

accuracy: 0.942726760699926
