In [2]:
import pandas as pd
# Read the prostate cancer dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="datasets/prostate_cancer.txt")
data.head(n=5)

Unnamed: 0,id,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
0,1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
1,2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
2,3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
3,4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
4,5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T


In [3]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       97 non-null     int64  
 1   lcavol   97 non-null     float64
 2   lweight  97 non-null     float64
 3   age      97 non-null     int64  
 4   lbph     97 non-null     float64
 5   svi      97 non-null     int64  
 6   lcp      97 non-null     float64
 7   gleason  97 non-null     int64  
 8   pgg45    97 non-null     int64  
 9   lpsa     97 non-null     float64
 10  train    97 non-null     object 
dtypes: float64(5), int64(5), object(1)
memory usage: 8.5+ KB


Dataset information

The data for this example come from a study by Stamey et al. (1989) that examined the correlation between the level of prostate specific antigen (PSA) and a number of clinical measures, in 97 men who were about to receive a radical prostatectomy. The goal is to predict the log of PSA (lpsa) from a number of measurements.

Variables:

lpsa	    log prostate specific antigen
lcavol	    log cancer volume
lweight	    log prostate weight
age	        age
lbph	    log of benign prostatic hyperplasia amount
svi	        seminal vesicle invasion
lcp	        log of capsular penetration
gleason	    Gleason score
pgg45	    percent of Gleason scores 4 or 5

This is a supervised learning problem, known as a regression problem, because the outcome measurement is quantitative.

In [23]:
# Separate the target from the predictors: 'lpsa' is what we predict, and the
# 'id' and 'train' columns are excluded from the feature matrix as well.
y = data['lpsa']
X = data.drop(columns=['id', 'lpsa', 'train'])

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)
print(X_train.shape, X_test.shape)


(77, 8) (20, 8)


In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Fit an ordinary least-squares model on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out targets.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.7207312251491266
Mean Absolute Error: 0.6795973961291044
R-squared: 0.6470740302297486


In [28]:
# The scores depend on which rows land in the train and test sets: repeat the
# experiment with a 70/30 split and a different seed to see how they change.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(X_train.shape, X_test.shape)

# Fit ordinary least squares on the new training split.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out targets.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

(67, 8) (30, 8)

Mean Squared Error: 0.4612959596185864
Mean Absolute Error: 0.530383981112604
R-squared: 0.6039185108581981


In [29]:
# Tabulate the fitted coefficients, one row per feature column of X.
df = pd.DataFrame(
    data=model.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
df

Unnamed: 0,Coefficient
lcavol,0.641959
lweight,0.493016
age,-0.025925
lbph,0.079192
svi,0.870546
lcp,-0.171944
gleason,0.092353
pgg45,0.0047


In [31]:
from sklearn.linear_model import Ridge
# Ridge regression with alpha=10, fitted on the unscaled training features
# (fit() returns the estimator, so construction and training are chained).
ridge_model = Ridge(alpha=10).fit(X_train, y_train)

# Predict the held-out targets.
y_pred = ridge_model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.4451186228892906
Mean Absolute Error: 0.49290134977139155
R-squared: 0.6178088202972527


### Data preprocessing: normalization

x_scaled = (x − x_min)/(x_max − x_min)

In [33]:
# Data preprocessing: min-max normalization.

from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training rows only; that same fitted mapping is
# then applied to both the training and the test features.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [34]:
# Fit ordinary least squares on the min-max scaled features.
#
# The train/test split is deliberately NOT recomputed here. X_train_scaled and
# X_test_scaled were already derived from the existing X_train / X_test, so the
# y_train / y_test from that same split are the matching targets. Re-splitting
# after scaling is redundant and would silently misalign features and targets
# if the split parameters ever diverged from the ones used before scaling.
assert len(X_train_scaled) == len(y_train), "scaled train features do not match y_train"
assert len(X_test_scaled) == len(y_test), "scaled test features do not match y_test"

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.46129595961858627
Mean Absolute Error: 0.5303839811126044
R-squared: 0.6039185108581981


In [35]:

# Ridge regression with its default regularization strength, fitted on the
# scaled training features.
ridge_model = Ridge().fit(X_train_scaled, y_train)

# Predict the held-out targets from the scaled test features.
y_pred = ridge_model.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.4158121965885847
Mean Absolute Error: 0.5015838360115886
R-squared: 0.6429721297270723


### Data preprocessing - standardization
x_scaled = (x − mean)/std

In [36]:
# Data preprocessing: standardization.

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows and transform them in one step, then
# apply the same fitted scaler to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
# Fit ordinary least squares on the standardized features.
#
# The train/test split is deliberately NOT recomputed here. X_train_scaled and
# X_test_scaled were already derived from the existing X_train / X_test, so the
# y_train / y_test from that same split are the matching targets. Re-splitting
# after scaling is redundant and would silently misalign features and targets
# if the split parameters ever diverged from the ones used before scaling.
assert len(X_train_scaled) == len(y_train), "scaled train features do not match y_train"
assert len(X_test_scaled) == len(y_test), "scaled test features do not match y_test"

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.46129595961858577
Mean Absolute Error: 0.5303839811126038
R-squared: 0.6039185108581986


In [38]:
# Ridge regression with its default regularization strength, fitted on the
# scaled training features.
ridge_model = Ridge().fit(X_train_scaled, y_train)

# Predict the held-out targets from the scaled test features.
y_pred = ridge_model.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.45217643739611035
Mean Absolute Error: 0.5248240592232962
R-squared: 0.6117487852554129


### Ridge alpha = 1, 10, 100...
#### hyperparameter tuning

In [44]:
# Ridge regression with alpha=10, fitted on the scaled training features.
ridge_model = Ridge(alpha=10).fit(X_train_scaled, y_train)

# Predict the held-out targets from the scaled test features.
y_pred = ridge_model.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.4106297587096994
Mean Absolute Error: 0.4973852781016804
R-squared: 0.6474219144469537


### Lasso regularization

In [50]:
# Lasso regression with alpha=0.01, fitted on the scaled training features.
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.01).fit(X_train_scaled, y_train)

# Predict the held-out targets from the scaled test features.
y_pred = lasso_model.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Mean Squared Error: 0.448602397560171
Mean Absolute Error: 0.5390752892225718
R-squared: 0.6148175548618959


### Putting it all together (Hammasini birlashtiramiz)

In [2]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# Load the dataset.
data = pd.read_csv(filepath_or_buffer="datasets/prostate_cancer.txt")

# Target is 'lpsa'; 'id' and 'train' are dropped from the features along with it.
y = data['lpsa']
X = data.drop(columns=['id', 'lpsa', 'train'])

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape)

# Data preprocessing: standardization. The scaler is fitted on the training
# rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge regression with alpha=10 on the standardized training features.
ridge_model = Ridge(alpha=10).fit(X_train_scaled, y_train)

# Predict the held-out targets from the standardized test features.
y_pred = ridge_model.predict(X_test_scaled)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"\nMean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

(77, 8) (20, 8)

Mean Squared Error: 0.35372260282112006
Mean Absolute Error: 0.43045480521343105
R-squared: 0.7528139307371668
