### Data: the pancreatic cancer biomarker data set (Debernardi et al. 2020) that I have previously used


In [1]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import set_config
set_config(display="diagram")


import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

In [2]:
# Read the Debernardi et al. 2020 CSV and preview its first rows.
raw_data_path = 'data/Debernardi et al 2020 data.csv'
raw_data = pd.read_csv(raw_data_path)
raw_data.head()


Unnamed: 0,sample_id,patient_cohort,sample_origin,age,sex,diagnosis,stage,benign_sample_diagnosis,plasma_CA19_9,creatinine,LYVE1,REG1B,TFF1,REG1A
0,S1,Cohort1,BPTB,33,F,1,,,11.7,1.83222,0.893219,52.94884,654.282174,1262.0
1,S10,Cohort1,BPTB,81,F,1,,,,0.97266,2.037585,94.46703,209.48825,228.407
2,S100,Cohort2,BPTB,51,M,1,,,7.0,0.78039,0.145589,102.366,461.141,
3,S101,Cohort2,BPTB,61,M,1,,,8.0,0.70122,0.002805,60.579,142.95,
4,S102,Cohort2,BPTB,62,M,1,,,9.0,0.21489,0.00086,65.54,41.088,


In [3]:
# Keep only the modelling columns (target `diagnosis` plus five predictors)
# and discard any row that has a missing value in one of them.
model_columns = ['age', 'diagnosis', 'creatinine', 'LYVE1', 'REG1B', 'TFF1']
clean1 = raw_data[model_columns].dropna()
clean1.head()

Unnamed: 0,age,diagnosis,creatinine,LYVE1,REG1B,TFF1
0,33,1,1.83222,0.893219,52.94884,654.282174
1,81,1,0.97266,2.037585,94.46703,209.48825
2,51,1,0.78039,0.145589,102.366,461.141
3,61,1,0.70122,0.002805,60.579,142.95
4,62,1,0.21489,0.00086,65.54,41.088


In [11]:
# Show column dtypes and non-null counts for the cleaned frame.
clean1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         590 non-null    int64  
 1   diagnosis   590 non-null    int64  
 2   creatinine  590 non-null    float64
 3   LYVE1       590 non-null    float64
 4   REG1B       590 non-null    float64
 5   TFF1        590 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 27.8 KB


In [4]:
# Separate the predictors from the `diagnosis` target, then hold out 30% of
# the rows as a test set (fixed seed so the split is reproducible).
y = clean1['diagnosis']
X = clean1.drop(columns='diagnosis')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

#### Lasso Method

In [5]:
# Degree-3 polynomial expansion of the predictors followed by a Lasso fit.
# NOTE(review): the features are not scaled before the Lasso step and the
# cell output appears to contain a coordinate-descent warning -- confirm
# whether scaling (with a re-tuned alpha) is wanted here.
lasso_steps = [
    ('polyfeatures', PolynomialFeatures(degree=3, include_bias=False)),
    ('lasso', Lasso(random_state=42)),
]
pipe = Pipeline(lasso_steps)
pipe.fit(X_train, y_train)

# Keep the fitted coefficients for inspection in a later cell.
lasso_coefs = pipe.named_steps['lasso'].coef_
pipe

  model = cd_fast.enet_coordinate_descent(


In [6]:
# Mean squared error of the Lasso pipeline on the training and test splits.
lasso_train_preds = pipe.predict(X_train)
lasso_test_preds = pipe.predict(X_test)
lasso_train_mse = mean_squared_error(y_train, lasso_train_preds)
lasso_test_mse = mean_squared_error(y_test, lasso_test_preds)
print(lasso_train_mse, lasso_test_mse, sep='\n')

0.3474266606780669
0.7841167973429988


#### Sequential Feature Selection

In [7]:
# Pair each generated polynomial feature name with its Lasso coefficient.
poly_step = pipe.named_steps['polyfeatures']
feature_names = poly_step.get_feature_names_out()
lasso_df = pd.DataFrame({'feature': feature_names, 'coef': lasso_coefs})

In [8]:
# Expand to degree-3 polynomial terms, let a sequential selector driven by
# linear regression keep 6 of them, then fit a linear regression on those.
sequential_steps = [
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('selector', SequentialFeatureSelector(LinearRegression(), n_features_to_select=6)),
    ('linreg', LinearRegression()),
]
sequential_pipe = Pipeline(sequential_steps)
sequential_pipe.fit(X_train, y_train)

# Score the fitted pipeline on both splits.
sequential_train_preds = sequential_pipe.predict(X_train)
sequential_test_preds = sequential_pipe.predict(X_test)
sequential_train_mse = mean_squared_error(y_train, sequential_train_preds)
sequential_test_mse = mean_squared_error(y_test, sequential_test_preds)

print(sequential_train_mse, sequential_test_mse, sep='\n')

0.4222630595980989
0.39204428836806476


#### Grid Search CV

In [13]:
# Standardise the predictors, then fit a ridge regression; the grid searches
# in the following cells tune the ridge penalty over these candidate alphas.
ridge_steps = [('scale', StandardScaler()), ('ridge', Ridge())]
grid_pipe = Pipeline(ridge_steps)
param_dict = {'ridge__alpha': [0.001, 0.1, 1.0, 10.0, 100.0, 1000.0]}

In [14]:


# 5-fold cross-validated search over the ridge alphas, scored by negative MSE.
grid = GridSearchCV(
    estimator=grid_pipe,
    param_grid=param_dict,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train, y_train)

# `predict` on the fitted search uses the refit best estimator.
train_preds = grid.predict(X_train)
test_preds = grid.predict(X_test)
model_1_train_mse = mean_squared_error(y_train, train_preds)
model_1_test_mse = mean_squared_error(y_test, test_preds)
model_1_best_alpha = grid.best_params_

print(model_1_train_mse, model_1_test_mse, model_1_best_alpha, sep='\n')

0.436875398238926
0.41666697084193444
{'ridge__alpha': 10.0}


In [15]:
# 10-fold cross-validated search over the same ridge alphas, scored by
# negative MSE. (The former empty-string placeholder assignments were dead
# stores -- every one of those names is assigned a real value below.)
grid_2 = GridSearchCV(
    estimator=grid_pipe,
    param_grid=param_dict,
    scoring="neg_mean_squared_error",
    cv=10,
)
grid_2.fit(X_train, y_train)

# `predict` on the fitted search uses the refit best estimator.
train_preds2 = grid_2.predict(X_train)
test_preds2 = grid_2.predict(X_test)
model_2_train_mse = mean_squared_error(y_train, train_preds2)
model_2_test_mse = mean_squared_error(y_test, test_preds2)
model_2_best_alpha = grid_2.best_params_

print(model_2_train_mse)
print(model_2_test_mse)
print(model_2_best_alpha)

0.436875398238926
0.41666697084193444
{'ridge__alpha': 10.0}


In [16]:


# Leave-one-out style search: one fold per training row. The fold count is
# derived from the training set instead of being hard-coded (previously 413),
# so the cell keeps working -- and keeps meaning "one row per fold" -- if the
# data or the split size changes.
n_loo_folds = len(X_train)
grid_3 = GridSearchCV(
    estimator=grid_pipe,
    param_grid=param_dict,
    scoring="neg_mean_squared_error",
    cv=n_loo_folds,
)
grid_3.fit(X_train, y_train)

# `predict` on the fitted search uses the refit best estimator.
train_preds3 = grid_3.predict(X_train)
test_preds3 = grid_3.predict(X_test)
model_3_train_mse = mean_squared_error(y_train, train_preds3)
model_3_test_mse = mean_squared_error(y_test, test_preds3)
model_3_best_alpha = grid_3.best_params_

print(model_3_train_mse)
print(model_3_test_mse)
print(model_3_best_alpha)

0.436875398238926
0.41666697084193444
{'ridge__alpha': 10.0}
