In [1]:
import pandas as pd
import pyreadr

# Specify the path to the ALS_progression_rate rds file
file_path = 'ALS_progression_rate.1822x370.rds'

# Read the rds file into a pandas dataframe
r = pyreadr.read_r(file_path)
df = r[None]


In [2]:
df.rename(columns={'dFRS': 'response'}, inplace=True)


In [3]:
# Split the data into training set and prediction set
train_set = df[df['response'].notna()]
prediction_set = df[df['response'].isna()]

print(train_set.shape)
print(prediction_set.shape)


(1622, 370)
(200, 370)


In [5]:
# Import the necessary packages
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

X = train_set.drop('response', axis=1)  # X is all columns in the training set except 'response'
y = train_set['response']
# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare the training data
train_data = lgb.Dataset(X_train, label=y_train)

# Set the parameters for the LightGBM model
params = {
	'objective': 'regression',
	'metric': 'rmse',
	'learning_rate': 0.1,
	'num_leaves': 31,
	'bagging_fraction': 0.8,
	'feature_fraction': 0.8
}

# Train the LightGBM model
model = lgb.train(params, train_data, num_boost_round=100)

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Print the RMSE
print(rmse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28596
[LightGBM] [Info] Number of data points in the train set: 1297, number of used features: 338
[LightGBM] [Info] Start training from score -0.697527
0.5284186966468082


In [6]:
# Import the necessary packages
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

X = train_set.drop('response', axis=1)  # X is all columns in the training set except 'response'
y = train_set['response']
# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM model
model = lgb.LGBMRegressor()

# Define the grid of hyperparameters to search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'bagging_fraction': [0.5, 0.7, 0.8, 0.9],
    'feature_fraction': [0.5, 0.7, 0.8, 0.9]
}

# Set up the grid search
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit the model to the data
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)

# Use the best model to make predictions
predictions = grid.best_estimator_.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Print the RMSE
print(rmse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26709
[LightGBM] [Info] Number of data points in the train set: 1037, number of used features: 336
[LightGBM] [Info] Start training from score -0.685019
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26761
[LightGBM] [Info] Number of data points in the train set: 1037, number of used features: 335
[LightGBM] [Info] Start training from score -0.706567
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26636
[LightGBM] [Info] Number of data points in the train set: 1038, number of used features: 335
[LightGBM] [Info] Star

In [7]:
# Import the necessary packages
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a SVM model
model = SVR()

# Define the grid of hyperparameters to search
param_grid = {
	'C': [0.1, 1, 10],
	'epsilon': [0.01, 0.1, 1],
	'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# Set up the grid search
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit the model to the data
grid.fit(X_train, y_train)

# Print the best parameters found
print(grid.best_params_)

# Use the best model to make predictions
predictions = grid.best_estimator_.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Print the RMSE
print(rmse)

{'C': 10, 'epsilon': 0.01, 'kernel': 'rbf'}
0.5372013793982879


In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Create a Linear Regression model
model = LinearRegression()

# Fit the model to the data
model.fit(X_train, y_train)

# Use the model to make predictions
predictions = model.predict(X_test)

# Calculate the RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Print the RMSE
print(rmse)

0.5888477663633475


In [5]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame, 'response' is your target variable

# Split the data into training set and test set
train, test  = train_test_split(train_set, test_size=0.2, random_state=42)

# Split the data into training+validation set and test set
X = train_set.drop('response', axis=1)
y = train_set['response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the training+validation set into training and validation sets

pycaret from here

In [None]:
from pycaret.regression import *
s = setup(data = train_set, target ='response',train_size=0.8)

In [None]:
best = compare_models()

In [None]:
predict_model(best, data=prediction_set)

lazypredict here

In [6]:
from lazypredict import LazyRegressor
import numpy as np

In [7]:
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

 21%|██▏       | 9/42 [00:12<00:56,  1.70s/it]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


 71%|███████▏  | 30/42 [00:25<00:03,  4.00it/s]

PoissonRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfPoissonLoss'.


 98%|█████████▊| 41/42 [00:41<00:00,  1.36it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28252
[LightGBM] [Info] Number of data points in the train set: 1297, number of used features: 338
[LightGBM] [Info] Start training from score -0.697527


100%|██████████| 42/42 [00:41<00:00,  1.01it/s]


In [8]:
print(models)

                                                      Adjusted R-Squared  \
Model                                                                      
Lars                          373644788190158921494857795071006212096.00   
TransformedTargetRegressor                 15059808942524347258503168.00   
LinearRegression                           15059808942524347258503168.00   
RANSACRegressor                             3221299539381674905174016.00   
SGDRegressor                                                    18011.54   
KernelRidge                                                        18.04   
GaussianProcessRegressor                                           17.17   
MLPRegressor                                                       14.71   
ExtraTreeRegressor                                                 14.70   
PassiveAggressiveRegressor                                         13.23   
DecisionTreeRegressor                                              12.49   
LinearSVR   

and auto-sklearn, guess were going to loonix