In [2]:
import pandas as pd
import pyreadr

# Location of the serialized R object holding the ALS progression-rate table
file_path = 'ALS_progression_rate.1822x370.rds'

# read_r hands back a mapping of the objects stored in the file; the data
# frame is looked up under the key None (an .rds file holds a single object
# that carries no name of its own)
r = pyreadr.read_r(file_path)
df = r[None]


In [3]:
import lightgbm as lgb
import numpy as np

In [4]:
# Expose the 'dFRS' column under the generic name 'response', which is the
# name the modelling cells use for the target. The rename is applied in place.
column_renames = {'dFRS': 'response'}
df.rename(columns=column_renames, inplace=True)


In [5]:
# Rows with a known response are used for fitting; rows where the response is
# missing form the set that still has to be predicted
has_response = df['response'].notna()
train_set = df[has_response]
prediction_set = df[~has_response]

# Report (rows, columns) of each partition
print(train_set.shape)
print(prediction_set.shape)


(1622, 370)
(200, 370)


In [6]:
from sklearn.model_selection import train_test_split


# Separate the target column from the predictors for the labelled rows
y = train_set['response']
X = train_set.drop(columns='response')

# First hold out 20% of the labelled rows as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... then take 25% of the remainder for validation, which leaves a
# 60/20/20 train/validation/test partition of the labelled rows
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42
)




In [7]:
# Wrap the training and validation splits in LightGBM Dataset objects
train_dataset = lgb.Dataset(data=X_train, label=y_train)
valid_dataset = lgb.Dataset(data=X_valid, label=y_valid)


In [40]:
from itertools import product

# Candidate values for each hyper-parameter explored by the manual grid search
gridParams = {
    'learning_rate': [0.3, 0.4, 0.5, 0.6],
    'n_estimators': [200, 300, 400],
    'num_leaves': [31],
    'boosting_type': ['gbdt'],
}

# Best parameter combination seen so far and its cross-validated RMSE
best_params = {}
min_rmse = float('Inf')

# Evaluate every combination with 5-fold cross-validation on the training
# split. product() visits the combinations in the same order as nested loops
# over the four lists would.
for learning_rate, n_estimators, num_leaves, boosting_type in product(
    gridParams['learning_rate'],
    gridParams['n_estimators'],
    gridParams['num_leaves'],
    gridParams['boosting_type'],
):
    params = {
        'learning_rate': learning_rate,
        'n_estimators': n_estimators,
        'num_leaves': num_leaves,
        # A single candidate value, not the whole list of candidates
        'boosting_type': boosting_type,
    }
    cv_results = lgb.cv(params, train_dataset, nfold=5, stratified=False, metrics='l2')

    # 'valid l2-mean' holds one fold-averaged MSE per boosting round. Score the
    # combination by its final round, i.e. the model that n_estimators rounds
    # actually yields; averaging over all rounds would be dominated by the
    # early, under-fitted iterations. The square root turns the MSE into the
    # RMSE that the variable names promise.
    mean_rmse = float(np.sqrt(cv_results['valid l2-mean'][-1]))

    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = params



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26188
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006598 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26188
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005814 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26188
[LightGBM] [Info] Number of data points in the train set: 776, number of used features: 335
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005723 seconds.
You can set `force_col_wise=t

In [41]:
# Alternative hand-picked configuration; it is defined here but not passed to
# the training call below
custom_best_params = {
    'learning_rate': 0.001,
    'num_iterations': 20000,
    'num_leaves': 6000,
    'device': 'cpu',
    'boosting': 'dart',
}

# Fit with the grid-search winner while monitoring the validation split;
# training stops once the validation score has not improved for 5 rounds
early_stop = lgb.early_stopping(stopping_rounds=5)
best_model = lgb.train(
    best_params,
    train_dataset,
    valid_sets=[valid_dataset],
    callbacks=[early_stop],
)

# Score the held-out test features with the fitted booster
predictions = best_model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26188
[LightGBM] [Info] Number of data points in the train set: 972, number of used features: 335
[LightGBM] [Info] Start training from score -0.697553
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[10]	valid_0's l2: 0.307111




In [42]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the predictions against the held-out test targets
test_mse = mean_squared_error(y_test, predictions)
rmse = sqrt(test_mse)
print("RMSE: ", rmse)

RMSE:  0.536923233166944


In [15]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

# Candidate hyper-parameter values for the exhaustive search
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[20, 40, 60],
    num_leaves=[31, 61, 91],
)

# Base estimator; the grid supplies the values of the tuned parameters
model = lgb.LGBMRegressor()

# 5-fold cross-validated search scored with negated RMSE, so a score closer
# to zero is better
grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
grid.fit(X_train, y_train)

# Report the winning combination and its cross-validated score
print("Best parameters: ", grid.best_params_)
print("Best score: ", grid.best_score_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23692
[LightGBM] [Info] Number of data points in the train set: 777, number of used features: 334
[LightGBM] [Info] Start training from score -0.695922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23570
[LightGBM] [Info] Number of data points in the train set: 777, number of used features: 332
[LightGBM] [Info] Start training from score -0.693764
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23853
[LightGBM] [Info] Number of data points in the train set: 778, number of used features: 333
[LightGBM] [Info] Start t

In [16]:


# Fit a fresh regressor on the training split using the winning grid values
best_model = lgb.LGBMRegressor(**grid.best_params_).fit(X_train, y_train)

# Evaluate on the held-out test split and report the root-mean-squared error
predictions = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, predictions)
rmse = sqrt(test_mse)
print("Test RMSE: ", rmse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004683 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26188
[LightGBM] [Info] Number of data points in the train set: 972, number of used features: 335
[LightGBM] [Info] Start training from score -0.697553
Test RMSE:  0.5217656925829479
