#### Install required libraries

In [None]:
! pip uninstall zoish -y
! pip install git+https://github.com/TorkamaniLab/zoish.git
! pip install  scikit-learn numpy lightgbm==4.0.0  nest-asyncio==1.5.7 plotly==5.16.1 pandas xgboost lohrasb ray xgboost feature-engine hyperopt ipywidgets  argcomplete==3.1.1 catboost==1.2.1 --force-reinstall


#### Imports


In [None]:
# Built-in libraries
import logging
import sys  # Added sys for version logging

# Third-party libraries for data manipulation and computation
import pandas as pd
import numpy  # Added numpy for version logging

# Scikit-learn libraries for modeling, metrics, and data transformation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import sklearn  

# XGBoost for machine learning modeling
import xgboost  # Added for version logging
from xgboost import XGBRegressor

# Feature Engine for imputation
from feature_engine.imputation import MeanMedianImputer

# Custom libraries for feature selection
from zoish.feature_selectors.shap_selectors import ShapFeatureSelector, ShapPlotFeatures
import zoish  # Added for version logging

# Ray for distributed computing and hyperparameter tuning
import ray  # Added for version logging
from ray import tune, air
from ray.tune.search.hyperopt import HyperOptSearch

# Import lohrasb
import lohrasb 

# Custom libraries for logging and model building
from lohrasb.best_estimator import BaseModel
from zoish import logger  # Custom logging

# Have IPython display the value of every top-level expression in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Restrict the zoish logger to ERROR records and above
logger.setLevel(logging.ERROR)

# Report the interpreter and library versions in use (helps debugging and reproducing runs)
print(f'Python version : {sys.version}')
versioned_modules = (
    ('zoish', zoish),
    ('sklearn', sklearn),
    ('pandas', pd),
    ('numpy', numpy),
    ('xgboost', xgboost),
    ('ray', ray),
    ('lohrasb', lohrasb),
)
for label, module in versioned_modules:
    # __version__ is read inside the loop so lines are printed one at a time, in order
    print(f'{label} version : {module.__version__}')

In [None]:

# Location of the Parkinsons Telemonitoring dataset in the UCI ML Repository
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data"

# Fraction of rows held out for testing, and the seed that makes the split repeatable
TEST_SIZE = 0.50
RANDOM_STATE = 42

# Load the data
# The Parkinsons Telemonitoring dataset from UCI ML Repository has biomedical voice measurements
# from 42 people with early-stage Parkinson's disease. The goal is to use these measurements
# to predict the UPDRS (Unified Parkinson's Disease Rating Scale) score, which is a widely used
# clinical scale for the disease symptoms. Higher scores represent more severe symptoms.
df = pd.read_csv(data_url)

# Target variable: the motor_UPDRS column, which is a clinician's motor score
y = df["motor_UPDRS"]

# Feature set: every column except "subject#", "test_time" and the two UPDRS columns.
# "total_UPDRS" is removed because it is another target column.
X = df.drop(columns=["subject#", "motor_UPDRS", "total_UPDRS", "test_time"])

# Split the data into a training set and a test set.
# With TEST_SIZE = 0.50, half of the rows are used for training and half for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)


In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

In [None]:
# Names of the float-typed feature columns; these are the columns handed to the imputer
float_cols = X_train.select_dtypes(include=['float']).columns.tolist()

# Estimator whose hyperparameters are searched below
estimator = XGBRegressor()

# Define the search space
# NOTE: tune.randint excludes its upper bound, so max_depth is sampled from {5, 6}
param_space = {
    "max_depth": tune.randint(5, 7),
    "gamma": tune.uniform(0.01, 2),
}

# Create the search algorithm; see the Tune documentation at
# https://docs.ray.io/en/latest/tune/api/suggestion.html
search_alg = HyperOptSearch()

# Keyword arguments for the base model, grouped by the component that consumes them
kwargs = {
    'kwargs': {
        # params for fit method
        'fit_tune_kwargs': {
            'sample_weight': None,
        },
        # params for TuneCV
        'main_tune_kwargs': {
            'cv': 3,
            'scoring': 'r2',
            'estimator': estimator,
        },
        # kwargs of Tuner
        'tuner_kwargs': {
            'tune_config': tune.TuneConfig(
                search_alg=search_alg,
                mode='max',
                metric='score',
            ),
            'param_space': param_space,
            # stop criterion passed to Ray: training_iteration == 20
            'run_config': air.RunConfig(stop={"training_iteration": 20}),
        },
    }
}

obj = BaseModel().optimize_by_tune(**kwargs)

# Mean-impute missing values in the float columns
pipeline = Pipeline([
    ('floatimputer', MeanMedianImputer(
        imputation_method='mean', variables=float_cols)),
])

# Keep the imputed frame and run the tuning on it, so the imputation step
# actually feeds the model instead of its output being thrown away.
X_train_imputed = pipeline.fit_transform(X_train, y_train)
obj.fit(X_train_imputed, y_train)

In [None]:

# SHAP-based feature selection driven by the best estimator exposed by `obj`
estimator_for_feature_selector = obj.best_estimator
shap_feature_selector = ShapFeatureSelector(
    model=estimator_for_feature_selector,
    num_features=5,
    scoring='r2',
    direction='maximum',
    n_iter=10,
    cv=5,
    algorithm='auto',
)

# Final model trained on the output of the feature-selection step
regressor = RandomForestRegressor()

# Chain feature selection and regression into one estimator
pipeline = Pipeline(
    steps=[
        ('feature_selection', shap_feature_selector),
        ('regressor', regressor),
    ]
)

# Train on the training split
pipeline.fit(X_train, y_train)

# Score the held-out split
y_test_pred = pipeline.predict(X_test)

# Show the first 10 predictions
print(y_test_pred[:10])


#### Check performance of the Pipeline


In [None]:
# R^2 of the pipeline's predictions against the test targets
print('r2 score : ')
test_r2 = r2_score(y_test, y_test_pred)
print(test_r2)



#### Plot the feature importance


In [None]:
# Build the plotting helper from the fitted SHAP selector, then draw the bar plot
plot_factory = ShapPlotFeatures(
    shap_feature_selector,
    type_of_plot='bar_plot',
)
plot_factory.bar_plot()


In [None]:
# Draw the summary plot from the same plot factory
# NOTE(review): presumably limited to the selected features — confirm against the zoish docs
plot_factory.summary_plot()

In [None]:
# Draw the "full" variant of the summary plot
# NOTE(review): presumably covers all features rather than only the selected ones — confirm against the zoish docs
plot_factory.summary_plot_full()