In [1]:
!pip install -U bayesian-optimization



In [2]:
!pip install scikit-optimize



In [3]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import read_excel
import numpy as num
import scipy.stats
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skopt import BayesSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Load the dataset; a literal '?' in the file is read as a missing value.
df = pd.read_csv(
    'data2.csv',
    na_values='?',
)

In [6]:
# Feature columns: A1..A10 followed by the two categorical columns
feature_columns = [f"A{item}" for item in range(1, 11)] + ['Jaundice', 'Family_mem_with_ASD']

# Target column (the trailing space is part of the name as written here)
target_column = 'Class/ASD Traits '

In [7]:
# Pull the model inputs and the label column out of the loaded frame
features, target = df[feature_columns], df[target_column]

In [8]:
# Convert 'Jaundice' and 'Family_mem_with_ASD' columns from string to integer.
# Each column gets its own LabelEncoder, kept in `feature_encoders`, so the fitted
# string -> integer mapping (classes_) of every column stays available afterwards;
# re-fitting one shared encoder would keep only the mapping of the last column.
features_copy = features.copy()  # leave `features` itself untouched
feature_encoders = {}
for column in ('Jaundice', 'Family_mem_with_ASD'):
    feature_encoders[column] = LabelEncoder()
    features_copy[column] = feature_encoders[column].fit_transform(features_copy[column])

# Convert 'Class/ASD Traits' column from string to integer.
# Encoding straight from df keeps this cell idempotent: re-running it does not
# re-fit the encoder on the already-encoded integers held in `target`.
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(df[target_column])


In [9]:
# Show the encoded feature frame under a "features:" heading (one print call)
print("features:", features_copy, sep="\n")

features:
      A1  A2  A3  A4  A5  A6  A7  A8  A9  A10  Jaundice  Family_mem_with_ASD
0      0   0   0   0   0   0   1   1   0    1         1                    0
1      1   1   0   0   0   1   1   0   0    0         1                    0
2      1   0   0   0   0   0   1   1   0    1         1                    0
3      1   1   1   1   1   1   1   1   1    1         0                    0
4      1   1   0   1   1   1   1   1   1    1         0                    1
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...       ...                  ...
1049   0   0   0   0   0   0   0   0   0    1         0                    1
1050   0   0   1   1   1   0   1   0   1    0         1                    0
1051   1   0   1   1   1   1   1   1   1    1         1                    0
1052   1   0   0   0   0   0   0   1   0    1         0                    1
1053   1   1   0   0   1   1   0   1   1    0         1                    1

[1054 rows x 12 columns]


In [10]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features_copy,
    target,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Baseline classifier: default hyperparameters with a fixed seed
rf = RandomForestClassifier(
    random_state=42,
)

In [12]:
# Fit the baseline forest on the training split
rf.fit(X_train, y_train)

In [13]:
# Score the baseline forest on the held-out split: confusion matrix first,
# then the per-class precision/recall report.
pred_rf = rf.predict(X_test)
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, pred_rf))

[[ 93   5]
 [  4 215]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95        98
           1       0.98      0.98      0.98       219

    accuracy                           0.97       317
   macro avg       0.97      0.97      0.97       317
weighted avg       0.97      0.97      0.97       317



# Define parameter grids or search spaces for Random Search, Grid Search, and Bayesian Optimization

In [14]:
# Search spaces for Grid Search, Random Search, and Bayesian Optimization.

# Candidate values for the exhaustive grid search. n_jobs and verbose each list a
# single value, so they are constant across every candidate.
param_grid_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    n_jobs=[-1],
    criterion=["gini", "entropy"],
    verbose=[0],
)

# Randomized Search draws from the same candidate values; it gets its own dict
# and its own lists so the two search spaces remain independent objects.
param_grid_random = {name: list(choices) for name, choices in param_grid_grid.items()}

# Search space for Bayesian Optimization: two-element tuples give a numeric range
# (presumably read by skopt as (low, high) bounds - confirm), lists give discrete choices.
param_grid_bayes = dict(
    n_estimators=(100, 300),
    max_depth=(5, 15),
    min_samples_split=(2, 10),
    min_samples_leaf=(1, 4),
    max_features=(0.1, 0.9),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
    verbose=[0],
)

Random Search

In [15]:
# Randomized search over param_grid_random, trying 100 sampled combinations.
# Both sources of randomness are seeded so a fresh run reproduces the same
# best_params_: the search's random_state fixes which combinations are sampled,
# the estimator's random_state fixes how each candidate forest is grown.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_random,
    n_iter=100,
    random_state=42,
)
random_search.fit(X_train, y_train)
random_search_best_params = random_search.best_params_

In [16]:
# Show the hyperparameter combination selected by the randomized search
print(
    "Random Search Best Parameters:",
    random_search_best_params,
)

Random Search Best Parameters: {'verbose': 0, 'n_jobs': -1, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': True}


In [17]:
# Tabulate the selected hyperparameters: one row per parameter name,
# a single 'Best Parameter' column holding its value.
best_params_df = pd.DataFrame.from_dict(
    random_search_best_params,
    orient='index',
    columns=['Best Parameter'],
)

# Plain-text rendering of the table
print(best_params_df)

                  Best Parameter
verbose                        0
n_jobs                        -1
n_estimators                 100
min_samples_split              5
min_samples_leaf               2
max_features                sqrt
max_depth                     10
criterion                entropy
bootstrap                   True


Grid Search

In [27]:
# Exhaustive search over every combination in param_grid_grid.
# The forest is seeded so each candidate is trained the same way on every run;
# without it the selected best_params_ can change between runs.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_grid,
)
grid_search.fit(X_train, y_train)
grid_search_best_params = grid_search.best_params_

In [28]:
# Show the hyperparameter combination selected by the grid search
print(
    "Grid Search Best Parameters:",
    grid_search_best_params,
)

Grid Search Best Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100, 'n_jobs': -1, 'verbose': 0}


Bayesian Optimization

In [None]:
# NOTE(review): the Bayesian search is disabled here, so `bayesian_search` is never
# defined on a fresh run; the later cells that would use it are commented out too.
#bayesian_search = BayesSearchCV(RandomForestClassifier(), param_grid_bayes, n_iter=50)
#bayesian_search.fit(X_train, y_train)
#bayesian_search_best_params = bayesian_search.best_params_

In [None]:
#print("Bayesian Optimization Best Parameters:", bayesian_search_best_params)

In [29]:
# Predict the held-out split with the refit best estimator of each search
random_search_pred, grid_search_pred = (
    tuned_search.predict(X_test) for tuned_search in (random_search, grid_search)
)
#bayes_search_pred = bayesian_search.predict(X_test)

In [None]:
# Classification report for the randomized-search model, under its heading
print("Random Search Results:", classification_report(y_test, random_search_pred), sep="\n")

Random Search Results:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        98
           1       0.98      0.99      0.98       219

    accuracy                           0.98       317
   macro avg       0.98      0.97      0.97       317
weighted avg       0.98      0.98      0.98       317



In [None]:
# Classification report for the grid-search model, under its heading
print("Grid Search Results:", classification_report(y_test, grid_search_pred), sep="\n")

Grid Search Results:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        98
           1       0.98      0.99      0.98       219

    accuracy                           0.98       317
   macro avg       0.98      0.97      0.97       317
weighted avg       0.98      0.98      0.98       317



In [None]:
# NOTE(review): this cell is commented out, so the "Bayesian Optimization Results"
# output recorded beneath it was not produced by the code as it stands -
# presumably left over from an earlier run; confirm, then re-run or clear it.
#print("Bayesian Optimization Results:")
#print(classification_report(y_test, bayes_search_pred))

Bayesian Optimization Results:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        98
           1       0.98      1.00      0.99       219

    accuracy                           0.99       317
   macro avg       0.99      0.98      0.99       317
weighted avg       0.99      0.99      0.99       317



In [30]:
# Collect the selected hyperparameters of each search
best_params_random, best_params_grid = random_search.best_params_, grid_search.best_params_
#best_params_bayes = bayesian_search.best_params_

In [31]:
# Train and evaluate the model using the best hyperparameters.
# A fixed random_state makes the refit forest - and therefore y_pred and the model
# files saved from it - repeatable across runs. It is supplied as a default, so a
# random_state already present in best_params_random takes precedence.
rf_best = RandomForestClassifier(**{"random_state": 42, **best_params_random})
rf_best.fit(X_train, y_train)
y_pred = rf_best.predict(X_test)

In [32]:
import pickle

In [33]:
# Persist the fitted estimator itself (not its predictions) so the file can be
# reloaded and used as a model; the with-block closes the handle once written.
with open("model3.pkl", 'wb') as model_file:
    pickle.dump(rf_best, model_file)

In [34]:
# NOTE(review): "model.pkl" is not written by any cell shown in this notebook
# (the cells here write "model3.pkl" and "model.joblib") - confirm the path.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
# The with-block closes the file handle once the object is loaded.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [35]:
from joblib import dump

# Write the refit forest to disk with joblib
dump(rf_best, filename="model.joblib")

['model.joblib']

In [None]:
# Score the refit forest on the held-out split: confusion matrix, then the
# per-class report (kept in `report`).
y_pred = rf_best.predict(X_test)
report = classification_report(y_test, y_pred)
print(confusion_matrix(y_test, y_pred), report, sep="\n")

[[ 92   6]
 [  2 217]]
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        98
           1       0.97      0.99      0.98       219

    accuracy                           0.97       317
   macro avg       0.98      0.96      0.97       317
weighted avg       0.97      0.97      0.97       317

