In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# 1. Reading data from CSV
def read_csv(file_path):
    return pd.read_csv(file_path)

# 2. Creating features
def create_features(data):
    # Sepel Area and Petal Area
    data['sapel_area'] = data['sepal length (cm)'] * data['sepal width (cm)']
    data['petal_area'] = data['petal length (cm)'] * data['petal width (cm)']
    return data

# 3. Training a classifier model
def train_classifier(data):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return model, accuracy

# 4. Hyperparameter tuning with Hyperopt
def objective(params):
    model = RandomForestClassifier(**params)
    score = cross_val_score(model, X, y, cv=5).mean()
    return -score  # Minimize negative accuracy

# 5. Evaluating the model on the test set
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [3]:
from sklearn.datasets import load_iris
import numpy as np
data = load_iris(as_frame = True)
df = pd.DataFrame(np.hstack((data.data, np.array(data.target).reshape(-1, 1))), columns = [col for col in data.data.columns] + ['target'])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
# Split data into features and target
X = df.drop('target', axis=1)
y = df['target']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X_train.columns)
        ],
        remainder='passthrough'
    )),
    ('classifier', RandomForestClassifier())
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy}")

# Hyperparameter tuning using Tree of Parzen Estimators (TPE)
space = {
    'n_estimators': hp.choice('n_estimators', range(10, 101)),
    'max_depth': hp.choice('max_depth', range(1, 21))
}

best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)
# algo: The algorithm used for optimization; in this case, it uses tpe.suggest, which is a Bayesian optimization algorithm.

Model accuracy on test set: 1.0
100%|█████████████████████████████████████████████| 100/100 [01:47<00:00,  1.08s/trial, best loss: -0.9666666666666668]


In [5]:
best_params

{'max_depth': 4, 'n_estimators': 67}

In [6]:
# Update the pipeline with the best hyperparameters
pipeline.set_params(classifier__n_estimators = best_params['n_estimators'], classifier__max_depth = best_params['max_depth'])

# Train the model with the best hyperparameters
pipeline.fit(X_train, y_train)

# Evaluate the updated model
evaluate_model(pipeline, X_test, y_test)

1.0

In [7]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('num', StandardScaler(),
                                    Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
          'petal width (cm)'],
         dtype='object'))])),
  ('classifier', RandomForestClassifier(max_depth=4, n_estimators=67))],
 'verbose': False,
 'preprocessor': ColumnTransformer(remainder='passthrough',
                   transformers=[('num', StandardScaler(),
                                  Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'petal width (cm)'],
       dtype='object'))]),
 'classifier': RandomForestClassifier(max_depth=4, n_estimators=67),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'passthrough',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'preprocessor__transformers': [('num',
   StandardScaler(),
   Index(['sepal length 