# Model Building and Verification

## 1. Modules and Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')


## 2. Load Data

1. Read the CSV file and display few rows
2. Split the data into features and labels
3. Display the shape of the data

In [2]:
# Read the data
promapen_train = pd.read_csv("datasets/ProMapEn/promapen-train_data_similarities.csv")
promapen_test = pd.read_csv("datasets/ProMapEn/promapen-test_data_similarities.csv")

promapen_train.head()

Unnamed: 0,id1,id2,name_id,name_brand,name_words,name_cos,name_descriptives,name_units,name_numbers,short_description_id,...,all_texts_units,all_texts_numbers,all_units_list,all_ids_list,all_numbers_list,all_brands_list,specification_key_matches,specification_key_value_matches,hash_similarity,match
0,https://walmart.com/ip/Herbal-Secrets-Querceti...,https://www.amazon.com/dp/B01LX6PKB6,0.0,0.0,-0.454545,-0.237299,-0.84,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,https://walmart.com/ip/Wrangler-Men-s-Outdoor-...,https://www.amazon.com/dp/B08MY8FFBF,0.0,0.0,0.230769,0.399454,-0.76,0.0,0.0,0.0,...,0,0,0.0,0.0,0.6,0.0,0.0,0.0,0,1
2,https://walmart.com/ip/Tide-Pods-3-in-1-Laundr...,https://www.amazon.com/dp/B01BUNHFQM,0.0,0.0,0.4,0.454827,-0.72,0.0,0.333333,0.0,...,0,0,0.0,0.0,0.846154,0.0,0.0,0.0,0,1
3,https://walmart.com/ip/Nature-s-Nutrition-Turm...,https://www.amazon.com/dp/B07W7N8SLX,0.0,0.0,-0.058824,0.114279,-0.56,0.0,0.0,0.333333,...,0,0,0.714286,0.333333,0.714286,0.0,0.0,0.0,0,0
4,https://walmart.com/ip/Stansport-Enamel-Percol...,https://www.amazon.com/dp/B001DC5HG6,0.0,0.0,0.066667,0.247184,-0.84,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.5,0.25,0,0


In [3]:
# Split the data into features and labels
X_train_promapen = promapen_train.iloc[:, 2:-1]
y_train_promapen = promapen_train.iloc[:, -1]

X_test_promapen = promapen_test.iloc[:, 2:-1]
y_test_promapen = promapen_test.iloc[:, -1]

print(f"ProMapEn Train data: {X_train_promapen.shape}")
print(f"ProMapEn Train label: {y_train_promapen.shape}")
print(f"ProMapEn Test data: {X_test_promapen.shape}")
print(f"ProMapEn Test label: {y_test_promapen.shape}")


ProMapEn Train data: (1244, 34)
ProMapEn Train label: (1244,)
ProMapEn Test data: (311, 34)
ProMapEn Test label: (311,)


## 3. Grid Search 

In [4]:
class CustomGridSearchCV:
    """Extension of Grid Search to accomodate various models"""
    
    def __init__(self, X_train, y_train, X_test, y_test, classifier, param_grid, n_jobs=None, verbose=10):
        """
        Initialize the models
        Args:
            X_train (pd.DataFrame): train data
            y_train (pd.DataFrame): train labels
            X_test (pd.DataFrame): test data
            y_test (pd.DataFrame): test labels
            classifier (sklearn): machine learningmodel 
            param_grid (dict): grid search parameters
        """
        self.base_model = GridSearchCV
        self.estimator = classifier
        self.param_grid = param_grid
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.best_params = None
        self.best_model = None
        
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        
        self.accuracy = None
        self.f1 = None
        self.precision = None
        self.recall = None
        
    def estimate_best_params(self):
        """
        Estimate the best parameters using grid search
        """
        grid_search = self.base_model(estimator=self.estimator(random_state=42), 
                                      param_grid=self.param_grid,
                                      cv=5, 
                                      scoring="f1",
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose)
        
        grid_search.fit(self.X_train, self.y_train)
        
        self.best_params = grid_search.best_params_
        
    def build_test_model_with_best_params(self):
        """
        Build the model using best parameters and test 
        """
        self.best_model = self.estimator(**self.best_params, random_state=42)
        self.best_model.fit(self.X_train, self.y_train)
        
        y_pred = self.best_model.predict(self.X_test)
        
        self.accuracy = accuracy_score(self.y_test, y_pred)
        self.f1 = f1_score(self.y_test, y_pred)
        self.precision = precision_score(self.y_test, y_pred)
        self.recall = recall_score(self.y_test, y_pred)
  
        

### 3.4 Random Forest

In [5]:
# Define the hyperparameter grid for grid search
rf_param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'criterion': ["gini", "entropy"]
}


rf_model = CustomGridSearchCV(X_train=X_train_promapen, 
                              y_train=y_train_promapen,
                              X_test=X_test_promapen,
                              y_test=y_test_promapen,
                              classifier=RandomForestClassifier,
                              param_grid=rf_param_grid)

# Estimate best parameters for Random Forest
rf_model.estimate_best_params()


Fitting 5 folds for each of 128 candidates, totalling 640 fits
[CV 1/5; 1/128] START criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50
[CV 1/5; 1/128] END criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50;, score=0.500 total time=   0.1s
[CV 2/5; 1/128] START criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50
[CV 2/5; 1/128] END criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50;, score=0.443 total time=   0.0s
[CV 3/5; 1/128] START criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50
[CV 3/5; 1/128] END criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50;, score=0.358 total time=   0.0s
[CV 4/5; 1/128] START criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50
[CV 4/5; 1/128] END criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50;, score=0.460 total time=   0.0s
[CV 5/5; 1/128] START criterion=gini, max_depth=5, min_samples_split=2, n_estimators=50
[CV 5/5; 1/128] END c

[CV 5/5; 8/128] END criterion=gini, max_depth=5, min_samples_split=5, n_estimators=500;, score=0.483 total time=   0.3s
[CV 1/5; 9/128] START criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50
[CV 1/5; 9/128] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50;, score=0.519 total time=   0.0s
[CV 2/5; 9/128] START criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50
[CV 2/5; 9/128] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50;, score=0.488 total time=   0.0s
[CV 3/5; 9/128] START criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50
[CV 3/5; 9/128] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50;, score=0.370 total time=   0.0s
[CV 4/5; 9/128] START criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50
[CV 4/5; 9/128] END criterion=gini, max_depth=5, min_samples_split=10, n_estimators=50;, score=0.457 total time=   0.0s
[CV 5/5; 9/128] START criterion=gini, max_de

[CV 4/5; 16/128] END criterion=gini, max_depth=5, min_samples_split=20, n_estimators=500;, score=0.444 total time=   0.3s
[CV 5/5; 16/128] START criterion=gini, max_depth=5, min_samples_split=20, n_estimators=500
[CV 5/5; 16/128] END criterion=gini, max_depth=5, min_samples_split=20, n_estimators=500;, score=0.496 total time=   0.3s
[CV 1/5; 17/128] START criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50
[CV 1/5; 17/128] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50;, score=0.537 total time=   0.0s
[CV 2/5; 17/128] START criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50
[CV 2/5; 17/128] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50;, score=0.508 total time=   0.0s
[CV 3/5; 17/128] START criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50
[CV 3/5; 17/128] END criterion=gini, max_depth=10, min_samples_split=2, n_estimators=50;, score=0.404 total time=   0.0s
[CV 4/5; 17/128] START criterion

[CV 3/5; 24/128] END criterion=gini, max_depth=10, min_samples_split=5, n_estimators=500;, score=0.378 total time=   0.5s
[CV 4/5; 24/128] START criterion=gini, max_depth=10, min_samples_split=5, n_estimators=500
[CV 4/5; 24/128] END criterion=gini, max_depth=10, min_samples_split=5, n_estimators=500;, score=0.476 total time=   0.5s
[CV 5/5; 24/128] START criterion=gini, max_depth=10, min_samples_split=5, n_estimators=500
[CV 5/5; 24/128] END criterion=gini, max_depth=10, min_samples_split=5, n_estimators=500;, score=0.558 total time=   0.5s
[CV 1/5; 25/128] START criterion=gini, max_depth=10, min_samples_split=10, n_estimators=50
[CV 1/5; 25/128] END criterion=gini, max_depth=10, min_samples_split=10, n_estimators=50;, score=0.538 total time=   0.0s
[CV 2/5; 25/128] START criterion=gini, max_depth=10, min_samples_split=10, n_estimators=50
[CV 2/5; 25/128] END criterion=gini, max_depth=10, min_samples_split=10, n_estimators=50;, score=0.492 total time=   0.0s
[CV 3/5; 25/128] START cri

[CV 2/5; 32/128] END criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500;, score=0.515 total time=   0.5s
[CV 3/5; 32/128] START criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500
[CV 3/5; 32/128] END criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500;, score=0.400 total time=   0.5s
[CV 4/5; 32/128] START criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500
[CV 4/5; 32/128] END criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500;, score=0.489 total time=   0.5s
[CV 5/5; 32/128] START criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500
[CV 5/5; 32/128] END criterion=gini, max_depth=10, min_samples_split=20, n_estimators=500;, score=0.535 total time=   0.5s
[CV 1/5; 33/128] START criterion=gini, max_depth=20, min_samples_split=2, n_estimators=50
[CV 1/5; 33/128] END criterion=gini, max_depth=20, min_samples_split=2, n_estimators=50;, score=0.569 total time=   0.1s
[CV 2/5; 33/128] STAR

[CV 1/5; 40/128] END criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500;, score=0.561 total time=   0.5s
[CV 2/5; 40/128] START criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500
[CV 2/5; 40/128] END criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500;, score=0.522 total time=   0.5s
[CV 3/5; 40/128] START criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500
[CV 3/5; 40/128] END criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500;, score=0.431 total time=   0.5s
[CV 4/5; 40/128] START criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500
[CV 4/5; 40/128] END criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500;, score=0.472 total time=   0.5s
[CV 5/5; 40/128] START criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500
[CV 5/5; 40/128] END criterion=gini, max_depth=20, min_samples_split=5, n_estimators=500;, score=0.578 total time=   0.5s
[CV 1/5; 41/128] START cri

[CV 5/5; 47/128] END criterion=gini, max_depth=20, min_samples_split=20, n_estimators=200;, score=0.567 total time=   0.2s
[CV 1/5; 48/128] START criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500
[CV 1/5; 48/128] END criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500;, score=0.554 total time=   0.5s
[CV 2/5; 48/128] START criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500
[CV 2/5; 48/128] END criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500;, score=0.504 total time=   0.5s
[CV 3/5; 48/128] START criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500
[CV 3/5; 48/128] END criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500;, score=0.389 total time=   0.5s
[CV 4/5; 48/128] START criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500
[CV 4/5; 48/128] END criterion=gini, max_depth=20, min_samples_split=20, n_estimators=500;, score=0.492 total time=   0.5s
[CV 5/5; 48/128] 

[CV 4/5; 55/128] END criterion=gini, max_depth=50, min_samples_split=5, n_estimators=200;, score=0.458 total time=   0.2s
[CV 5/5; 55/128] START criterion=gini, max_depth=50, min_samples_split=5, n_estimators=200
[CV 5/5; 55/128] END criterion=gini, max_depth=50, min_samples_split=5, n_estimators=200;, score=0.561 total time=   0.2s
[CV 1/5; 56/128] START criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500
[CV 1/5; 56/128] END criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500;, score=0.565 total time=   0.5s
[CV 2/5; 56/128] START criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500
[CV 2/5; 56/128] END criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500;, score=0.511 total time=   0.5s
[CV 3/5; 56/128] START criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500
[CV 3/5; 56/128] END criterion=gini, max_depth=50, min_samples_split=5, n_estimators=500;, score=0.421 total time=   0.5s
[CV 4/5; 56/128] START cri

[CV 3/5; 63/128] END criterion=gini, max_depth=50, min_samples_split=20, n_estimators=200;, score=0.389 total time=   0.2s
[CV 4/5; 63/128] START criterion=gini, max_depth=50, min_samples_split=20, n_estimators=200
[CV 4/5; 63/128] END criterion=gini, max_depth=50, min_samples_split=20, n_estimators=200;, score=0.473 total time=   0.2s
[CV 5/5; 63/128] START criterion=gini, max_depth=50, min_samples_split=20, n_estimators=200
[CV 5/5; 63/128] END criterion=gini, max_depth=50, min_samples_split=20, n_estimators=200;, score=0.563 total time=   0.2s
[CV 1/5; 64/128] START criterion=gini, max_depth=50, min_samples_split=20, n_estimators=500
[CV 1/5; 64/128] END criterion=gini, max_depth=50, min_samples_split=20, n_estimators=500;, score=0.554 total time=   0.5s
[CV 2/5; 64/128] START criterion=gini, max_depth=50, min_samples_split=20, n_estimators=500
[CV 2/5; 64/128] END criterion=gini, max_depth=50, min_samples_split=20, n_estimators=500;, score=0.515 total time=   0.5s
[CV 3/5; 64/128] 

[CV 1/5; 71/128] END criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200;, score=0.492 total time=   0.1s
[CV 2/5; 71/128] START criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200
[CV 2/5; 71/128] END criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200;, score=0.512 total time=   0.1s
[CV 3/5; 71/128] START criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200
[CV 3/5; 71/128] END criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200;, score=0.358 total time=   0.1s
[CV 4/5; 71/128] START criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200
[CV 4/5; 71/128] END criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200;, score=0.443 total time=   0.1s
[CV 5/5; 71/128] START criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200
[CV 5/5; 71/128] END criterion=entropy, max_depth=5, min_samples_split=5, n_estimators=200;, score=0.466 total time=   0.1s
[CV 1/5;

[CV 4/5; 78/128] END criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=100;, score=0.433 total time=   0.1s
[CV 5/5; 78/128] START criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=100
[CV 5/5; 78/128] END criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=100;, score=0.452 total time=   0.1s
[CV 1/5; 79/128] START criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200
[CV 1/5; 79/128] END criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200;, score=0.484 total time=   0.1s
[CV 2/5; 79/128] START criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200
[CV 2/5; 79/128] END criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200;, score=0.492 total time=   0.1s
[CV 3/5; 79/128] START criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200
[CV 3/5; 79/128] END criterion=entropy, max_depth=5, min_samples_split=20, n_estimators=200;, score=0.374 total time=   0.1s

[CV 2/5; 86/128] END criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100;, score=0.512 total time=   0.1s
[CV 3/5; 86/128] START criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100
[CV 3/5; 86/128] END criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100;, score=0.400 total time=   0.1s
[CV 4/5; 86/128] START criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100
[CV 4/5; 86/128] END criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100;, score=0.472 total time=   0.1s
[CV 5/5; 86/128] START criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100
[CV 5/5; 86/128] END criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=100;, score=0.556 total time=   0.1s
[CV 1/5; 87/128] START criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=200
[CV 1/5; 87/128] END criterion=entropy, max_depth=10, min_samples_split=5, n_estimators=200;, score=0.554 total time=   0.2s

[CV 2/5; 94/128] END criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100;, score=0.530 total time=   0.1s
[CV 3/5; 94/128] START criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100
[CV 3/5; 94/128] END criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100;, score=0.414 total time=   0.1s
[CV 4/5; 94/128] START criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100
[CV 4/5; 94/128] END criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100;, score=0.484 total time=   0.1s
[CV 5/5; 94/128] START criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100
[CV 5/5; 94/128] END criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=100;, score=0.535 total time=   0.1s
[CV 1/5; 95/128] START criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=200
[CV 1/5; 95/128] END criterion=entropy, max_depth=10, min_samples_split=20, n_estimators=200;, score=0.543 total tim

[CV 2/5; 102/128] END criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100;, score=0.493 total time=   0.1s
[CV 3/5; 102/128] START criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100
[CV 3/5; 102/128] END criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100;, score=0.458 total time=   0.1s
[CV 4/5; 102/128] START criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100
[CV 4/5; 102/128] END criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100;, score=0.457 total time=   0.1s
[CV 5/5; 102/128] START criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100
[CV 5/5; 102/128] END criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=100;, score=0.535 total time=   0.1s
[CV 1/5; 103/128] START criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=200
[CV 1/5; 103/128] END criterion=entropy, max_depth=20, min_samples_split=5, n_estimators=200;, score=0.565 total tim

[CV 4/5; 109/128] END criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=50;, score=0.446 total time=   0.1s
[CV 5/5; 109/128] START criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=50
[CV 5/5; 109/128] END criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=50;, score=0.554 total time=   0.1s
[CV 1/5; 110/128] START criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100
[CV 1/5; 110/128] END criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100;, score=0.571 total time=   0.1s
[CV 2/5; 110/128] START criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100
[CV 2/5; 110/128] END criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100;, score=0.522 total time=   0.1s
[CV 3/5; 110/128] START criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100
[CV 3/5; 110/128] END criterion=entropy, max_depth=20, min_samples_split=20, n_estimators=100;, score=0.404 tot

[CV 4/5; 117/128] END criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=50;, score=0.436 total time=   0.1s
[CV 5/5; 117/128] START criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=50
[CV 5/5; 117/128] END criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=50;, score=0.537 total time=   0.1s
[CV 1/5; 118/128] START criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100
[CV 1/5; 118/128] END criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100;, score=0.556 total time=   0.1s
[CV 2/5; 118/128] START criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100
[CV 2/5; 118/128] END criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100;, score=0.496 total time=   0.1s
[CV 3/5; 118/128] START criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100
[CV 3/5; 118/128] END criterion=entropy, max_depth=50, min_samples_split=5, n_estimators=100;, score=0.389 total time= 

[CV 4/5; 125/128] END criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=50;, score=0.450 total time=   0.1s
[CV 5/5; 125/128] START criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=50
[CV 5/5; 125/128] END criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=50;, score=0.512 total time=   0.1s
[CV 1/5; 126/128] START criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100
[CV 1/5; 126/128] END criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100;, score=0.593 total time=   0.1s
[CV 2/5; 126/128] START criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100
[CV 2/5; 126/128] END criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100;, score=0.515 total time=   0.1s
[CV 3/5; 126/128] START criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100
[CV 3/5; 126/128] END criterion=entropy, max_depth=50, min_samples_split=20, n_estimators=100;, score=0.389 tot

In [6]:
# Build the model with best parameters and test
rf_model.build_test_model_with_best_params()

print(f"Best Parameters {rf_model.best_params}")
print(f"F1 Score {rf_model.f1}")
print(f"Precision Score {rf_model.precision}")
print(f"Recall Score {rf_model.recall}")
print(f"Accuracy Score {rf_model.accuracy}")

Best Parameters {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
F1 Score 0.5590062111801243
Precision Score 0.75
Recall Score 0.44554455445544555
Accuracy Score 0.7717041800643086


### 3.5 Neural Networks

In [None]:
# Define the parameter grid for grid search
nn_param_grid = {
    'hidden_layer_sizes': [(10, 10), (50, 50), (10, 50), (10, 10, 10), (50, 50, 50), (50, 10, 50), (10, 50, 10)],
    'activation': ['relu', 'logistic', 'tanh'],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.01, 0.001, 0.0001],
    'max_iter': [50, 100, 500],
}

nn_model = CustomGridSearchCV(X_train=X_train_promapen, 
                              y_train=y_train_promapen,
                              X_test=X_test_promapen,
                              y_test=y_test_promapen,
                              classifier=MLPClassifier,
                              param_grid=nn_param_grid)

# Estimate best parameters for Neural Networks
nn_model.estimate_best_params()


In [None]:
nn_model.build_test_model_with_best_params()

# Build the model with best parameters and test
print(f"Best Parameters {nn_model.best_params}")
print(f"F1 Score {nn_model.f1}")
print(f"Precision Score {nn_model.precision}")
print(f"Recall Score {nn_model.recall}")
print(f"Accuracy Score {nn_model.accuracy}")
