In [29]:
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
#import random forest classifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from src.logger import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import yaml


In [30]:
# Read the four transformed CSV artifacts (train/test features and labels)
# in one pass; the files are read in the order the target names are listed.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "artifacts/transformed_data/train_features.csv",
        "artifacts/transformed_data/train_labels.csv",
        "artifacts/transformed_data/test_features.csv",
        "artifacts/transformed_data/test_labels.csv",
    )
)

In [31]:
# Instantiate every candidate classifier with its default constructor arguments.
log_reg, rf, xgb, dt, grad_boost, ada_boost, et = (
    LogisticRegression(),
    RandomForestClassifier(),
    XGBClassifier(),
    DecisionTreeClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    ExtraTreeClassifier(),
)

# Display name -> estimator for the models that get compared.
# NOTE(review): log_reg and ada_boost are created above but are not part of
# this registry -- confirm whether that omission is intentional.
models = dict(zip(
    ("Random Forest", "XGBoost", "Decision Tree", "Gradient Boosting", "Extra Tree"),
    (rf, xgb, dt, grad_boost, et),
))

In [32]:
# Sanity check: echo the display name of every entry in the `models` registry,
# pairing each name with its estimator as the loop goes.
for model_name, model in zip(models.keys(), models.values()):
    print(model_name)

Random Forest
XGBoost
Decision Tree
Gradient Boosting
Extra Tree


In [33]:
def evaluate_models(models:dict, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> pd.DataFrame:
    """Fit every model on the same training split and score it on the same test split.

    Args:
        models (dict): Mapping of display name to estimator, i.e.
            ``{"model_name": model}``. Each estimator must expose ``fit``,
            ``predict`` and ``score``; it is fitted in place.
        X_train (np.ndarray): Training features
        y_train (np.ndarray): Training labels
        X_test (np.ndarray): Testing features
        y_test (np.ndarray): Testing labels

    Returns:
        pd.DataFrame: One row per model, in the iteration order of ``models``,
        with the columns ``Model``, ``Model_specs`` (the fitted estimator),
        ``Accuracy``, ``Precision``, ``Recall`` and ``F1``. Precision, recall
        and F1 are computed with ``average="weighted"``.
    """
    n_models = len(models)
    # Every metric needs its OWN buffer. A chained assignment such as
    # `a = b = c = np.zeros(n)` binds all names to one shared array, so each
    # metric written would overwrite the previous one and every column would
    # end up holding the value written last (the F1 score).
    accuracies = np.zeros(n_models)
    precisions = np.zeros(n_models)
    recalls = np.zeros(n_models)
    f1s = np.zeros(n_models)

    for model_idx, (model_name, model) in enumerate(models.items()):
        logging.info(f"Evaluating {model_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies[model_idx] = accuracy_score(y_test, y_pred)
        precisions[model_idx] = precision_score(y_test, y_pred, average="weighted")
        recalls[model_idx] = recall_score(y_test, y_pred, average="weighted")
        f1s[model_idx] = f1_score(y_test, y_pred, average="weighted")
        logging.info(f"Score for {model_name} is {model.score(X_test, y_test)}")

    # Materialise the dict views as lists so the DataFrame constructor always
    # receives plain sequences aligned with the metric arrays.
    return pd.DataFrame({"Model": list(models.keys()),
                         "Model_specs": list(models.values()),
                         "Accuracy": accuracies,
                         "Precision": precisions,
                         "Recall": recalls,
                         "F1": f1s})

In [34]:
# Parse the hyper-parameter search grids from the project's YAML resource.
# safe_load is used, so only plain YAML data types are constructed.
with open("resources/grid_search_params.yml") as f:  # text read mode is the default
    params = yaml.safe_load(f)

In [35]:
from src.components.model_training import ModelTraining
# Run the project's training component on the transformed train/test CSVs.
# fit_models returns three values, unpacked here as the results, the best
# model and that model's name.
model_training = ModelTraining()
results, best_model, best_model_name = model_training.fit_models(
    X_train_path="artifacts/transformed_data/train_features.csv",
    X_test_path="artifacts/transformed_data/test_features.csv",
    y_train_path="artifacts/transformed_data/train_labels.csv",
    y_test_path="artifacts/transformed_data/test_labels.csv",
)

  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
# import grid_search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Exhaustive search space for the random forest: 6 * 5 * 5 * 5 = 750
# combinations (the last three entries are pinned to a single value).
param_grid = dict(
    n_estimators=[50, 100, 200, 300, 400, 500],
    max_depth=[1, 3, 5, 7, 9],
    min_samples_split=[2, 4, 6, 8, 10],
    min_samples_leaf=[1, 2, 3, 4, 5],
    max_features=["log2"],
    bootstrap=[True],
    criterion=["gini"],
)

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
# NOTE: the forest is created without a random_state, so the selected
# parameters and scores can differ between runs.
classif = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    verbose=2,
    scoring="accuracy",
    n_jobs=-1,
)

# Labels are flattened to 1-D with ravel() before fitting.
classif.fit(X_train.values, y_train.values.ravel())

Fitting 5 folds for each of 750 candidates, totalling 3750 fits
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, criterion=

In [60]:
# (test score, train score) from the fitted search; the search was configured
# with scoring="accuracy", so both numbers are accuracies.
(
    classif.score(X_test.values, y_test.values.ravel()),
    classif.score(X_train.values, y_train.values.ravel()),
)

(0.7787698412698413, 0.8253968253968254)

In [1]:
import yaml
import optuna
from optuna.trial import Trial 

In [2]:
# Load the hyper-parameter search grids from the project's YAML resource.
# The path is relative to the working directory (the project root, as for the
# other relative paths in this notebook) instead of an absolute path inside
# one user's home directory, so the cell runs on any checkout of the project.
with open("resources/grid_search_params.yml", "r") as f:
    params = yaml.safe_load(f)
# Display the parsed configuration as the cell output.
params


{'Decision Tree': {'class_weight': ['balanced', None],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
  'max_features': ['auto', 'sqrt', 'log2', None],
  'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
  'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4],
  'min_impurity_split': [None, 0.1, 0.2, 0.3, 0.4],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'min_samples_split': [2, 3, 5, 7, 9],
  'presort': [True, False],
  'splitter': ['best', 'random']},
 'Extra Tree': {'bootstrap': [True, False],
  'class_weight': ['balanced', None],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
  'max_features': ['auto', 'sqrt', 'log2', None],
  'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
  'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4],
  'min_impurity_split': [None, 0.1, 0.2, 0.3, 0.4],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'min_samples_split': [2, 3, 5, 7, 9],
  'min_weight_fraction_leaf': 

In [10]:
# Show the top-level keys of the loaded YAML config, i.e. the model names
# that have a search grid defined.
params.keys()

dict_keys(['Decision Tree', 'Extra Tree', 'Gradient Boosting', 'Random Forest', 'XGBoost'])

In [22]:
from resources.objective_func import objective
