In [7]:
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
#import random forest classifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from src.logger import logging
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import yaml
from sklearn.feature_selection import SelectKBest, mutual_info_classif


In [21]:
X_train = pd.read_csv("artifacts/transformed_data/train_features.csv")
y_train = pd.read_csv("artifacts/transformed_data/train_labels.csv")
X_test = pd.read_csv("artifacts/transformed_data/test_features.csv")
y_test = pd.read_csv("artifacts/transformed_data/test_labels.csv")

raw_data = pd.read_csv("artifacts/data/raw_data.csv").drop(columns = ["Id"])
X, y = raw_data.drop(columns = ["Cover_Type"]), raw_data["Cover_Type"]

In [28]:
# get the mutual info scores of the features and see it as a dataframe
mutual_info = mutual_info_classif(X, y.values.ravel())
mutual_info = pd.DataFrame(mutual_info)
mutual_info.index = X.columns
mutual_info.columns = ['mutual_info']
mutual_info.sort_values(by='mutual_info', ascending=False)

cols = mutual_info[:15].index.to_list()


In [38]:
[X[cols].nunique() < 10]

[Elevation                             False
 Aspect                                False
 Slope                                 False
 Horizontal_Distance_To_Hydrology      False
 Vertical_Distance_To_Hydrology        False
 Horizontal_Distance_To_Roadways       False
 Hillshade_9am                         False
 Hillshade_Noon                        False
 Hillshade_3pm                         False
 Horizontal_Distance_To_Fire_Points    False
 Wilderness_Area1                       True
 Wilderness_Area2                       True
 Wilderness_Area3                       True
 Wilderness_Area4                       True
 Soil_Type1                             True
 dtype: bool]

In [31]:
log_reg = LogisticRegression()
rf = RandomForestClassifier()
xgb = XGBClassifier()
dt = DecisionTreeClassifier()
grad_boost = GradientBoostingClassifier()
ada_boost = AdaBoostClassifier()
et = ExtraTreeClassifier()

models = {
    "Random Forest": rf,
    "XGBoost": xgb,
    "Decision Tree": dt,
    "Gradient Boosting": grad_boost,
    "Extra Tree": et
}

In [32]:
#unpack the dict
for model_name, model in models.items():
    print(model_name)

Random Forest
XGBoost
Decision Tree
Gradient Boosting
Extra Tree


In [33]:
def evaluate_models(models:dict, X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> pd.DataFrame:
    """Evaluates a number of models using the same training and testing datasets.
    Args:
        models (dict): A dictionary of models to evaluate
        X_train (np.ndarray): Training features
        y_train (np.ndarray): Training labels
        X_test (np.ndarray): Testing features
        y_test (np.ndarray): Testing labels
    
        models = {"model_name" : model}
    Returns:
        pd.DataFrame: A dataframe of model names and their respective scores
    """
    accuracies = precisions = recalls = f1s  = np.zeros(len(models)) 

    for model_idx, (model_name, model) in enumerate(models.items()):
        logging.info(f"Evaluating {model_name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracies[model_idx] = accuracy_score(y_test, y_pred)
        precisions[model_idx] = precision_score(y_test, y_pred, average="weighted")
        recalls[model_idx] = recall_score(y_test, y_pred, average="weighted")
        f1s[model_idx] = f1_score(y_test, y_pred, average="weighted")
        logging.info(f"Score for {model_name} is {model.score(X_test, y_test)}")
        
    return pd.DataFrame({"Model": models.keys(), 
                         "Model_specs" : models.values(),
                         "Accuracy": accuracies, 
                         "Precision": precisions, 
                         "Recall": recalls, "F1": f1s})

In [34]:
with open("resources/grid_search_params.yml", "r") as f:
    params = yaml.safe_load(f)

In [35]:
from src.components.model_training import ModelTraining
model_training = ModelTraining()
results, best_model, best_model_name = model_training.fit_models(X_train_path= "artifacts/transformed_data/train_features.csv",
                            X_test_path= "artifacts/transformed_data/test_features.csv",
                            y_train_path= "artifacts/transformed_data/train_labels.csv",
                            y_test_path= "artifacts/transformed_data/test_labels.csv")

  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
# import grid_search
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


param_grid = {
    "n_estimators": [50, 100, 200, 300, 400, 500],
    "max_depth": [1, 3, 5, 7, 9],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 3, 4, 5],
    "max_features": [ "log2"],
    "bootstrap": [True],
    "criterion": ["gini"]
}

classif = GridSearchCV(estimator=RandomForestClassifier(),
                             param_grid=param_grid,
                              cv=5,
                              verbose=2,
                              scoring="accuracy",
                              n_jobs=-1)

classif.fit(X_train.values, y_train.values.ravel())

Fitting 5 folds for each of 750 candidates, totalling 3750 fits
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=1, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, criterion=

In [60]:
classif.score(X_test.values, y_test.values.ravel()),  classif.score(X_train.values, y_train.values.ravel())

(0.7787698412698413, 0.8253968253968254)

In [1]:
import yaml
import optuna
from optuna.trial import Trial 

In [2]:
with open("/Users/archismanchakraborti/Desktop/python_files/iNeuron-internship-ForestCover_Prediction/resources/grid_search_params.yml", "r") as f:
    params = yaml.safe_load(f)
params


{'Decision Tree': {'class_weight': ['balanced', None],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
  'max_features': ['auto', 'sqrt', 'log2', None],
  'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
  'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4],
  'min_impurity_split': [None, 0.1, 0.2, 0.3, 0.4],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'min_samples_split': [2, 3, 5, 7, 9],
  'presort': [True, False],
  'splitter': ['best', 'random']},
 'Extra Tree': {'bootstrap': [True, False],
  'class_weight': ['balanced', None],
  'criterion': ['gini', 'entropy'],
  'max_depth': [3, 5, 7, 9, 12, 15, 17, 25],
  'max_features': ['auto', 'sqrt', 'log2', None],
  'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
  'min_impurity_decrease': [0.0, 0.1, 0.2, 0.3, 0.4],
  'min_impurity_split': [None, 0.1, 0.2, 0.3, 0.4],
  'min_samples_leaf': [1, 2, 3, 4, 5],
  'min_samples_split': [2, 3, 5, 7, 9],
  'min_weight_fraction_leaf': 

In [10]:
params.keys()

dict_keys(['Decision Tree', 'Extra Tree', 'Gradient Boosting', 'Random Forest', 'XGBoost'])

In [109]:
from src.components.data_transformation import DataTransformation

transformer = DataTransformation()
preprocessor, label_encoder = transformer.get_data_preprocessor()

data = pd.read_csv("artifacts/data/raw_data.csv").drop(columns = ["Id", "Cover_Type"])
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Get the first component of the preprocessor
preprocessor.named_steps["remover"].fit(data)

# Get the columns to be dropped
data = preprocessor.named_steps["remover"].transform(data)

preprocessor.named_steps["transformer"].fit(data)

data = preprocessor.named_steps["transformer"].transform(data)

In [110]:
preprocessor.fit(data)

SpecificationError: nested renamer is not supported

In [111]:
pd.read_csv("artifacts/transformed_data/test_features.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.768159,-0.089131,-0.055980,-0.654535,-0.804600,-0.927413,0.895730,0.748495,-0.331932,-0.119038,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,-0.235932,-0.388056,-0.293072,-0.246399,-0.211107,0.422037,1.026447,0.265014,-0.614818,0.752883,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,1.905018,1.432672,1.722207,0.137728,1.305597,1.129107,-2.895056,-0.921712,1.539462,-0.061818,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.059716,1.169980,0.418203,-0.942631,-0.639741,2.443203,-1.653247,0.704542,1.670024,-0.667621,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,-1.423208,-0.840974,2.433482,-0.174376,0.745076,-1.110950,0.405542,-3.470976,-2.616774,-0.937371,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3019,0.238497,-0.523931,0.773841,-0.764972,-0.540826,2.401080,1.222522,-0.789854,-1.528754,-1.264341,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3020,-0.455085,-0.895324,0.655295,-0.678543,-0.408938,-1.042500,0.568938,-1.405193,-1.224109,0.098034,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
3021,0.079551,-0.623573,-0.530163,-0.222392,-0.870544,-0.907856,0.830372,0.045250,-0.506016,-1.109030,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3022,-0.982495,0.246028,-0.293072,0.646698,2.987161,-0.605470,0.340184,1.319882,0.451442,-0.994591,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


In [14]:
from src.utils import load_pickle
preprocessor = load_pickle("artifacts/preprocessor/preprocessor.pkl")
model = load_pickle("artifacts/model_data/model.pkl")
import json

In [2]:
model

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>bootstrap</th>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>ccp_alpha</th>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>class_weight</th>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>criterion</th>\n      <td>gini</td>\n    </tr>\n    <tr>\n      <th>max_depth</th>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>max_features</th>\n      <td>sqrt</td>\n    </tr>\n    <tr>\n      <th>max_leaf_nodes</th>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>max_samples</th>\n      <td>None</td>\n    </tr>\n    <tr>\n      <th>min_impurity_decrease</th>\n      <td>0.0</td>\n    </tr>\n    <tr>\n      <th>min_samples_leaf</th>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>min_samples_split</th>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>min_weight_fraction_leaf</th>\n      <td