In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import string
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')
import yaml
# import ruamel.yaml as ryaml
import json
# import ast

# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, 
                              HistGradientBoostingClassifier, StackingClassifier, VotingClassifier)
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (balanced_accuracy_score, f1_score,
                             accuracy_score, confusion_matrix)

from sklearn.model_selection import train_test_split, StratifiedKFold

from src.constants import *
from src.utils import load_yaml, parameter_tuning, stage_2_processing_function,eval_metrics
# from src.config.configuration_manager import ConfigurationManager
# from src.entity.entity_config import DataSplitConf
# from src.components.stage_5_model_tuning_and_tracking
os.chdir('f:\\iNeuron\\Projects\\Scania_Truck_Failures')

In [2]:
df = pd.read_csv("F:\iNeuron\Projects\scania_failures_2\\artifacts\data\processed\stage_1_processing\preprocessed_train_data.csv").iloc[:10000:]
print("Shape: ",df.shape)
print("NA: ", df.isna().sum().unique())
print("Target Value_Counts: ", df['class'].value_counts())
# os.remove("F:\iNeuron\Projects\scania_failures_2\\artifacts\preprocessor\preprocessor.joblib")
# train,test = train_test_split(df, test_size = 0.25, random_state=8)
transformed_train_df = pd.read_csv("F:\iNeuron\Projects\scania_failures_2\\artifacts\data\processed\stage_2_processing\processed_train_data.csv")
transformed_test_df = pd.read_csv("F:\iNeuron\Projects\scania_failures_2\\artifacts\data\processed\stage_2_processing\processed_test_data.csv")
print(f"\nTransformed_train_df.shape: {transformed_train_df.shape},\nTransformed_test_df.shape: {transformed_test_df.shape}")
x_train, y_train = transformed_train_df.drop(columns='class'), transformed_train_df['class']
x_test, y_test = transformed_test_df.drop(columns = 'class'), transformed_test_df['class']
print(f"\nx_train.shape: {x_train.shape},\ny_train.shape: {y_train.shape},\nx_test.shape: {x_test.shape},\ny_test.shape: {y_test.shape}")
print(f"\nNA in train_data after preprocessing: {x_train.isna().sum().unique()}")
print(f"NA in test_data after preprocessing: {x_test.isna().sum().unique()}")
print(f"\nValue_counts in train_data after preprocessing: {y_train.value_counts()}")
print(f"Value_counts in test_data after preprocessing: {y_test.value_counts()}")

Shape:  (10000, 171)
NA:  [   0 7782  595 2475  443  124  116  107  790  115   99  489  125  127
 3729 4459 6538 7287 7695 7938 8086 8171  128   39  126  563   81  785
   65 1601 1658 2309  718 1719]
Target Value_Counts:  class
0    9805
1     195
Name: count, dtype: int64

Transformed_train_df.shape: (2934, 171),
Transformed_test_df.shape: (976, 171)

x_train.shape: (2934, 170),
y_train.shape: (2934,),
x_test.shape: (976, 170),
y_test.shape: (976,)

NA in train_data after preprocessing: [0]
NA in test_data after preprocessing: [0]

Value_counts in train_data after preprocessing: class
0    1467
1    1467
Name: count, dtype: int64
Value_counts in test_data after preprocessing: class
0    488
1    488
Name: count, dtype: int64


In [3]:
grad_boost_params = {"criterion": "squared_error",
        "loss": "exponential",
        'max_features': 'sqrt',
        'n_estimators': 103}

log_reg_params = {'penalty': 'l2'}

xgb_params = {'booster': 'gbtree',
        'learning_rate': 2.810313927552631,
        'n_estimators': 658,
        'tree_method': 'approx'}

grd_boost_model = GradientBoostingClassifier(**grad_boost_params)
log_reg_model = LogisticRegression(**log_reg_params)
xgb_model = XGBClassifier(**xgb_params)
estimators = [("grd_boost",grd_boost_model),
              ("log_reg",log_reg_model),
              ("xgb", xgb_model)]

final_estimator = XGBClassifier()
stacked_classifer = StackingClassifier(estimators=estimators,
                                       final_estimator=final_estimator,
                                       cv=5,
                                       passthrough = True)
voting_classifier = VotingClassifier(estimators = estimators,
                                                    voting = "hard",
                                                    weights = None,
                                                    n_jobs = -1,
                                                    verbose = True)

In [35]:
for i in range(len(estimators)):
    print(estimators[i][0])

grd_boost
log_reg
xgb


In [36]:
print([estimators[i][0] for i in range(len(estimators))])

['grd_boost', 'log_reg', 'xgb']


In [22]:
from yaml import safe_dump
yaml.dump(data = stacked_classifer.get_params(),
                  stream = open('F:\iNeuron\Projects\scania_failures_2\sample.yaml', 'w'),
                  indent = 4)

In [27]:
with open('F:\iNeuron\Projects\scania_failures_2\sample.yaml') as yaml_file:
    yaml_file = yaml.load(yaml_file, Loader = yaml.CLoader)
yaml_file

{'cv': 5,
 'estimators': [('grd_boost',
   GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                              max_features='sqrt', n_estimators=103)),
  ('log_reg', LogisticRegression()),
  ('xgb',
   XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
                 colsample_bylevel=None, colsample_bynode=None,
                 colsample_bytree=None, device=None, early_stopping_rounds=None,
                 enable_categorical=False, eval_metric=None, feature_types=None,
                 gamma=None, grow_policy=None, importance_type=None,
                 interaction_constraints=None, learning_rate=2.810313927552631,
                 max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                 max_delta_step=None, max_depth=None, max_leaves=None,
                 min_child_weight=None, missing=nan, monotone_constraints=None,
                 multi_strategy=None, n_estimators=658, n_jobs=None,
                 num_pa

In [20]:
yaml_file['estimators'][1][1]

In [None]:
b_estimators = dict(grd_boost = grd_boost_model,
                    log_reg = log_reg_model,
                    xgb =  xgb_model)
estimators_ = list(zip(b_estimators.keys(),b_estimators.values()))
for i in range(len(estimators_)):
    estimators_[0][i] = str(estimators_[0][i])

In [None]:
b_estimators

In [None]:
estimators_[0]

In [None]:
keys, values = zip(*estimators_)

In [None]:
keys

In [None]:
sample = {}
for i in range(len(keys)):
    sample[keys[i]] = values[i]
sample

In [None]:
stacked_classifer.fit(X = x_train, y = y_train)
y_pred_ = stacked_classifer.predict(X = x_test)
cost_ = eval_metrics(y_true = y_test, y_pred=y_pred_)
print(cost_)

In [None]:
voting_classifier.fit(x_train, y_train)
y_pred = voting_classifier.predict(x_test)
cost = eval_metrics(y_true = y_test , y_pred = y_pred)
cost

In [None]:
voting_classifier.get_params()['flatten_transform']

In [None]:
StackingClassifier()

In [None]:
key, value = zip(*stacked_classifer.get_params()['estimators'])

In [None]:
report = {}
report['estimators'] = stacked_classifer.get_params()['estimators']
report['final_estimator'] = stacked_classifer.get_params()['final_estimator']
report


In [None]:
report_1 = {}
report_1['stack_method'] = stacked_classifer.get_params()['stack_method']
report_1['passthrough'] = stacked_classifer.get_params()['passthrough']
report_1

In [None]:
for i in report_1.keys():
    report[i] = report_1[i]

In [None]:
report.keys()

In [None]:
sc = StackingClassifier(**report)
sc


In [None]:
sc.get_params()

In [None]:
key

In [None]:
value

In [None]:
{'Stacked_Classifier': {'Logistic_Regression': {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 
'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'saga', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}, 'SGD_Classifier': {'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 
0.15, 'learning_rate': 'optimal', 'loss': 'huber', 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}, 'Decision_Tree_Classifier': {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'log_loss', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}, 'KNN_Classifier': {'algorithm': 'brute', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 11, 'p': 2, 'weights': 'uniform'}}}

In [None]:
yaml_r = yaml.YAML(typ='rt')

# Read data from the YAML file
with open('params.yaml') as yaml_file:
    params_config = yaml_r.load(yaml_file)

model_name = "Stacked_Classifier"
model_class = stacked_classifer

In [None]:
params_config

In [None]:
# criterion='squared_error', 
# loss='exponential',
# max_features='sqrt', 
# n_estimators=103

grad_boost_params

In [None]:
model.get_params()['grd_boost'].get_params()

In [None]:
model.get_params()['passthrough']

In [None]:
model.get_params()

In [None]:
import optuna
def optuna_objective(trial):
        space_optuna = {}
        for key,value in params_config['optuna'][model_name].items():
            space_optuna[key] = eval(value)
        model = model_class
        model.set_params(**space_optuna)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        cost = eval_metrics(y_true = y_test , y_pred = y_pred)
    
        return cost
tuner_report = {}
find_param=optuna.create_study(direction = "minimize")
find_param.optimize(optuna_objective,n_trials=1)

tuner_report['Optuna'] = {'cost':find_param.best_value, 'params': find_param.best_params}
print (f"Optuna: {model_name} --- {tuner_report['Optuna']}\n\n")

In [None]:
find_param.best_params

In [None]:
data = {'hidden_layer_sizes': (500, 300, 200, 150),
 'activation': 'tanh',
 'learning_rate': 'invscaling',
 'max_iter': 783}

In [None]:
model.get_params()['grd_boost'].get_params()

In [None]:
model.get_params()['log_reg']

In [None]:
model.get_params()['xgb'].get_params()

In [29]:
import ruamel.yaml as yaml
yaml_ = yaml.YAML()
with open('F:\iNeuron\Projects\scania_failures_2\sample.yaml', 'w') as f:
    yaml_.dump(stacked_classifer.get_params(), f)


RepresenterError: cannot represent an object: GradientBoostingClassifier(criterion='squared_error', loss='exponential',
                           max_features='sqrt', n_estimators=103)

In [None]:
from src.utils import save_yaml


save_yaml(tuner_report,'F:\iNeuron\Projects\scania_failures_2\sample.yaml')

In [None]:
file_path = r'F:\iNeuron\Projects\scania_failures_2\sample.json'

with open(file_path, 'w') as file:
    json.dump(obj=sc, fp=file, indent = 2)

In [None]:
with open(file_path,'r') as file:
    data = json.load(file)
data

In [None]:
load_yaml(Path('F:\iNeuron\Projects\scania_failures_2\sample.yaml'))

In [None]:
import json
print(json.dumps(find_param.best_params))