In [None]:
## Import libraries
import os

## Data analysis and wrangling
import numpy as np
import pandas as pd
import random as rnd

## Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
from ydata_profiling import ProfileReport
%matplotlib inline 
from scipy.stats import boxcox

# Metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef


# Machine learning: classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier

# Model selection
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


# Seed for reproducibility; passed as random_state to the estimators below
SEED = 42

# Five hex colours kept in one place for the notebook's plots
palette = ['#328ca9', '#0e6ea9', '#2c4ea3', '#193882', '#102446']

# Seaborn global theme: white background with grid lines
sns.set(style="whitegrid")

In [None]:
# Read the cleaned data, preprocess it, encode the target and create a
# stratified train/validation split.

from sklearn.preprocessing import LabelEncoder

from utils import PreprocessData

# Render sklearn estimators/pipelines as diagrams in cell output
set_config(display='diagram')

# Directory holding the cleaned CSV files. Paths are built with os.path.join
# so the cell runs on both Windows and POSIX (the previous backslash literals
# only resolved on Windows).
DATA_DIR = 'Output'

# Specify the data types for columns with mixed types
dtype_spec = {
    'cap-diameter': 'float16',
    'stem-height': 'float16',
    'stem-width': 'float16',
    'does-bruise-or-bleed': 'category',
    'has-ring': 'category'
}

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_cleaned.csv'), dtype=dtype_spec)
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_cleaned.csv'), dtype=dtype_spec)
target_df = pd.read_csv(os.path.join(DATA_DIR, 'target.csv'), dtype='category')

# Preprocess the data
X, preprocessor = PreprocessData(train_df)
# NOTE(review): this second call rebinds `preprocessor`, so the object returned
# for train_df is discarded. Confirm that PreprocessData applies the same
# transformation to both frames, or keep the train preprocessor and reuse it.
X1, preprocessor = PreprocessData(test_df)

# Encode the target variable.
# read_csv returns a DataFrame; squeeze the (presumably single) column into a
# Series so LabelEncoder receives 1-D input instead of relying on implicit
# column-vector flattening. A multi-column file still raises, as before.
le = LabelEncoder()
y = le.fit_transform(target_df.squeeze("columns"))

# Split the entire data into training and validation sets.
# Use the shared SEED rather than a repeated literal so every random component
# of the notebook is controlled from one place.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

In [None]:
# Load each tuned model from disk, refit it on the training split and report
# the Matthews correlation coefficient on the train and validation splits.

# NOTE(review): the original cell called an undefined `load`. pickle.load is
# assumed because the files end in ".pkl"; if they were written with
# joblib.dump, use joblib.load here instead.
import pickle

from utils import MCC

# Candidate models. Only the dictionary keys are used in this cell, to locate
# the matching "<name>_tuning.pkl" file.
models = {
    # "Logistic Regression": LogisticRegression(random_state=SEED),
    # "Random Forest Classifier": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=SEED),
    # "XGBClassifier": XGBClassifier(random_state=SEED),
    # "MLP Classifier": MLPClassifier(random_state=SEED),
    # "Extra Trees Classifier": ExtraTreesClassifier(random_state=SEED),
    # "AdaBoost Classifier": AdaBoostClassifier(estimator = DecisionTreeClassifier(max_depth=1), algorithm='SAMME',random_state=SEED),
    # "Dummy Classifier": DummyClassifier(strategy='most_frequent',random_state=SEED)  # DummyClassifier for sanity check
}

# Tuned models are read from the Kaggle working directory when running in a
# Kaggle kernel, otherwise from the local Output folder.
MODEL_DIR = "/kaggle/working" if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ else "Output"

# The original cell indexed the dictionary with an undefined `i` (left over
# from a removed loop) and failed on a fresh kernel. Iterating over the names
# restores the loop; after it, `model` and the prediction/score variables hold
# the values for the last configured model.
for model_name in models:
    print('=' * 100)
    print('Loading model:', model_name)

    tuning_path = os.path.join(MODEL_DIR, f"{model_name}_tuning.pkl")
    # SECURITY: unpickling executes arbitrary code; only load files that this
    # project produced itself.
    with open(tuning_path, 'rb') as file:
        model = pickle.load(file)
    print('Model-loading success:', model_name, 'Best Parameters:', model.get_params())

    print('Model-refitting:', model_name, 'Best Parameters:', model.get_params())
    model.fit(X_train, y_train)

    # Make predictions
    print('Predicting')
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Evaluate train and validation datasets
    MCC_train = MCC(y_train, y_train_pred)
    MCC_val = MCC(y_val, y_val_pred)

    print('Model prediction success:', model_name, 'MCC_train:', MCC_train, ' , MCC_val:', MCC_val)