In [3]:
import pandas as pd
pd.set_option('display.max_columns', None)

train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')

In [4]:
# Vamos a definir los datos de entrenamiento
X = train.drop(['mineralType', 'id'], axis=1)
y = train['mineralType']

In [5]:
X['planetSection'] = X['planetSection'].astype('category')
X['geoZone'] = X['geoZone'].astype('category')
X['rockSize'] = X['rockSize'].astype('category')
X['cover'] = X['cover'].astype('category')
X['magmaConcentrationDistance'] = X['magmaConcentrationDistance'].astype('category')

In [14]:
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy

# Create sklearn pipeline for data preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, precision_score
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import set_config
import numpy as np
import mlflow
import math
from joblib import Memory
from shutil import rmtree

mlflow.sklearn.autolog()
mlflow.set_experiment("autosklearn_preprocessor")

preprocessor = ColumnTransformer([
        # (name, transformer, columns)
        ("temperatureFirstHalfPlanetRotation", make_pipeline(IterativeImputer(missing_values=-999.0), FunctionTransformer(lambda f: (f - 32) / 1.8, feature_names_out="one-to-one"), RobustScaler()), ['temperatureFirstHalfPlanetRotation']), # convert from Fahrenheit to Celsius
        ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']), # pass through the column unchanged
        ("waterStreamDistanceX", make_pipeline(FunctionTransformer(lambda f: f * 0.3048, feature_names_out="one-to-one"), StandardScaler()), ['waterStreamDistanceX']), # convert from feet to meters
        ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']), # pass through the column unchanged
        ("planetSection", OneHotEncoder(handle_unknown = "ignore"), ['planetSection']), # one-hot encode the planetSection column
        ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']), # one-hot encode the cover column and drop the first column (the one with the missing values == 0)
        ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']), # ordinal encode the climaticZone column TODO: drop category 3? what to do? only one row has a 3
        ("geoZone", OneHotEncoder(handle_unknown = "ignore"), ['geoZone']), # one-hot encode the geoZone column TODO: drop category 5?
        ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0)
        ("magmaConcentrationDistance", OneHotEncoder(handle_unknown = "ignore"), ['magmaConcentrationDistance']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0) TODO: use Ordinal Encoder?
        ("mineralDensity", make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()), ['mineralDensity']), # pass through the column unchanged
        ("detectionDepth", StandardScaler(), ['detectionDepth']), # pass through the column unchanged TODO: convert km to m?
        ("longitude", StandardScaler(), ['longitude']), # pass through the column unchanged TODO: values > 360? do x - 360
    ],
    verbose_feature_names_out=False, remainder='passthrough'
)

def euclidean_distance(x, y):
    res = []
    for i in range(len(x)):
        res.append(math.sqrt(x[i]**2 + y[i]**2))
    return np.array(res)

class CreateVariables(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        # waterSteamDistance
        waterStreamDistanceX = X[:,2]
        waterStreamDistanceY = X[:,3]
        waterStreamDistance = euclidean_distance(waterStreamDistanceX, waterStreamDistanceY)
        X = np.append(X, waterStreamDistance.reshape(-1, 1), axis=1)

        # temperature (both planet rotations)
        temperatureFirstHalfPlanetRotation = X[:,0]
        temperatureSecondHalfPlanetRotation = X[:,1]
        meanTemperature = (temperatureFirstHalfPlanetRotation + temperatureSecondHalfPlanetRotation)/2
        X = np.append(X, meanTemperature.reshape(-1, 1), axis=1)

        return X

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('create_variables', CreateVariables()),
    # ('pca', PCA()),
    # ('model', AutoSklearn2Classifier(n_jobs=-1, metric=accuracy, time_left_for_this_task=60,
    #             # include={
    #             #     'data_preprocessor': ['NoPreprocessing']
    #             # }
    #          )
    # )
])

cls = AutoSklearn2Classifier(n_jobs=-1, metric=accuracy, time_left_for_this_task=1200)
cls.fit(pipe.fit_transform(X), y)

2022/04/28 20:45:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8e201c87a4d94a4482013f47f900e731', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
                  transformers=[('temperatureFirstHalfPlanetRotation',
                                 Pipeline(steps=[('iterativeimputer',
                                             ...`
                  transformers=[('temperatureFirstHalfPlanetRotation',
                                 Pipeline(steps=[('iterativeimputer',
                                                  IterativeImpu...`
                ('functiontransformer',
                 FunctionTransformer(feature_names_out='one-to-one',
                   ...`
                ('functiontransformer',
                 FunctionTransformer(feature_names_out='one-to-one',
                                     func=<function <lambda...`
Perhaps you already have a cluste



AutoSklearn2Classifier(metric=accuracy, n_jobs=-1, per_run_time_limit=2880,
                       time_left_for_this_task=1200)

In [15]:
cls.sprint_statistics()

'auto-sklearn results:\n  Dataset name: 5f988b70-c723-11ec-b75c-1314a69f1795\n  Metric: accuracy\n  Best validation score: 0.762301\n  Number of target algorithm runs: 228\n  Number of successful target algorithm runs: 223\n  Number of crashed target algorithm runs: 2\n  Number of target algorithms that exceeded the time limit: 3\n  Number of target algorithms that exceeded the memory limit: 0\n'

In [6]:
cls.show_models()

{147: {'model_id': 147,
  'rank': 1,
  'cost': 0.23579748038982648,
  'ensemble_weight': 0.18,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f302307bd50>,
  'balancing': Balancing(random_state=1),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f30253c9f90>,
  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7f30253c9b50>,
  'sklearn_classifier': None},
 73: {'model_id': 73,
  'rank': 2,
  'cost': 0.2396006655574043,
  'ensemble_weight': 0.02,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f3025537c90>,
  'balancing': Balancing(random_state=1),
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f3025872810>,
  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice at 0x7f3025872190>,
  'skl

In [7]:
cls.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
147,1,0.18,extra_trees,0.235797,49.625379
73,2,0.02,extra_trees,0.239601,47.705335
123,3,0.04,extra_trees,0.239601,52.236234
86,4,0.02,extra_trees,0.240551,52.646666
105,5,0.1,extra_trees,0.242612,57.811886
82,6,0.04,extra_trees,0.243166,90.98549
65,7,0.02,random_forest,0.243721,93.603288
64,8,0.02,extra_trees,0.244275,70.589938
124,9,0.04,extra_trees,0.244513,52.466038
117,10,0.02,extra_trees,0.245068,67.287893


In [8]:
pd.DataFrame(cls.cv_results_).sort_values(by=['rank_test_scores'], ascending=True).head(10)

Unnamed: 0,mean_test_score,mean_fit_time,params,rank_test_scores,status,budgets,param_balancing:strategy,param_classifier:__choice__,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,param_classifier:extra_trees:bootstrap,param_classifier:extra_trees:criterion,param_classifier:extra_trees:max_depth,param_classifier:extra_trees:max_features,param_classifier:extra_trees:max_leaf_nodes,param_classifier:extra_trees:min_impurity_decrease,param_classifier:extra_trees:min_samples_leaf,param_classifier:extra_trees:min_samples_split,param_classifier:extra_trees:min_weight_fraction_leaf,param_classifier:gradient_boosting:early_stop,param_classifier:gradient_boosting:l2_regularization,param_classifier:gradient_boosting:learning_rate,param_classifier:gradient_boosting:loss,param_classifier:gradient_boosting:max_bins,param_classifier:gradient_boosting:max_depth,param_classifier:gradient_boosting:max_leaf_nodes,param_classifier:gradient_boosting:min_samples_leaf,param_classifier:gradient_boosting:scoring,param_classifier:gradient_boosting:tol,param_classifier:mlp:activation,param_classifier:mlp:alpha,param_classifier:mlp:batch_size,param_classifier:mlp:beta_1,param_classifier:mlp:beta_2,param_classifier:mlp:early_stopping,param_classifier:mlp:epsilon,param_classifier:mlp:hidden_layer_depth,param_classifier:mlp:learning_rate_init,param_classifier:mlp:n_iter_no_change,param_classifier:mlp:num_nodes_per_layer,param_classifier:mlp:shuffle,param_classifier:mlp:solver,param_classifier:mlp:tol,param_classifier:passive_aggressive:C,param_classifier:passive_aggressive:average,param_classifier:passive_aggressive:fit_intercept,param_classifier:passive_aggressive:loss,param_classifier:passive_aggressive:tol,param_classifier:random_forest:bootstrap,param_classifier:random_forest:criterion,param_classifier:random_forest:max_depth,param_classifier:random_forest:max_features,param_classifier:random_forest:max_leaf_nodes,param_classifier:random_forest:min_impurity_decrease,param_classifier:random_forest:min_samples_leaf,param_classifier:random_forest:min_samples_split,param_classifier:random_forest:min_weight_fraction_leaf,param_classifier:sgd:alpha,param_classifier:sgd:average,param_classifier:sgd:fit_intercept,param_classifier:sgd:learning_rate,param_classifier:sgd:loss,param_classifier:sgd:penalty,param_classifier:sgd:tol,param_data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__,param_data_preprocessor:feature_type:numerical_transformer:imputation:strategy,param_data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__,param_classifier:gradient_boosting:n_iter_no_change,param_classifier:gradient_boosting:validation_fraction,param_classifier:mlp:validation_fraction,param_classifier:sgd:epsilon,param_classifier:sgd:eta0,param_classifier:sgd:l1_ratio,param_classifier:sgd:power_t,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:n_quantiles,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:output_distribution,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min
145,0.764203,49.625379,"{'balancing:strategy': 'none', 'classifier:__c...",1,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.59927,,0.0,1.0,4.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,median,minmax,,,,,,,,0.342078,,,,
110,0.760875,45.247256,"{'balancing:strategy': 'none', 'classifier:__c...",2,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.863752,,0.0,2.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,mean,minmax,,,,,,,,0.240172,,,,
71,0.760399,47.705335,"{'balancing:strategy': 'none', 'classifier:__c...",3,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.898538,,0.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,median,standardize,,,,,,,,0.363305,,,,
121,0.760399,52.236234,"{'balancing:strategy': 'weighting', 'classifie...",3,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,False,gini,,0.808444,,0.0,1.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,encoding,minority_coalescer,mean,minmax,,,,,,,,0.238964,,,,
84,0.759449,52.646666,"{'balancing:strategy': 'none', 'classifier:__c...",5,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.883144,,0.0,1.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,mean,robust_scaler,,,,,,,,0.099632,,,0.75,0.25
120,0.75929,54.145325,"{'balancing:strategy': 'none', 'classifier:__c...",6,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.916318,,0.0,2.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,no_encoding,minority_coalescer,median,robust_scaler,,,,,,,,0.024322,,,0.774575,0.234249
12,0.758815,60.2138,"{'balancing:strategy': 'weighting', 'classifie...",7,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,False,entropy,,0.958941,,0.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,mean,minmax,,,,,,,,0.003691,,,,
79,0.757864,63.492494,"{'balancing:strategy': 'none', 'classifier:__c...",8,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.849693,,0.0,2.0,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,mean,quantile_transformer,,,,,,,,0.182167,900.0,normal,,
103,0.757388,57.811886,"{'balancing:strategy': 'weighting', 'classifie...",9,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,False,gini,,0.9345,,0.0,1.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,one_hot_encoding,minority_coalescer,median,minmax,,,,,,,,0.071448,,,,
80,0.756834,90.98549,"{'balancing:strategy': 'none', 'classifier:__c...",10,Success,0.0,none,extra_trees,feature_type,no_preprocessing,False,gini,,0.957224,,0.0,1.0,10.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,encoding,minority_coalescer,most_frequent,minmax,,,,,,,,0.01,,,,


In [10]:
import pickle

# save model
with open('autosklearn_900.pkl', 'wb') as f:
    pickle.dump(cls, f)

In [2]:
import pickle

# load model and calculate accuracy
with open('autosklearn_900.pkl', 'rb') as f:
    cls = pickle.load(f)

In [8]:
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy

# Create sklearn pipeline for data preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, precision_score
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import set_config
import numpy as np
import mlflow
import math
from joblib import Memory
from shutil import rmtree

In [10]:
preprocessor = ColumnTransformer([
        # (name, transformer, columns)
        ("temperatureFirstHalfPlanetRotation", make_pipeline(IterativeImputer(missing_values=-999.0), FunctionTransformer(lambda f: (f - 32) / 1.8, feature_names_out="one-to-one"), RobustScaler()), ['temperatureFirstHalfPlanetRotation']), # convert from Fahrenheit to Celsius
        ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']), # pass through the column unchanged
        ("waterStreamDistanceX", make_pipeline(FunctionTransformer(lambda f: f * 0.3048, feature_names_out="one-to-one"), StandardScaler()), ['waterStreamDistanceX']), # convert from feet to meters
        ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']), # pass through the column unchanged
        ("planetSection", OneHotEncoder(handle_unknown = "ignore"), ['planetSection']), # one-hot encode the planetSection column
        ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']), # one-hot encode the cover column and drop the first column (the one with the missing values == 0)
        ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']), # ordinal encode the climaticZone column TODO: drop category 3? what to do? only one row has a 3
        ("geoZone", OneHotEncoder(handle_unknown = "ignore"), ['geoZone']), # one-hot encode the geoZone column TODO: drop category 5?
        ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0)
        ("magmaConcentrationDistance", OneHotEncoder(handle_unknown = "ignore"), ['magmaConcentrationDistance']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0) TODO: use Ordinal Encoder?
        ("mineralDensity", make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()), ['mineralDensity']), # pass through the column unchanged
        ("detectionDepth", StandardScaler(), ['detectionDepth']), # pass through the column unchanged TODO: convert km to m?
        ("longitude", StandardScaler(), ['longitude']), # pass through the column unchanged TODO: values > 360? do x - 360
    ],
    verbose_feature_names_out=False, remainder='passthrough'
)

def euclidean_distance(x, y):
    res = []
    for i in range(len(x)):
        res.append(math.sqrt(x[i]**2 + y[i]**2))
    return np.array(res)

class CreateVariables(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        # waterSteamDistance
        waterStreamDistanceX = X[:,2]
        waterStreamDistanceY = X[:,3]
        waterStreamDistance = euclidean_distance(waterStreamDistanceX, waterStreamDistanceY)
        X = np.append(X, waterStreamDistance.reshape(-1, 1), axis=1)

        # temperature (both planet rotations)
        temperatureFirstHalfPlanetRotation = X[:,0]
        temperatureSecondHalfPlanetRotation = X[:,1]
        meanTemperature = (temperatureFirstHalfPlanetRotation + temperatureSecondHalfPlanetRotation)/2
        X = np.append(X, meanTemperature.reshape(-1, 1), axis=1)

        return X

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('create_variables', CreateVariables()),
    # ('pca', PCA()),
    # ('model', AutoSklearn2Classifier(n_jobs=-1, metric=accuracy, time_left_for_this_task=60,
    #             # include={
    #             #     'data_preprocessor': ['NoPreprocessing']
    #             # }
    #          )
    # )
])

cls.score(pipe.fit_transform(X), y)

1.0

-------------------------

In [88]:
train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')

In [89]:
X = train.drop(['mineralType', 'id'], axis=1)
y = train['mineralType']

In [49]:
X['planetSection'] = X['planetSection'].astype('category')
X['geoZone'] = X['geoZone'].astype('category')
X['rockSize'] = X['rockSize'].astype('category')
X['cover'] = X['cover'].astype('category')
X['magmaConcentrationDistance'] = X['magmaConcentrationDistance'].astype('category')

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

In [12]:
cls2 = AutoSklearn2Classifier(n_jobs=-1, metric=accuracy, time_left_for_this_task=300)
cls2.fit(pipe.fit_transform(X_train), y_train)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40777 instead
  f"Port {expected} is already in use.\n"




AutoSklearn2Classifier(metric=accuracy, n_jobs=-1, per_run_time_limit=720,
                       time_left_for_this_task=300)

In [36]:
cls2.score(pipe.fit_transform(X_test), y_test)

0.8194059405940594

In [41]:
cls2.score(pipe.fit_transform(X), y)

0.833135250772522

In [51]:
cls2.score(pipe.fit_transform(X_train), y_train)

0.8315174326465927

Prueba sobre el conjunto de datos PARA LA ENTREGA

In [79]:
X_submit = test.drop(['id'], axis=1)

X_submit['planetSection'] = X_submit['planetSection'].astype('category')
X_submit['geoZone'] = X_submit['geoZone'].astype('category')
X_submit['rockSize'] = X_submit['rockSize'].astype('category')
X_submit['cover'] = X_submit['cover'].astype('category')
X_submit['magmaConcentrationDistance'] = X_submit['magmaConcentrationDistance'].astype('category')

In [80]:
len(X_submit.columns)

13

In [81]:
pipe.fit_transform(X_submit).shape

(2000, 28)

In [75]:
cls2.predict(pipe.fit_transform(X_submit))

ValueError: X has 28 features, but ColumnTransformer is expecting 29 features as input.