In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler, KBinsDiscretizer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, precision_score
from sklearn.decomposition import PCA
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.neural_network import MLPClassifier
from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import accuracy

import numpy as np
import pandas as pd
import math

import copy

import mlflow

In [2]:
# Read the labelled training set and the unlabelled submission set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_data.csv', '../data/test_data.csv')
)

In [3]:
# Target to predict; the features are every other column except the row 'id'.
y = train['mineralType']
X = train.drop(columns=['mineralType', 'id'])

In [4]:
# Remove every row whose climaticZone is 3 or whose geoZone is 5 from both the
# features and the target. The labels are collected once so X and y stay aligned
# regardless of the order in which they are filtered.
rows_to_remove = X.index[(X['climaticZone'] == 3) | (X['geoZone'] == 5)]
y = y.drop(index=rows_to_remove)
X = X.drop(index=rows_to_remove)

In [5]:
# Mark the discrete-valued feature columns as pandas categoricals.
for column in ('planetSection', 'geoZone', 'rockSize', 'cover', 'magmaConcentrationDistance'):
    X[column] = X[column].astype('category')

In [6]:
def fahrenheit_to_celsius(f):
    """Convert temperatures from degrees Fahrenheit to degrees Celsius."""
    return (f - 32) / 1.8


def feet_to_meters(f):
    """Convert lengths from feet to meters."""
    return f * 0.3048


# Column-wise preprocessing. Named functions are used instead of lambdas inside
# FunctionTransformer because lambdas cannot be pickled by the standard pickle
# module, which would make the fitted transformer impossible to save.
#
# NOTE: CreateVariables reads output columns 0-3 by position, so the first four
# entries (each producing exactly one column) must keep this order.
preprocessor = ColumnTransformer(
    [
        # (name, transformer, columns)

        # Impute the -999.0 missing-value marker, convert Fahrenheit to Celsius,
        # then scale robustly.
        (
            "temperatureFirstHalfPlanetRotation",
            make_pipeline(
                IterativeImputer(missing_values=-999.0),
                FunctionTransformer(fahrenheit_to_celsius, feature_names_out="one-to-one"),
                RobustScaler(),
            ),
            ['temperatureFirstHalfPlanetRotation'],
        ),
        # Standardize (zero mean, unit variance).
        ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']),
        # Convert feet to meters, then standardize.
        (
            "waterStreamDistanceX",
            make_pipeline(
                FunctionTransformer(feet_to_meters, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            ['waterStreamDistanceX'],
        ),
        # Standardize.
        ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']),
        # One-hot encode; categories unseen during fit are encoded as all zeros.
        ("planetSection", OneHotEncoder(handle_unknown="ignore"), ['planetSection']),
        # One-hot encode and drop the first category (original note: the first
        # category, 0, stands for missing values). Unseen categories raise an error.
        ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']),
        # Ordinal encode; unseen categories map to -1.
        # TODO: drop category 3? what to do? only one row has a 3
        ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']),
        # One-hot encode. TODO: drop category 5?
        ("geoZone", OneHotEncoder(handle_unknown="ignore"), ['geoZone']),
        # One-hot encode and drop the first category (original note: the first
        # category, 0, stands for missing values).
        ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']),
        # One-hot encode (no category is dropped). TODO: use Ordinal Encoder?
        ("magmaConcentrationDistance", OneHotEncoder(handle_unknown="ignore"), ['magmaConcentrationDistance']),
        # Impute the -999.0 missing-value marker, then scale robustly.
        (
            "mineralDensity",
            make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()),
            ['mineralDensity'],
        ),
        # Standardize. TODO: convert km to m?
        ("detectionDepth", StandardScaler(), ['detectionDepth']),
        # Standardize. TODO: values > 360? do x - 360
        ("longitude", StandardScaler(), ['longitude']),
    ],
    verbose_feature_names_out=False,
    # Columns not listed above are appended to the output untouched.
    remainder='passthrough',
)

def euclidean_distance(x, y):
    """Return the element-wise Euclidean norm ``sqrt(x**2 + y**2)``.

    Parameters
    ----------
    x, y : array-like of numbers
        Coordinate components; expected to have the same length.

    Returns
    -------
    numpy.ndarray
        Float array holding the distance from the origin for each (x, y) pair.
    """
    # Coerce to float arrays so lists, object-dtype arrays and label-indexed
    # inputs are all handled positionally and in a single vectorized call.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.hypot(x, y)

class CreateVariables(BaseEstimator, TransformerMixin):
    """Append two engineered columns to a 2-D feature array.

    The transformer adds, in this order, as the last two columns:

    1. the Euclidean norm of the two water-stream distance columns, and
    2. the arithmetic mean of the two temperature columns.

    Columns are addressed by position. The defaults match the positions the
    original implementation hard-coded (temperatures at 0 and 1, water-stream
    distances at 2 and 3).

    NOTE(review): in this notebook the input columns have already been scaled
    by the upstream preprocessor, so both new features are computed on scaled
    values rather than on physical units - confirm this is intended.

    Parameters
    ----------
    water_x_col, water_y_col : int, default 2 and 3
        Positions of the water-stream X and Y distance columns.
    temp_first_col, temp_second_col : int, default 0 and 1
        Positions of the first-half and second-half rotation temperature columns.
    """

    def __init__(self, water_x_col=2, water_y_col=3, temp_first_col=0, temp_second_col=1):
        # Stored under the same names as the arguments so that
        # BaseEstimator.get_params / clone can see them.
        self.water_x_col = water_x_col
        self.water_y_col = water_y_col
        self.temp_first_col = temp_first_col
        self.temp_second_col = temp_second_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the distance and mean-temperature columns appended.

        ``X`` must support 2-D positional slicing (``X[:, i]``); ``y`` is ignored.
        """
        # Combined water-stream distance.
        water_distance = euclidean_distance(X[:, self.water_x_col], X[:, self.water_y_col])

        # Mean temperature over both halves of the planet rotation.
        mean_temperature = (X[:, self.temp_first_col] + X[:, self.temp_second_col]) / 2

        # Existing columns keep their positions; new ones go at the end.
        with_distance = np.append(X, water_distance.reshape(-1, 1), axis=1)
        return np.append(with_distance, mean_temperature.reshape(-1, 1), axis=1)

# Feature pipeline: column-wise preprocessing first, then the engineered
# columns are appended to its output.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('create_variables', CreateVariables()),
    ]
)

In [9]:
# Configure the auto-sklearn search: optimise accuracy, n_jobs=-1, and a
# time_left_for_this_task budget of 900.
automl_settings = dict(n_jobs=-1, metric=accuracy, time_left_for_this_task=900)
cls = AutoSklearn2Classifier(**automl_settings)

# Fit the feature pipeline on the training data and train on its output.
X_train_prepared = pipe.fit_transform(X)
cls.fit(X_train_prepared, y)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 42005 instead
  f"Port {expected} is already in use.\n"




AutoSklearn2Classifier(metric=accuracy, n_jobs=-1, per_run_time_limit=2160,
                       time_left_for_this_task=900)

In [10]:
# Ten best-ranked configurations evaluated during the search.
search_results = pd.DataFrame(cls.cv_results_)
search_results.sort_values(by='rank_test_scores', ascending=True).head(10)

Unnamed: 0,mean_test_score,mean_fit_time,params,rank_test_scores,status,budgets,param_balancing:strategy,param_classifier:__choice__,param_data_preprocessor:__choice__,param_feature_preprocessor:__choice__,...,param_classifier:mlp:validation_fraction,param_classifier:sgd:epsilon,param_classifier:sgd:eta0,param_classifier:sgd:l1_ratio,param_classifier:sgd:power_t,param_data_preprocessor:feature_type:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:n_quantiles,param_data_preprocessor:feature_type:numerical_transformer:rescaling:quantile_transformer:output_distribution,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max,param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min
126,0.765055,43.01433,"{'balancing:strategy': 'weighting', 'classifie...",1,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.238223,,,,
139,0.762282,39.30047,"{'balancing:strategy': 'none', 'classifier:__c...",2,Success,0.0,none,extra_trees,feature_type,no_preprocessing,...,,,,,,0.009396,,,0.753079,0.25
120,0.760777,37.219218,"{'balancing:strategy': 'none', 'classifier:__c...",3,Success,0.0,none,extra_trees,feature_type,no_preprocessing,...,,,,,,,,,,
12,0.760301,61.8622,"{'balancing:strategy': 'weighting', 'classifie...",4,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.003691,,,,
62,0.759113,118.667081,"{'balancing:strategy': 'none', 'classifier:__c...",5,Success,0.0,none,extra_trees,feature_type,no_preprocessing,...,,,,,,,,,,
98,0.759113,53.098153,"{'balancing:strategy': 'weighting', 'classifie...",5,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.000143,,,,
49,0.757448,74.812774,"{'balancing:strategy': 'weighting', 'classifie...",7,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.000117,,,,
143,0.757211,37.294222,"{'balancing:strategy': 'weighting', 'classifie...",8,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.145353,,,,
84,0.756101,83.721543,"{'balancing:strategy': 'weighting', 'classifie...",9,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,0.008557,,,0.891416,0.171068
138,0.755784,58.697212,"{'balancing:strategy': 'weighting', 'classifie...",10,Success,0.0,weighting,extra_trees,feature_type,no_preprocessing,...,,,,,,,,,,


In [11]:
import pickle

# Persist the fitted auto-sklearn model.
# NOTE(review): only `cls` is written; the fitted `pipe` that produces its input
# features is not saved here.
model_path = 'autosklearn_dropped.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(cls, model_file)

In [12]:
# Submission features: everything in the test set except the row 'id'.
X_submit = test.drop(columns=['id'])

# Apply the same categorical casts that were applied to the training features.
for column in ('planetSection', 'geoZone', 'rockSize', 'cover', 'magmaConcentrationDistance'):
    X_submit[column] = X_submit[column].astype('category')

In [14]:
# Use the pipeline exactly as it was fitted on the training data. Calling
# fit_transform here would re-fit the imputers, scalers and encoders on the
# submission set, so the features would no longer match what the model was
# trained on (and the number/order of one-hot columns could change).
# NOTE(review): the 'cover' encoder uses handle_unknown='error', so a category
# present only in the submission set will raise here instead of being hidden.
X_submit_prepared = pipe.transform(X_submit)
y_hat = cls.predict(X_submit_prepared)
y_hat

array([2, 6, 7, ..., 6, 6, 4])

In [16]:
# Write all predictions as a single comma-separated row of integers.
np.savetxt(
    'submit_dropped_predictions.txt',
    np.atleast_2d(y_hat),
    fmt='%d',
    delimiter=',',
)