# Procesamiento
Vamos a utilizar sklearn para el procesamiento de datos mediante pipelines.
- Buena info de cómo hacer esto aquí: https://www.youtube.com/watch?v=0B5eIE_1vpU&t=1227s

In [None]:
import pandas as pd
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

# Load the training and test splits (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_data.csv', '../data/test_data.csv')
)

In [None]:
# Define the training data: `y` is the label column and `X` holds the features.
# The 'id' column is dropped together with the label, so it is not used as a feature.
y = train['mineralType']
X = train.drop(columns=['mineralType', 'id'])

In [None]:
# Create sklearn pipeline for data preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler, RobustScaler
from sklearn.feature_extraction import FeatureHasher
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, recall_score, precision_score
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import set_config
import numpy as np
import mlflow
import math
from joblib import Memory
from shutil import rmtree

# Turn on MLflow autologging for the scikit-learn estimators fitted below.
mlflow.sklearn.autolog()
# Make "knnclassifier" the active MLflow experiment for the runs that follow.
mlflow.set_experiment("knnclassifier")

def _fahrenheit_to_celsius(values):
    """Convert temperatures from degrees Fahrenheit to degrees Celsius."""
    return (values - 32) / 1.8


def _feet_to_meters(values):
    """Convert lengths from feet to meters."""
    return values * 0.3048


# Column-wise preprocessing of the raw features.
#
# The unit conversions are named functions rather than lambdas so the fitted
# transformer can be serialized with the standard pickle / joblib machinery.
#
# The ORDER of the entries matters: the first four transformers each emit one
# column (first-half temperature, second-half temperature, water-stream X,
# water-stream Y), and CreateVariables reads exactly those positions (0-3).
#
# NOTE(review): each IterativeImputer only receives its own single column, so
# it has no other features to model the missing values from; presumably it
# reduces to its initial (mean) fill here -- confirm this is intended.
# NOTE(review): the scalers that follow the unit conversions remove any linear
# change of units, so the conversions presumably do not alter the final values.
preprocessor = ColumnTransformer(
    [
        # (name, transformer, columns)
        (
            # Impute the -999.0 sentinel, convert Fahrenheit to Celsius, then scale robustly.
            "temperatureFirstHalfPlanetRotation",
            make_pipeline(
                IterativeImputer(missing_values=-999.0),
                FunctionTransformer(_fahrenheit_to_celsius, feature_names_out="one-to-one"),
                RobustScaler(),
            ),
            ['temperatureFirstHalfPlanetRotation'],
        ),
        (
            # Standardize (no unit conversion is applied to this column).
            "temperatureSecondHalfPlanetRotation",
            StandardScaler(),
            ['temperatureSecondHalfPlanetRotation'],
        ),
        (
            # Convert feet to meters, then standardize.
            "waterStreamDistanceX",
            make_pipeline(
                FunctionTransformer(_feet_to_meters, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            ['waterStreamDistanceX'],
        ),
        (
            # Standardize (no unit conversion is applied to this column).
            "waterStreamDistanceY",
            StandardScaler(),
            ['waterStreamDistanceY'],
        ),
        (
            # One-hot encode; categories unseen during fit become all-zero rows.
            "planetSection",
            OneHotEncoder(handle_unknown="ignore"),
            ['planetSection'],
        ),
        (
            # One-hot encode and drop the first category (per the original note,
            # the one that marks missing values == 0).
            # NOTE(review): unlike the other encoders this one RAISES on categories
            # unseen during fit -- confirm that is deliberate.
            "cover",
            OneHotEncoder(handle_unknown='error', drop='first'),
            ['cover'],
        ),
        (
            # Ordinal encode; categories unseen during fit are mapped to -1.
            # TODO: drop category 3? what to do? only one row has a 3
            "climaticZone",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ['climaticZone'],
        ),
        (
            # One-hot encode; unseen categories are ignored.
            # TODO: drop category 5?
            "geoZone",
            OneHotEncoder(handle_unknown="ignore"),
            ['geoZone'],
        ),
        (
            # One-hot encode and drop the first category (per the original note,
            # the one that marks missing values == 0); unseen categories are ignored.
            "rockSize",
            OneHotEncoder(handle_unknown='ignore', drop='first'),
            ['rockSize'],
        ),
        (
            # One-hot encode (no category is dropped); unseen categories are ignored.
            # TODO: use Ordinal Encoder?
            "magmaConcentrationDistance",
            OneHotEncoder(handle_unknown="ignore"),
            ['magmaConcentrationDistance'],
        ),
        (
            # Impute the -999.0 sentinel, then scale robustly.
            "mineralDensity",
            make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()),
            ['mineralDensity'],
        ),
        (
            # Standardize.
            # TODO: convert km to m?
            "detectionDepth",
            StandardScaler(),
            ['detectionDepth'],
        ),
        (
            # Standardize.
            # TODO: values > 360? do x - 360
            "longitude",
            StandardScaler(),
            ['longitude'],
        ),
    ],
    # Keep the plain column names in get_feature_names_out() (no "<name>__" prefix).
    verbose_feature_names_out=False,
    # Columns not listed above are appended unchanged after the transformed ones.
    remainder='passthrough',
)

def euclidean_distance(x, y):
    """Return the element-wise Euclidean norm ``sqrt(x**2 + y**2)``.

    Args:
        x: 1-D array-like with the first component of every point.
        y: 1-D array-like with the second component; same length as ``x``.

    Returns:
        A float NumPy array with one distance per element.
    """
    # np.asarray makes the computation positional, so a pandas Series with a
    # non-default index is handled as well. np.hypot is vectorized (no
    # Python-level loop) and does not overflow when the inputs are large.
    return np.hypot(np.asarray(x, dtype=float), np.asarray(y, dtype=float))

class CreateVariables(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two engineered feature columns.

    The input is addressed by POSITION, matching the output order of
    ``preprocessor``:

    * column 0 -- temperatureFirstHalfPlanetRotation
    * column 1 -- temperatureSecondHalfPlanetRotation
    * column 2 -- waterStreamDistanceX
    * column 3 -- waterStreamDistanceY

    Two columns are appended on the right, in this order:

    1. ``waterStreamDistance`` -- Euclidean norm of columns 2 and 3.
    2. ``meanTemperature`` -- arithmetic mean of columns 0 and 1.

    NOTE(review): when used after ``preprocessor`` these inputs are already
    scaled, so the new features are computed on scaled values -- confirm that
    this is intended.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer fits in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a dense array with the two engineered columns appended.

        Args:
            X: 2-D array-like (or scipy sparse matrix) with at least 4 columns.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            A dense NumPy array of shape ``(n_samples, n_features + 2)``.
        """
        # ColumnTransformer returns a scipy sparse matrix when the stacked
        # output is sparse enough (see its ``sparse_threshold``); the column
        # slicing below needs a dense array.
        if hasattr(X, "toarray"):
            X = X.toarray()
        X = np.asarray(X)

        water_stream_distance = euclidean_distance(X[:, 2], X[:, 3])
        mean_temperature = (X[:, 0] + X[:, 1]) / 2

        # Append both columns in a single copy; the order (distance first,
        # mean temperature second) is part of the output contract.
        return np.column_stack((X, water_stream_distance, mean_temperature))

# Stand-alone estimator; alternative estimators are kept commented out.
# The pipeline below does not reference `model` in the code shown here.
# model = MLPClassifier(random_state=1, max_iter=300)
model = KNeighborsClassifier()
# model = RandomForestClassifier(bootstrap=False, max_features=5, min_samples_leaf=15,
#                          n_estimators=512, n_jobs=-1, random_state=1,
#                          warm_start=True)

# Full modelling pipeline: preprocessing -> engineered features -> PCA -> estimator.
# The final step is a 'passthrough' placeholder; the parameter grid supplies the
# actual estimator through the 'model' key.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('create_variables', CreateVariables()),
    ('pca', PCA()),
    ('model', 'passthrough'),
]
pipe = Pipeline(steps=pipeline_steps)

# Search space for the grid search: a list of independent grids, one per estimator.
# Every value is a list of candidates; 'model' replaces the pipeline's placeholder step.
knn_grid = {
    # 'preprocessor__temperatureFirstHalfPlanetRotation__robustscaler': [StandardScaler(), RobustScaler()],
    'pca__n_components': [None],  # None keeps every principal component
    'model': [KNeighborsClassifier()],
    'model__n_neighbors': [7],
    'model__weights': ['distance'],
    'model__metric': ['manhattan'],
}

param_grid = [
    knn_grid,
    # {
    #     # 'preprocessor__temperatureFirstHalfPlanetRotation__robustscaler': [StandardScaler(), RobustScaler()],
    #     'pca__n_components': [None],
    #     'model': [GradientBoostingClassifier()],
    # },
]

# 5-fold grid search over `param_grid`; every candidate is scored on accuracy
# and macro-averaged precision.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=['accuracy', 'precision_macro'],
    refit='accuracy',  # best_* attributes and the final refit are based on accuracy
    cv=5,
    return_train_score=True,
    error_score='raise',  # fail on the first fitting error instead of recording a score
    n_jobs=-1,
    verbose=4,
)

# Train the model
grid.fit(X, y)

# Render estimators as diagrams when they are displayed in the notebook.
set_config(display='diagram')
# grid.get_params()

In [None]:
# Plot the mean cross-validated scores of every grid-search candidate,
# ordered by accuracy rank (best candidate first).
import matplotlib.pyplot as plt

# Sort once by ranking (1 = best) and plot FROM the sorted frame, so the x-axis
# follows the ranking instead of the raw candidate order.
grid_results_df = pd.DataFrame(grid.cv_results_)
grid_results_df.sort_values(by=['rank_test_accuracy'], ascending=True, inplace=True)
positions = range(1, len(grid_results_df) + 1)

plt.figure(figsize=(10, 6))
# Markers keep the plot readable when only one candidate was evaluated
# (a single-point line without markers is invisible).
plt.plot(positions, grid_results_df['mean_test_accuracy'].to_numpy(), marker='o', label='mean test accuracy')
plt.plot(positions, grid_results_df['mean_test_precision_macro'].to_numpy(), marker='o', label='mean test precision macro')
plt.xlabel('candidate (sorted by accuracy rank)')
plt.ylabel('mean cross-validated score')
plt.legend()
plt.show()

In [None]:
# Parameter combination that ranked best on accuracy (the metric passed as `refit`).
grid.best_params_

In [None]:
# Mean cross-validated accuracy of the best candidate (accuracy is the `refit` metric).
grid.best_score_

In [None]:
# Results by ranking: re-sort the results table in place so the best accuracy
# rank (1) comes first, then display it.
grid_results_df.sort_values(by=['rank_test_accuracy'], ascending=True, inplace=True)
grid_results_df

In [None]:
# Fit the preprocessing step on its own, on the full training features, to
# inspect what it produces.
preprocessed_data = preprocessor.fit_transform(X)
# ColumnTransformer returns a scipy sparse matrix when the stacked output is
# sparse enough (see its `sparse_threshold`); pd.DataFrame needs a dense array.
if hasattr(preprocessed_data, "toarray"):
    preprocessed_data = preprocessed_data.toarray()

# Label the columns with the names reported by the fitted transformer.
preprocessed_dataframe = pd.DataFrame(preprocessed_data, columns=preprocessor.get_feature_names_out())
preprocessed_dataframe

In [None]:
# Distribution of the raw mineralDensity column, before any preprocessing.
train['mineralDensity'].hist(bins=100)

In [None]:
# Distribution of mineralDensity after preprocessing, for comparison with the raw histogram.
preprocessed_dataframe['mineralDensity'].hist(bins=100)

In [None]:
# Distribution of the raw temperatureFirstHalfPlanetRotation column, before any preprocessing.
train['temperatureFirstHalfPlanetRotation'].hist(bins=100)

In [None]:
# Distribution of temperatureFirstHalfPlanetRotation after preprocessing, for comparison with the raw histogram.
preprocessed_dataframe['temperatureFirstHalfPlanetRotation'].hist(bins=100)

In [None]:
# Score of the refitted best pipeline on the SAME data it was trained on: an
# optimistic figure, not an estimate of generalization.
# NOTE(review): with the KNN grid used here (weights='distance') every training
# row is presumably its own zero-distance neighbour, so expect a value near 1.0.
grid.best_estimator_.score(X, y)

In [None]:
# preprocessor = ColumnTransformer([
#         # (name, transformer, columns)
#         ("temperatureFirstHalfPlanetRotation", make_pipeline(IterativeImputer(missing_values=-999.0), FunctionTransformer(lambda f: (f - 32) / 1.8, feature_names_out="one-to-one"), RobustScaler()), ['temperatureFirstHalfPlanetRotation']), # convert from Fahrenheit to Celsius
#         ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']), # pass through the column unchanged
#         ("waterStreamDistanceX", make_pipeline(FunctionTransformer(lambda f: f * 0.3048, feature_names_out="one-to-one"), StandardScaler()), ['waterStreamDistanceX']), # convert from feet to meters
#         ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']), # pass through the column unchanged
#         ("planetSection", OneHotEncoder(handle_unknown = "ignore"), ['planetSection']), # one-hot encode the planetSection column
#         ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']), # one-hot encode the cover column and drop the first column (the one with the missing values == 0)
#         ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']), # ordinal encode the climaticZone column TODO: drop category 3? what to do? only one row has a 3
#         ("geoZone", OneHotEncoder(handle_unknown = "ignore"), ['geoZone']), # one-hot encode the geoZone column TODO: drop category 5?
#         ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0)
#         ("magmaConcentrationDistance", OneHotEncoder(handle_unknown = "ignore"), ['magmaConcentrationDistance']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0) TODO: use Ordinal Encoder?
#         ("mineralDensity", make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()), ['mineralDensity']), # pass through the column unchanged
#         ("detectionDepth", StandardScaler(), ['detectionDepth']), # pass through the column unchanged TODO: convert km to m?
#         ("longitude", StandardScaler(), ['longitude']), # pass through the column unchanged TODO: values > 360? do x - 360
#     ],
#     verbose_feature_names_out=False, remainder='passthrough'
# )

# def euclidean_distance(x, y):
#     res = []
#     for i in range(len(x)):
#         res.append(math.sqrt(x[i]**2 + y[i]**2))
#     return np.array(res)

# class CreateVariables(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass
        
#     def fit(self, X, y = None):
#         return self

#     def transform(self, X, y = None):
#         # waterSteamDistance
#         waterStreamDistanceX = X[:,2]
#         waterStreamDistanceY = X[:,3]
#         waterStreamDistance = euclidean_distance(waterStreamDistanceX, waterStreamDistanceY)
#         X = np.append(X, waterStreamDistance.reshape(-1, 1), axis=1)

#         # temperature (both planet rotations)
#         temperatureFirstHalfPlanetRotation = X[:,0]
#         temperatureSecondHalfPlanetRotation = X[:,1]
#         meanTemperature = (temperatureFirstHalfPlanetRotation + temperatureSecondHalfPlanetRotation)/2
#         X = np.append(X, meanTemperature.reshape(-1, 1), axis=1)

#         return X

# pipe = Pipeline([
#     ('preprocessor', preprocessor),
#     ('create_variables', CreateVariables()),
# ])

# preprocessed_data = pipe.fit_transform(X, y)
# reduced_data = PCA(n_components=2).fit_transform(preprocessed_data, y)

# print(preprocessed_data.shape)
# print(reduced_data.shape)

# # reduced_data

# # # Plot the PCA results in a scatter plot with the color of the mineral density with a legend
# plt.figure(figsize=(10, 6))
# plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=y, cmap='viridis')
# plt.xlabel('PC1')
# plt.ylabel('PC2')
# plt.title('PCA of the data')
# plt.legend()
# plt.show()

In [None]:
# preprocessor = ColumnTransformer([
#         # (name, transformer, columns)
#         ("temperatureFirstHalfPlanetRotation", make_pipeline(IterativeImputer(missing_values=-999.0), FunctionTransformer(lambda f: (f - 32) / 1.8, feature_names_out="one-to-one"), RobustScaler()), ['temperatureFirstHalfPlanetRotation']), # convert from Fahrenheit to Celsius
#         ("temperatureSecondHalfPlanetRotation", StandardScaler(), ['temperatureSecondHalfPlanetRotation']), # pass through the column unchanged
#         ("waterStreamDistanceX", make_pipeline(FunctionTransformer(lambda f: f * 0.3048, feature_names_out="one-to-one"), StandardScaler()), ['waterStreamDistanceX']), # convert from feet to meters
#         ("waterStreamDistanceY", StandardScaler(), ['waterStreamDistanceY']), # pass through the column unchanged
#         ("planetSection", OneHotEncoder(handle_unknown = "ignore"), ['planetSection']), # one-hot encode the planetSection column
#         ("cover", OneHotEncoder(handle_unknown='error', drop='first'), ['cover']), # one-hot encode the cover column and drop the first column (the one with the missing values == 0)
#         ("climaticZone", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), ['climaticZone']), # ordinal encode the climaticZone column TODO: drop category 3? what to do? only one row has a 3
#         ("geoZone", OneHotEncoder(handle_unknown = "ignore"), ['geoZone']), # one-hot encode the geoZone column TODO: drop category 5?
#         ("rockSize", OneHotEncoder(handle_unknown='ignore', drop='first'), ['rockSize']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0)
#         ("magmaConcentrationDistance", OneHotEncoder(handle_unknown = "ignore"), ['magmaConcentrationDistance']), # one-hot encode the rockSize column and drop the first column (the one with the missing values == 0) TODO: use Ordinal Encoder?
#         ("mineralDensity", make_pipeline(IterativeImputer(missing_values=-999.0), RobustScaler()), ['mineralDensity']), # pass through the column unchanged
#         ("detectionDepth", StandardScaler(), ['detectionDepth']), # pass through the column unchanged TODO: convert km to m?
#         ("longitude", StandardScaler(), ['longitude']), # pass through the column unchanged TODO: values > 360? do x - 360
#     ],
#     verbose_feature_names_out=False, remainder='passthrough'
# )

# def euclidean_distance(x, y):
#     res = []
#     for i in range(len(x)):
#         res.append(math.sqrt(x[i]**2 + y[i]**2))
#     return np.array(res)

# class CreateVariables(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         pass
        
#     def fit(self, X, y = None):
#         return self

#     def transform(self, X, y = None):
#         # waterSteamDistance
#         waterStreamDistanceX = X[:,2]
#         waterStreamDistanceY = X[:,3]
#         waterStreamDistance = euclidean_distance(waterStreamDistanceX, waterStreamDistanceY)
#         X = np.append(X, waterStreamDistance.reshape(-1, 1), axis=1)

#         # temperature (both planet rotations)
#         temperatureFirstHalfPlanetRotation = X[:,0]
#         temperatureSecondHalfPlanetRotation = X[:,1]
#         meanTemperature = (temperatureFirstHalfPlanetRotation + temperatureSecondHalfPlanetRotation)/2
#         X = np.append(X, meanTemperature.reshape(-1, 1), axis=1)

#         return X

# pipe = Pipeline([
#     ('preprocessor', preprocessor),
#     ('create_variables', CreateVariables())
# ])

# preprocessed_data = pipe.fit_transform(X, y)
# reduced_data = PCA(n_components=3).fit_transform(preprocessed_data, y)

# print(preprocessed_data.shape)
# print(reduced_data.shape)

# # reduced_data

# # Create a dataframe with the reduced data and the labels
# reduced_data_df = pd.DataFrame(reduced_data, columns=['PC1', 'PC2', 'PC3'])
# reduced_data_df['mineralDensity'] = y

# # Create an interactive 3D scatter plot of the PCA results with the color of the mineral density with a legend in plotly express
# import plotly.express as px

# fig = px.scatter_3d(
#     reduced_data_df,
#     x='PC1',
#     y='PC2',
#     z='PC3',
#     color='mineralDensity',
#     color_continuous_scale=px.colors.sequential.Viridis,
#     opacity=0.8,
#     title='PCA of the data'
# )
# fig.show()