# Predicción de Litologías con Aprendizaje Automático

Este proyecto tiene como objetivo predecir litologías a partir de datos geológicos usando técnicas de imputación, transformación de señales y modelos de clasificación. A continuación se detallan los pasos realizados y se presenta una demostración funcional del código.


## Importación de Librerías

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBClassifier

import pywt
import pickle
import precond
import wavelet_transform
import feature_augmentation
import validation
import imputation

from datetime import datetime


ModuleNotFoundError: No module named 'xgboost'

## Carga del Dataset y Estadísticas

In [None]:
# Start the wall-clock timer; the prediction cell prints the elapsed time.
start_time = datetime.now()

# Load the semicolon-delimited training set and keep an independent copy of
# the raw frame before any later step modifies `train`.
train = pd.read_csv('train.csv', delimiter=';')
train_bkp = train.copy(deep=True)

# Last expression of the cell: summary statistics shown as the cell output.
train.describe()


## Preprocesamiento e Imputación

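Se realiza el preprocesamiento y la imputación de datos faltantes. Se codifican las variables categóricas y se seleccionan las características relevantes. En esta demostración el código de esta sección está comentado: la celda de predicción reutiliza los artefactos ya guardados (`labelencoder.pkl` y `train_imp.csv`).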


In [None]:
# NOTE: every step in this cell is commented out. The prediction cell loads
# 'labelencoder.pkl' and 'train_imp.csv' from disk, so re-enable this cell
# only when those training artifacts have to be regenerated.

# Preprocessing of the raw training frame
# train = precond.precond_train(train)

# Label-encode the categorical GROUP column and persist the fitted encoder
# le = LabelEncoder()
# le.fit(train['GROUP'])
# train['GROUP'] = le.transform(train['GROUP'])
# pickle.dump(le, open('labelencoder.pkl', 'wb'))

# Drop the well id and the target/confidence columns before imputation,
# then re-attach the well id
# train_imp = train.drop(['WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','FORCE_2020_LITHOFACIES_CONFIDENCE'], axis=1)
# train_imp, model_imp_list = imputation.imputer_train(train_imp, "train_imp.csv")
# train_imp['WELL'] = train['WELL']


## Selección de Características, Transformaciones y Aumento

Se seleccionan variables clave, se aplican transformaciones wavelet y se realiza el aumento de características.


In [None]:
# Columns kept for modelling; the prediction cell also uses this list to
# select the test columns.
features = [
    'WELL',
    'X_LOC', 'Y_LOC', 'Z_LOC',
    'RDEP',
    'GROUP',
    'CALI', 'GR', 'RHOB', 'NPHI', 'PEF', 'DTC', 'SP', 'DRHO',
]

# The training-side steps below are commented out (disabled).

# Feature selection, target re-attachment and wavelet transform
# train_imp = train_imp[features]
# train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'] = train['FORCE_2020_LITHOFACIES_LITHOLOGY']
# train_imp = wavelet_transform.wavelet_transform(train_imp)

# Shoulder-effect removal: blank the label wherever the previous and the next
# sample carry different lithologies, then drop those rows
# train_imp.loc[train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'].shift(-1) != train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'].shift(1), 'FORCE_2020_LITHOFACIES_LITHOLOGY'] = np.nan
# train_imp.dropna(subset=['FORCE_2020_LITHOFACIES_LITHOLOGY'], inplace=True)

# Feature augmentation
# train_imp_aug = train_imp.drop(['WELL','FORCE_2020_LITHOFACIES_LITHOLOGY'], axis=1)
# train_imp_aug = feature_augmentation.feat_aug(train_imp_aug, train_imp['WELL'], train_imp['Z_LOC'])
# train_imp_aug = feature_augmentation.poly_feat(train_imp_aug)


## Entrenamiento del Modelo

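Se entrena un modelo de clasificación XGBoost utilizando una división entrenamiento/prueba del conjunto de datos. En esta demostración el entrenamiento está comentado y la celda de predicción carga el modelo ya guardado en `model.pkl`.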


In [None]:
# NOTE: training is commented out in this cell. The prediction cell loads the
# fitted classifier from 'model.pkl', the file the last line here would write.

# Stratified 80/20 split of the augmented features and lithology labels
# X = train_imp_aug
# y = train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY']
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

# Fit the XGBoost classifier
# model = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=300, random_state=0)
# model.fit(X_train, y_train)
# print("Model trained")

# Validate on the held-out split and persist the fitted model
# accuracy_split_test, pen_rel_split_test = validation.validation(X_test, y_test, train_imp, model, 'penalty_matrix.npy') 
# pickle.dump(model, open('model.pkl', 'wb'))


## Predicción en Conjunto de Prueba Cerrado

Se carga el conjunto cerrado, se preprocesa y se predicen las litologías utilizando el modelo entrenado.


In [None]:
# Closed-test inference: preprocess 'test.csv' with the artifacts saved on
# disk ('labelencoder.pkl', 'train_imp.csv', 'model.pkl'), predict the
# lithologies, smooth isolated predictions and write the submission file.
# Relies on `features` and `start_time` defined by earlier cells.
test = pd.read_csv('test.csv', sep=';')
test_bkp = test.copy()

test = precond.precond_test(test)

# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# artifacts produced by this project. The `with` block closes the file handle
# (the previous bare open() call never closed it).
with open('labelencoder.pkl', 'rb') as encoder_file:
    le = pickle.load(encoder_file)
test['GROUP'] = le.transform(test['GROUP'])

# Impute without the well identifier, using the saved training table.
test_imp = test.drop(['WELL'], axis=1)

train_imp_csv = pd.read_csv('train_imp.csv', sep=';')
test_imp = imputation.imputer_test(test_imp, train_imp_csv)

# Re-attach the well id, keep the modelling columns and apply the wavelet
# transform.
test_imp['WELL'] = test['WELL']
test_imp = test_imp[features]
test_imp = wavelet_transform.wavelet_transform(test_imp)

# Feature augmentation on the numeric columns.
test_imp_aug = test_imp.drop(['WELL'], axis=1)
test_imp_aug = feature_augmentation.feat_aug(test_imp_aug, test_imp['WELL'], test_imp['Z_LOC'])
test_imp_aug = feature_augmentation.poly_feat(test_imp_aug)

# Same security caveat as for the label encoder above.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
prediction = model.predict(test_imp_aug)

test['PREDICTION'] = prediction
# Smoothing pass 1: where the prediction two rows ahead equals the one a row
# back, the current row takes the value from two rows ahead.
test.loc[test['PREDICTION'].shift(-2) == test['PREDICTION'].shift(1), 'PREDICTION'] = test['PREDICTION'].shift(-2)
# Smoothing pass 2 (on the already-updated column): where both direct
# neighbours agree, the current row takes their value.
test.loc[test['PREDICTION'].shift(-1) == test['PREDICTION'].shift(1), 'PREDICTION'] = test['PREDICTION'].shift(-1)
# Write the raw model output back into the last row. A single positional
# .iloc call replaces the chained `test['PREDICTION'].iloc[-1] = ...`, which
# triggers chained-assignment warnings and does not modify `test` under pandas
# Copy-on-Write.
test.iloc[-1, test.columns.get_loc('PREDICTION')] = prediction[-1]

# One integer label per line under a single 'lithology' header.
np.savetxt('GIR_TEAM_final_submission.csv', test['PREDICTION'].values, header='lithology', comments='', fmt='%i')

end_time = datetime.now()
print(f'Duration: {end_time - start_time}')
