# Predicción de Litologías con Aprendizaje Automático

Este proyecto tiene como objetivo predecir litologías a partir de datos geológicos usando técnicas de imputación, transformación de señales y modelos de clasificación. A continuación se detallan los pasos realizados y se presenta una demostración funcional del código.


## Importación de Librerías

In [19]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBClassifier

import pywt
import pickle
import precond
import wavelet_transform
import feature_augmentation
import validation
import imputation

from datetime import datetime


## Carga del Dataset y Estadísticas

In [14]:
# Start the wall-clock timer; the final cell subtracts this from its own
# timestamp to report the total run duration.
start_time = datetime.now()

# Read the semicolon-delimited training set and keep a separate deep copy
# in `train_bkp`.
train = pd.read_csv(filepath_or_buffer='train.csv', sep=';')
train_bkp = train.copy(deep=True)

# Last expression of the cell: the notebook renders the summary statistics.
train.describe()


Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,ROP,DTS,DCAL,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,FORCE_2020_LITHOFACIES_LITHOLOGY,FORCE_2020_LITHOFACIES_CONFIDENCE
count,1170511.0,1159736.0,1159736.0,1159736.0,1082634.0,630650.0,1131518.0,1159496.0,1009242.0,1170511.0,...,535071.0,174613.0,298833.0,987857.0,316151.0,176160.0,192325.0,327427.0,1170511.0,1170332.0
mean,2184.087,485631.0,6681276.0,-2138.527,13.18568,10.694664,4.986978,10.69103,2.284987,70.9137,...,137.367965,204.655019,1.223849,0.012196,1.216329,7.796809,23.505069,-95.779496,61385.98,1.164258
std,997.1821,34556.41,128152.4,970.9426,3.798907,100.642597,54.67269,113.948,0.2532835,34.23149,...,1539.383558,71.068461,54.372859,7.477798,10.180834,89.741807,22.6328,348.98003,13891.7,0.445118
min,136.086,426898.8,6406641.0,-5395.563,2.344,0.0001,-0.008418695,0.03170056,0.7209712,0.1092843,...,-0.117977,69.163177,-12.215459,-7429.338867,0.125818,0.056586,-999.250122,-999.900024,30000.0,1.0
25%,1418.597,454799.6,6591327.0,-2811.502,9.429712,0.85412,0.9140862,0.9102396,2.092203,47.62722,...,5.628,155.936707,0.148438,-0.009253,0.143792,0.891272,11.281895,0.687681,65000.0,1.0
50%,2076.605,476920.3,6737311.0,-2042.785,12.55575,1.39902,1.443584,1.439,2.321228,68.36763,...,17.799999,188.200653,0.55732,0.001752,0.155774,1.967156,20.131153,1.366672,65000.0,1.0
75%,2864.393,520153.2,6784886.0,-1391.866,16.71075,3.099348,2.68093,2.55722,2.48858,89.03551,...,34.812794,224.645081,1.299655,0.021702,1.168307,5.084564,31.138481,3.42079,65000.0,1.0
max,5436.632,572632.8,6856661.0,-111.086,28.279,2193.904541,1988.616,1999.887,3.45782,1076.964,...,47015.125,676.578125,10011.422852,2.836938,185.730927,10000.0,742.797852,35930.671875,99000.0,3.0


## Preprocesamiento e Imputación

Se realiza el preprocesamiento y la imputación de datos faltantes. Se codifican las variables categóricas y se seleccionan las características relevantes.


In [15]:
# Preprocessing
# NOTE: every statement in this cell is commented out, so nothing here runs;
# the lines are kept as a record of the original pipeline.
# train = precond.precond_train(train)

# Encoding: fit a LabelEncoder on GROUP, replace the column with its codes,
# and persist the fitted encoder to 'labelencoder.pkl'.
# le = LabelEncoder()
# le.fit(train['GROUP'])
# train['GROUP'] = le.transform(train['GROUP'])
# pickle.dump(le, open('labelencoder.pkl', 'wb'))

# Column reduction: drop WELL and both label columns before imputation,
# then put WELL back on the imputed frame.
# train_imp = train.drop(['WELL','FORCE_2020_LITHOFACIES_LITHOLOGY','FORCE_2020_LITHOFACIES_CONFIDENCE'], axis=1)
# train_imp, model_imp_list = imputation.imputer_train(train_imp, "train_imp.csv")
# train_imp['WELL'] = train['WELL']


## Selección de Características, Transformaciones y Aumento

Se seleccionan variables clave, se aplican transformaciones wavelet y se realiza el aumento de características.


In [20]:
# Columns selected from the imputed training frame by the (currently
# disabled) selection step below.
features = [
    'WELL',
    'X_LOC', 'Y_LOC', 'Z_LOC',
    'RDEP', 'GROUP', 'CALI', 'GR', 'RHOB',
    'NPHI', 'PEF', 'DTC', 'SP', 'DRHO',
]
# train_imp = train_imp[features]
# train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'] = train['FORCE_2020_LITHOFACIES_LITHOLOGY']
# train_imp = wavelet_transform.wavelet_transform(train_imp)

# Shoulder-effect removal: blank the label wherever the next row's label
# differs from the previous row's label, then drop those rows.
# train_imp.loc[train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'].shift(-1) != train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY'].shift(1), 'FORCE_2020_LITHOFACIES_LITHOLOGY'] = np.nan
# train_imp.dropna(subset=['FORCE_2020_LITHOFACIES_LITHOLOGY'], inplace=True)

# Augmentation
# train_imp_aug = train_imp.drop(['WELL','FORCE_2020_LITHOFACIES_LITHOLOGY'], axis=1)
# train_imp_aug = feature_augmentation.feat_aug(train_imp_aug, train_imp['WELL'], train_imp['Z_LOC'])
# train_imp_aug = feature_augmentation.poly_feat(train_imp_aug)


## Entrenamiento del Modelo

Se entrena un modelo de clasificación XGBoost utilizando una división entrenamiento/prueba del conjunto de datos.


In [17]:
# NOTE: this whole training cell is commented out, so no model is fitted and
# 'model.pkl' is not written by this run; the prediction cell further down
# loads 'model.pkl', so that file has to exist already.
# X = train_imp_aug
# y = train_imp['FORCE_2020_LITHOFACIES_LITHOLOGY']
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

# model = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=300, random_state=0)
# model.fit(X_train, y_train)
# print("Model trained")

# Score the held-out split with the project's validation helper, then persist the model.
# accuracy_split_test, pen_rel_split_test = validation.validation(X_test, y_test, train_imp, model, 'penalty_matrix.npy')
# pickle.dump(model, open('model.pkl', 'wb'))


In [27]:
# Side-effect import: it must run before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Label columns are never fed to the imputer: using the target (or its
# confidence) to reconstruct missing feature values leaks label information
# into the features, and those columns are not available at prediction time.
# This matches the original (commented-out) pipeline, which drops the same
# two columns before imputation.
label_cols = ['FORCE_2020_LITHOFACIES_LITHOLOGY', 'FORCE_2020_LITHOFACIES_CONFIDENCE']

# `train` is the DataFrame loaded from 'train.csv' in an earlier cell.
# NOTE(review): WELL is dropped here and not restored, while the `features`
# list defined earlier includes 'WELL' - confirm before re-enabling the
# selection step that indexes train_imp with it.
train_features = train.drop(columns=['WELL'])
numeric_cols = train_features.select_dtypes(include='number').columns
# Keep the pandas Index type and the original column order.
num_cols = numeric_cols[~numeric_cols.isin(label_cols)]

# Impute the numeric feature columns only.
imputer = IterativeImputer(random_state=0)
train_imp_num = pd.DataFrame(imputer.fit_transform(train_features[num_cols]), columns=num_cols)

# Pass-through columns: everything that was not imputed, i.e. the non-numeric
# columns plus the label columns. The index is reset so the rows line up with
# the fresh positional index of `train_imp_num`.
non_num_cols = train_features.drop(columns=num_cols).reset_index(drop=True)
train_imp = pd.concat([train_imp_num, non_num_cols], axis=1)

# Persist the imputed frame for later runs.
train_imp.to_csv('train_imp.csv', sep=';', index=False)


## Predicción en Conjunto de Prueba Cerrado

Se carga el conjunto cerrado, se preprocesa y se predicen las litologías utilizando el modelo entrenado.


In [None]:
# ==================== PREDICTION ====================
# NOTE(review): `test` and `test_imp_aug` are not created anywhere in the
# visible notebook - the closed test set has to be loaded and run through the
# same preprocessing/augmentation as the training data before this cell.
#
# Load the trained classifier. The context manager closes the file handle
# even if unpickling fails. pickle can execute arbitrary code while loading,
# so only open a 'model.pkl' produced by this project.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
prediction = model.predict(test_imp_aug)

# Refinement: smooth isolated predictions using the neighbouring rows.
test['PREDICTION'] = prediction
# Where the value two rows ahead equals the value one row back, copy the
# value from two rows ahead into the current row.
test.loc[test['PREDICTION'].shift(-2) == test['PREDICTION'].shift(1), 'PREDICTION'] = test['PREDICTION'].shift(-2)
# Same idea with the immediate next row; this pass sees the result of the
# previous one.
test.loc[test['PREDICTION'].shift(-1) == test['PREDICTION'].shift(1), 'PREDICTION'] = test['PREDICTION'].shift(-1)
# Put the raw model output back on the last row. A single positional
# indexing call on the frame is used instead of chained assignment
# (`test['PREDICTION'].iloc[-1] = ...`), which warns and does not write
# through to `test` under pandas Copy-on-Write.
test.iloc[-1, test.columns.get_loc('PREDICTION')] = prediction[-1]

# Save the predictions as a one-column CSV with a 'lithology' header.
np.savetxt('GIR_TEAM_final_submission.csv', test['PREDICTION'].values, header='lithology', comments='', fmt='%i')

# ==================== SAVE PLOT ====================
# `plt` is not imported anywhere else in the notebook, so import it here.
import matplotlib.pyplot as plt

plt.figure(figsize=(4, 12))
plt.scatter(test['PREDICTION'], test['Z_LOC'], c=test['PREDICTION'], cmap='tab20', s=10)
plt.gca().invert_yaxis()
plt.xlabel("Predicción de litología")
plt.ylabel("Profundidad (Z_LOC)")
plt.title("Predicción de Fases Litológicas")
plt.grid(True)
plt.tight_layout()
plt.savefig("prediction_plot.png")
# Close the figure so it is released once the image is written.
plt.close()

# ==================== FINAL ====================
# `start_time` was recorded in the dataset-loading cell.
end_time = datetime.now()
print('Duración total: {}'.format(end_time - start_time))
print("✅ CSV guardado como: GIR_TEAM_final_submission.csv")
print("✅ Imagen guardada como: prediction_plot.png")