# Modelo de predicción de calidad de mineral de hierro en un proceso de minería

> ### El objetivo del modelo es predecir la concentración de sílice en el mineral al final de un proceso de extracción para saber su pureza

### Importo las librerías

In [182]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import re
import time
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Leo el dataset original y lo exploro

In [2]:
# Load the flotation-plant CSV; its "Unnamed: 0" column is used as the row index.
Data = pd.read_csv("../data/MiningProcess_Flotation_Plant_Database.csv", index_col="Unnamed: 0")

In [3]:
# Preview the first 7 rows to check column names and value formats.
Data.head(7)

Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
0,55.2,16.98,3019.53,557.434,395.713,10.0664,1.74,249.214,253.235,250.576,...,250.884,457.396,432.962,424.954,443.558,502.255,446.37,523.344,66.91,1.31
1,55.2,16.98,3024.41,563.965,397.383,10.0672,1.74,249.719,250.532,250.862,...,248.994,451.891,429.56,432.939,448.086,496.363,445.922,498.075,66.91,1.31
2,55.2,16.98,3043.46,568.054,399.668,10.068,1.74,249.741,247.874,250.313,...,248.071,451.24,468.927,434.61,449.688,484.411,447.826,458.567,66.91,1.31
3,55.2,16.98,3047.36,568.665,397.939,10.0689,1.74,249.917,254.487,250.049,...,251.147,452.441,458.165,442.865,446.21,471.411,437.69,427.669,66.91,1.31
4,55.2,16.98,3033.69,558.167,400.254,10.0697,1.74,250.203,252.136,249.895,...,248.928,452.441,452.9,450.523,453.67,462.598,443.682,425.679,66.91,1.31
5,55.2,16.98,3079.1,564.697,396.533,10.0705,1.74,250.73,248.906,249.521,...,251.873,444.384,443.269,460.449,439.92,451.588,433.539,425.458,66.91,1.31
6,55.2,16.98,3127.79,566.467,392.9,10.0713,1.74,250.313,252.202,249.082,...,253.477,446.185,444.571,452.306,431.328,443.548,444.575,431.251,66.91,1.31


In [4]:
# Preview the last rows (pandas shows 5 by default) to check the end of the file.
Data.tail()

Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
737448,49.75,23.2,2710.94,441.052,386.57,9.62129,1.65365,302.344,298.786,299.163,...,313.695,392.16,430.702,872.008,418.725,497.548,446.357,416.892,64.27,1.71
737449,49.75,23.2,2692.01,473.436,384.939,9.62063,1.65352,303.013,301.879,299.487,...,236.7,401.505,404.616,864.409,418.377,506.398,372.995,426.337,64.27,1.71
737450,49.75,23.2,2692.2,500.488,383.496,9.61874,1.65338,303.662,307.397,299.487,...,225.879,408.899,399.316,867.598,419.531,503.414,336.035,433.13,64.27,1.71
737451,49.75,23.2,1164.12,491.548,384.976,9.61686,1.65324,302.55,301.959,298.045,...,308.115,405.107,466.832,876.591,407.299,502.301,340.844,433.966,64.27,1.71
737452,49.75,23.2,1164.12,468.019,384.801,9.61497,1.6531,300.355,292.865,298.625,...,308.115,413.754,514.143,881.323,378.969,500.1,374.354,441.182,64.27,1.71


In [5]:
# (rows, columns) of the loaded frame.
Data.shape

(737453, 23)

In [6]:
# Per-column dtype and non-null count, to spot missing values or non-numeric columns.
Data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737453 entries, 0 to 737452
Data columns (total 23 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   % Iron Feed                   737453 non-null  float64
 1   % Silica Feed                 737453 non-null  float64
 2   Starch Flow                   737453 non-null  float64
 3   Amina Flow                    737453 non-null  float64
 4   Ore Pulp Flow                 737453 non-null  float64
 5   Ore Pulp pH                   737453 non-null  float64
 6   Ore Pulp Density              737453 non-null  float64
 7   Flotation Column 01 Air Flow  737453 non-null  float64
 8   Flotation Column 02 Air Flow  737453 non-null  float64
 9   Flotation Column 03 Air Flow  737453 non-null  float64
 10  Flotation Column 04 Air Flow  737453 non-null  float64
 11  Flotation Column 05 Air Flow  737453 non-null  float64
 12  Flotation Column 06 Air Flow  737453 non-nul

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
Data.describe()

Unnamed: 0,% Iron Feed,% Silica Feed,Starch Flow,Amina Flow,Ore Pulp Flow,Ore Pulp pH,Ore Pulp Density,Flotation Column 01 Air Flow,Flotation Column 02 Air Flow,Flotation Column 03 Air Flow,...,Flotation Column 07 Air Flow,Flotation Column 01 Level,Flotation Column 02 Level,Flotation Column 03 Level,Flotation Column 04 Level,Flotation Column 05 Level,Flotation Column 06 Level,Flotation Column 07 Level,% Iron Concentrate,% Silica Concentrate
count,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,...,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0,737453.0
mean,56.294739,14.651716,2869.140569,488.144697,397.578372,9.767639,1.68038,280.151856,277.159965,281.082397,...,290.754856,520.244823,522.649555,531.352662,420.320973,425.251706,429.941018,421.021231,65.050068,2.326763
std,5.157744,6.807439,1215.203734,91.230534,9.699785,0.387007,0.069249,29.621288,30.149357,28.558268,...,28.670105,131.014924,128.16505,150.842164,91.794432,84.535822,89.862225,84.891491,1.118645,1.125554
min,42.74,1.31,0.002026,241.669,376.249,8.75334,1.51982,175.51,175.156,176.469,...,185.962,149.218,210.752,126.255,162.201,166.991,155.841,175.349,62.05,0.6
25%,52.67,8.94,2076.32,431.796,394.264,9.52736,1.64731,250.281,250.457,250.855,...,256.302,416.978,441.883,411.325,356.679,357.653,358.497,356.772,64.37,1.44
50%,56.08,13.85,3018.43,504.393,399.249,9.7981,1.6976,299.344,296.223,298.696,...,299.011,491.878,495.956,494.318,411.974,408.773,424.664575,411.065,65.21,2.0
75%,59.72,19.6,3727.73,553.257,402.968,10.038,1.72833,300.149,300.69,300.382,...,301.904,594.114,595.464,601.249,485.549,484.329,492.684,476.465,65.86,3.01
max,65.78,33.4,6300.23,739.538,418.641,10.8081,1.85325,373.871,375.992,364.346,...,371.593,862.274,828.919,886.822,680.359,675.644,698.861,659.902,68.01,5.53


In [8]:
# Remove the "date" column in place; it is not used as a model feature.
Data.drop(columns=["date"], inplace=True)

> ### Convierto los datos a tipo numérico

In [9]:
# Replace the decimal comma with a decimal point in the target column so it
# can be cast to float afterwards.
#
# This is done as a single vectorized string operation instead of a Python
# loop over every row: the row-by-row version wrote through chained indexing
# (Data[col][x] = ...), which raises SettingWithCopyWarning, does not modify
# the frame at all when pandas Copy-on-Write is enabled, and takes minutes on
# a frame of this size (hence the progress prints it needed).
# regex=False treats "," as a literal character rather than a pattern.
Data["% Silica Concentrate"] = Data["% Silica Concentrate"].str.replace(
    ",", ".", regex=False
)

In [10]:
# Cast every column to floating point so the whole frame is numeric.
Data = Data.astype(float)

### Creo un nuevo archivo CSV para no tener que repetir el proceso anterior

In [11]:
# Persist the cleaned frame so the conversion above does not have to be re-run.
# The row index is written as the first column (pandas default).
Data.to_csv(path_or_buf="../data/Dataset_definitivo.csv")