"La polarización de jugo y fibra de una muestra de caña depende del estrato en el que fue cultivado y su variedad"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

# Decision Tree Variedad

In [2]:
dataset = pd.read_csv('dataset_bascula.csv')

In [3]:
data = dataset.drop(columns=['Semana','Mes', 'fecha', 'Grupo', 'Hora','% AR jugo', 'Glucobrix','Acidez', 'Estrato',
                             'Brix jugo', 'Pureza jugo', 'Humedad caña', 'Calidad Caña', 'Brix caña', 'Pza. caña',
                             '% Jugo', 'Rdto. Pol Bascula', '% Fibra caña','ENVIO'])

In [4]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [6]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [8]:
print(classifier.predict(sc.transform([[13.56, 11.04]])))

['CG 9846']


In [9]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['SP 791287' 'SP 716161']
 ['CG 02-163' 'CP 722086']
 ['CP 722086' 'CG 9810']
 ...
 ['CP 722086' 'CG 9878']
 ['CG 9878' 'CP 722086']
 ['SP 791287' 'SP 716161']]


In [10]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 1  0  0 ...  0  1  5]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  1]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 3  0  0 ...  0  7 17]
 [ 6  1  1 ...  0 12 31]]


0.15208070617906683

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   CG 00-033       0.01      0.02      0.01        59
   CG 00-102       0.00      0.00      0.00         5
    CG 01-53       0.00      0.00      0.00         7
   CG 02-163       0.07      0.09      0.08       540
   CG 03-025       0.00      0.00      0.00         3
 CG 05-12517       0.00      0.00      0.00         1
     CG 9640       0.00      0.00      0.00         4
     CG 9810       0.02      0.03      0.03        90
     CG 9846       0.04      0.04      0.04       268
     CG 9878       0.08      0.08      0.08       343
  CP 01-1341       0.00      0.00      0.00        51
  CP 01-1564       0.00      0.00      0.00         3
   CP 021564       0.00      0.00      0.00         2
   CP 722086       0.33      0.32      0.32      2584
   CP 731547       0.14      0.14      0.14       982
   CP 881165       0.07      0.07      0.07       432
   CP 892143       0.00      0.00      0.00         5
   CP 931017       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree Estrato

In [12]:
data2 = dataset.drop(columns=['Semana','Mes', 'fecha', 'Grupo', 'Hora','% AR jugo', 'Glucobrix','Acidez', 'Variedad',
                             'Brix jugo', 'Pureza jugo', 'Humedad caña', 'Calidad Caña', 'Brix caña', 'Pza. caña',
                             '% Jugo', 'Rdto. Pol Bascula', '% Fibra caña','ENVIO'])

In [13]:
data2 = data2[['Pol jugo', 'Pol caña', 'Estrato' ]]

In [14]:
data2 = data2.dropna()

In [15]:
X = data2.iloc[:, :-1].values
y = data2.iloc[:, -1].values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

In [17]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [18]:
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [19]:
print(classifier.predict(sc.transform([[12.91, 10.62]])))

['ALTO']


In [20]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['BAJO' 'BAJO']
 ['BAJO' 'BAJO']
 ['BAJO' 'BAJO']
 ...
 ['BAJO' 'BAJO']
 ['MEDIO' 'BAJO']
 ['BAJO' 'BAJO']]


In [21]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[   0   32    7]
 [  38 5668  984]
 [   4  968  163]]


In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        ALTO       0.00      0.00      0.00        39
        BAJO       0.85      0.85      0.85      6690
       MEDIO       0.14      0.14      0.14      1135

    accuracy                           0.74      7864
   macro avg       0.33      0.33      0.33      7864
weighted avg       0.74      0.74      0.74      7864



In [23]:
le = preprocessing.OneHotEncoder()

y_test_enc = le.fit_transform(y_test.reshape(-1, 1))
y_pred_enc = le.fit_transform(y_pred.reshape(-1, 1))

roc_auc_score(y_test_enc.toarray(), y_pred_enc.toarray(), multi_class='ovo')

0.4977363079039938