In [1]:
#importação das bilbiotecas
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np

In [2]:
#importdção dos dados
data = pd.read_csv('soybean.csv')
data

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,april,?,?,?,?,?,upper-areas,?,?,?,...,?,?,?,?,?,?,?,?,?,2-4-d-injury
679,april,lt-normal,?,lt-norm,?,diff-lst-year,scattered,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury
680,june,lt-normal,?,lt-norm,?,diff-lst-year,scattered,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury
681,april,lt-normal,?,lt-norm,?,same-lst-yr,whole-field,?,?,?,...,?,dna,?,?,?,?,?,?,rotted,herbicide-injury


In [3]:
#verificação das variaveis presentes em cada coluna
for n in range(len(data.columns)):
    col = data.columns[n]
    print(data.groupby(data[col]).size())
    print('--------------------------------')

date
?              1
april         26
august       131
july         118
june          93
may           75
october       90
september    149
dtype: int64
--------------------------------
plant-stand
?             36
lt-normal    293
normal       354
dtype: int64
--------------------------------
precip
?           38
gt-norm    459
lt-norm     74
norm       112
dtype: int64
--------------------------------
temp
?           30
gt-norm    199
lt-norm     80
norm       374
dtype: int64
--------------------------------
hail
?      121
no     127
yes    435
dtype: int64
--------------------------------
crop-hist
?                    16
diff-lst-year        65
same-lst-sev-yrs    218
same-lst-two-yrs    219
same-lst-yr         165
dtype: int64
--------------------------------
area-damaged
?                1
low-areas      227
scattered      123
upper-areas    145
whole-field    187
dtype: int64
--------------------------------
severity
?             121
minor         195
pot-severe    322
sev

In [4]:
#substituição das variaveis "?" pela moda
for n in range(len(data.columns)):
    col = data.columns[n]
    mode = data[col].mode()[0]
    data.loc[data[col] == '?', col] = mode
    print(data.groupby(data[col]).size())
    print('--------------------------------')

date
april         26
august       131
july         118
june          93
may           75
october       90
september    150
dtype: int64
--------------------------------
plant-stand
lt-normal    293
normal       390
dtype: int64
--------------------------------
precip
gt-norm    497
lt-norm     74
norm       112
dtype: int64
--------------------------------
temp
gt-norm    199
lt-norm     80
norm       404
dtype: int64
--------------------------------
hail
no     127
yes    556
dtype: int64
--------------------------------
crop-hist
diff-lst-year        65
same-lst-sev-yrs    218
same-lst-two-yrs    235
same-lst-yr         165
dtype: int64
--------------------------------
area-damaged
low-areas      228
scattered      123
upper-areas    145
whole-field    187
dtype: int64
--------------------------------
severity
minor         195
pot-severe    443
severe         45
dtype: int64
--------------------------------
seed-tmt
fungicide    222
none         426
other         35
dtype: int64
--

In [5]:
#separação dos dados em features e targets
features = data.iloc[:,:-1].values
target = data.iloc[:,-1].values

In [6]:
#categorização das variaveis features categóricas
le = LabelEncoder()
for n in range(features.shape[1]):
    features[:,n] = le.fit_transform(features[:,n])

In [7]:
#categorização das variaveis target
target = le.fit_transform(target)

In [8]:
#separação dos dados
x_train,x_test,y_train,y_test = train_test_split(features, target, test_size = 0.2, random_state = 42)

In [9]:
#mudança das variaveis targets para o modelo dummy, já que utilizaremos "categorical cross entropy"
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [10]:
#transformação do tipo dos dados features para os mesmos que os targets
x_train = x_train.astype('float64')
x_test = x_test.astype('float64')

In [11]:
print(type(x_train), x_train.dtype)
print(type(y_train), y_train.dtype)

<class 'numpy.ndarray'> float64
<class 'numpy.ndarray'> float64


In [12]:
#criação do modelo onde as camadas serão "empilhadas" em sequência
model = Sequential()

#35 pois é a quantidade de variaveis features que temos
model.add(Input(shape = (35,)))

model.add(Dense(units = 128, activation = 'relu'))
#dropout para evitar overfitting
model.add(Dropout(0.2))
model.add(Dense(units = 64, activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(units = 32, activation = 'relu'))
model.add(Dropout(0.2))
#total de classes é igual a 19 e como temos um caso de multiclasses a função de ativação nos retornará a classe com a maior probabilidade de ser
model.add(Dense(units = 19, activation = 'softmax'))

In [13]:
#compilação do modelo
#otimizador adam para a atualização dos pesos
#função de perda por conta do tipo de dados que queremos prever
#metricas a partir do quanto o modelo irá acertar
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [14]:
#treino do modelo
#epochs = 50 --> 50 rodadas de treino com todos os dados
#batch size = 10 --> os dados serão enviados em lotes de 10 em 10 registros
#validation data para verificar a acuracia do modelo
model.fit(x_train, y_train, epochs = 50, batch_size = 10, validation_data = (x_test,y_test))

Epoch 1/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 14ms/step - accuracy: 0.0972 - loss: 2.9751 - val_accuracy: 0.1971 - val_loss: 2.6468
Epoch 2/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2249 - loss: 2.6394 - val_accuracy: 0.4526 - val_loss: 2.2018
Epoch 3/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.3501 - loss: 2.2630 - val_accuracy: 0.5474 - val_loss: 1.7257
Epoch 4/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5047 - loss: 1.8146 - val_accuracy: 0.5985 - val_loss: 1.3301
Epoch 5/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5626 - loss: 1.5242 - val_accuracy: 0.6350 - val_loss: 1.1413
Epoch 6/50
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.5393 - loss: 1.4335 - val_accuracy: 0.7226 - val_loss: 0.9297
Epoch 7/50
[1m55/55[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x1cfbc4b0bd0>

In [25]:
#teste com valor especifico
test = x_test[50].reshape(1,-1)
test

array([[6., 0., 1., 1., 1., 1., 2., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 2., 3., 0., 0., 0., 1., 0., 3., 3., 1., 0., 0.,
        1., 0., 1.]])

In [19]:
y_test[50]

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [27]:
#previsão do valor
forecast = model.predict(test)
position = np.argmax(forecast)
probability = np.max(forecast)
forecast_class = le.classes_[position] 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   


In [31]:
print(f'A doença prevista para o conjunto de carcteristicas é {forecast_class}(estando na posição {position}) com a probabilidade de {probability:.2f}%')

A doença prevista para o conjunto de carcteristicas é brown-stem-rot(estando na posição 6) com a probabilidade de 0.94%


In [16]:
#salvando o modelo
model.save('DeepLearning.keras')