# Predizendo a safra atual de 2020

In [1]:
# Importando as bibliotecas
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

Lendo os dados e usando a safra 2018-2019 como dados de treino e a safra 2020 como dados de teste.

In [2]:
df_train = pd.read_csv('Safra_2018-2019.csv')
df_test = pd.read_csv('Safra_2020.csv')

In [3]:
df_train

Unnamed: 0.1,Unnamed: 0,Identificador_Agricultor,Estimativa_de_Insetos,Tipo_de_Cultivo,Tipo_de_Solo,Categoria_Pesticida,Doses_Semana,Semanas_Utilizando,Semanas_Sem_Uso,Temporada,dano_na_plantacao
0,0,SCROP00001,188,1,0,1,0,0.0,0,1,0
1,1,SCROP00002,209,1,0,1,0,0.0,0,2,1
2,2,SCROP00003,257,1,0,1,0,0.0,0,2,1
3,3,SCROP00004,257,1,1,1,0,0.0,0,2,1
4,4,SCROP00005,342,1,0,1,0,0.0,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
79995,79995,SCROP79996,209,0,1,2,20,12.0,6,2,0
79996,79996,SCROP79997,232,0,1,2,20,12.0,9,1,0
79997,79997,SCROP79998,256,0,1,2,10,14.0,10,1,0
79998,79998,SCROP79999,256,0,1,2,10,16.0,8,2,0


In [4]:
df_test

Unnamed: 0.1,Unnamed: 0,Identificador_Agricultor,Estimativa_de_Insetos,Tipo_de_Cultivo,Tipo_de_Solo,Categoria_Pesticida,Doses_Semana,Semanas_Utilizando,Semanas_Sem_Uso,Temporada
0,80000,SCROP80001,256,0,1,2,35,11.0,9,1
1,80001,SCROP80002,283,0,1,2,5,5.0,6,1
2,80002,SCROP80003,283,0,1,2,15,15.0,5,2
3,80003,SCROP80004,283,0,1,2,15,,7,2
4,80004,SCROP80005,283,0,1,2,15,19.0,2,1
...,...,...,...,...,...,...,...,...,...,...
8853,88853,SCROP88854,3337,1,0,2,10,12.0,44,3
8854,88854,SCROP88855,3516,1,0,2,10,20.0,38,1
8855,88855,SCROP88856,3516,1,0,2,15,40.0,8,2
8856,88856,SCROP88857,3702,1,0,2,10,25.0,18,3


Obtendo informação dos dados de treino e teste.

In [5]:
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                80000 non-null  int64  
 1   Identificador_Agricultor  80000 non-null  object 
 2   Estimativa_de_Insetos     80000 non-null  int64  
 3   Tipo_de_Cultivo           80000 non-null  int64  
 4   Tipo_de_Solo              80000 non-null  int64  
 5   Categoria_Pesticida       80000 non-null  int64  
 6   Doses_Semana              80000 non-null  int64  
 7   Semanas_Utilizando        71945 non-null  float64
 8   Semanas_Sem_Uso           80000 non-null  int64  
 9   Temporada                 80000 non-null  int64  
 10  dano_na_plantacao         80000 non-null  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 6.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8858 entries, 0 to 8857
Data columns (total 10 columns):
 #   Co

# Pré - Processamento dos dados

Selecionando apenas as features numéricas.

In [6]:
df_train_numeric = df_train.select_dtypes(include=['float64','int64'])
df_test_numeric = df_test.select_dtypes(include=['float64','int64'])

Contando a quantidade de valores nulos nos dados.

In [7]:
print('Valores nulos nos dados de treino: \n')
print(df_train_numeric.isnull().sum())
print('\n')
print('Valores nulos nos dados de teste: \n')
print(df_test_numeric.isnull().sum())

Valores nulos nos dados de treino: 

Unnamed: 0                  0
Estimativa_de_Insetos       0
Tipo_de_Cultivo             0
Tipo_de_Solo                0
Categoria_Pesticida         0
Doses_Semana                0
Semanas_Utilizando       8055
Semanas_Sem_Uso             0
Temporada                   0
dano_na_plantacao           0
dtype: int64


Valores nulos nos dados de teste: 

Unnamed: 0                 0
Estimativa_de_Insetos      0
Tipo_de_Cultivo            0
Tipo_de_Solo               0
Categoria_Pesticida        0
Doses_Semana               0
Semanas_Utilizando       945
Semanas_Sem_Uso            0
Temporada                  0
dtype: int64


Trocando os valores nulos pelo valor médio da coluna e checando se ainda há valores nulos.

In [8]:
df_train_numeric["Semanas_Utilizando"] = df_train_numeric["Semanas_Utilizando"].fillna(df_train_numeric["Semanas_Utilizando"].mean())
print('Valores nulos nos dados de treino: \n')
print(df_train_numeric.isnull().sum())

Valores nulos nos dados de treino: 

Unnamed: 0               0
Estimativa_de_Insetos    0
Tipo_de_Cultivo          0
Tipo_de_Solo             0
Categoria_Pesticida      0
Doses_Semana             0
Semanas_Utilizando       0
Semanas_Sem_Uso          0
Temporada                0
dano_na_plantacao        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Fazendo o mesmo para os dados de teste.

In [9]:
df_test_numeric["Semanas_Utilizando"] = df_test_numeric["Semanas_Utilizando"].fillna(df_test_numeric["Semanas_Utilizando"].mean())
print('Valores nulos nos dados de teste: \n')
print(df_test_numeric.isnull().sum())

Valores nulos nos dados de teste: 

Unnamed: 0               0
Estimativa_de_Insetos    0
Tipo_de_Cultivo          0
Tipo_de_Solo             0
Categoria_Pesticida      0
Doses_Semana             0
Semanas_Utilizando       0
Semanas_Sem_Uso          0
Temporada                0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Obsevrvando a correlação dos dados.

In [10]:
dados_corr = df_train_numeric.corr()
dados_corr

Unnamed: 0.1,Unnamed: 0,Estimativa_de_Insetos,Tipo_de_Cultivo,Tipo_de_Solo,Categoria_Pesticida,Doses_Semana,Semanas_Utilizando,Semanas_Sem_Uso,Temporada,dano_na_plantacao
Unnamed: 0,1.0,0.003842,0.002163,-0.003268,-0.00664,-0.004104,-0.004867,0.005664,-0.00471,-0.001463
Estimativa_de_Insetos,0.003842,1.0,-0.090063,-0.056147,-0.080734,-0.101807,0.414074,0.298849,0.002964,0.20329
Tipo_de_Cultivo,0.002163,-0.090063,1.0,-0.278775,0.238823,-0.190884,-0.042678,-0.235928,0.002792,-0.017228
Tipo_de_Solo,-0.003268,-0.056147,-0.278775,1.0,0.035994,0.07658,-0.064923,0.006122,0.002066,-0.021176
Categoria_Pesticida,-0.00664,-0.080734,0.238823,0.035994,1.0,-0.002913,0.326133,-0.556253,0.001175,0.172943
Doses_Semana,-0.004104,-0.101807,-0.190884,0.07658,-0.002913,1.0,0.117466,-0.034144,-0.002251,-0.030476
Semanas_Utilizando,-0.004867,0.414074,-0.042678,-0.064923,0.326133,0.117466,1.0,-0.424653,0.00187,0.221095
Semanas_Sem_Uso,0.005664,0.298849,-0.235928,0.006122,-0.556253,-0.034144,-0.424653,1.0,0.00322,-0.133115
Temporada,-0.00471,0.002964,0.002792,0.002066,0.001175,-0.002251,0.00187,0.00322,1.0,-0.002233
dano_na_plantacao,-0.001463,0.20329,-0.017228,-0.021176,0.172943,-0.030476,0.221095,-0.133115,-0.002233,1.0


Selecionando as features com maiores correlações em relação ao dano na plantação.

In [11]:
dados_corr['dano_na_plantacao'] > 0

Unnamed: 0               False
Estimativa_de_Insetos     True
Tipo_de_Cultivo          False
Tipo_de_Solo             False
Categoria_Pesticida       True
Doses_Semana             False
Semanas_Utilizando        True
Semanas_Sem_Uso          False
Temporada                False
dano_na_plantacao         True
Name: dano_na_plantacao, dtype: bool

Selecionando as features que retornaram true farão parte dos dados de treino e teste.

In [12]:
x_train = df_train_numeric[['Estimativa_de_Insetos', 'Categoria_Pesticida', 'Semanas_Utilizando']]
y_train = df_train_numeric[['dano_na_plantacao']]
x_test = df_test_numeric[['Estimativa_de_Insetos', 'Categoria_Pesticida', 'Semanas_Utilizando']]

Obtendo informação dos tipos dos dados que serão usados.

In [13]:
print('Informações de x_train:')
print(x_train.info(), '\n')
print('Informações de y_train:')
print(y_train.info(), '\n')
print('Informações de x_test:')
print(x_test.info())

Informações de x_train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Estimativa_de_Insetos  80000 non-null  int64  
 1   Categoria_Pesticida    80000 non-null  int64  
 2   Semanas_Utilizando     80000 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.8 MB
None 

Informações de y_train:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 1 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   dano_na_plantacao  80000 non-null  int64
dtypes: int64(1)
memory usage: 625.1 KB
None 

Informações de x_test:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8858 entries, 0 to 8857
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 

# Treinando os dados de treino :

Dividindo os dados de treino em treino e teste para testar as soluções.

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Separando 80% para treino e 20% para teste
X_train, X_test, Y_train, Y_test = train_test_split(x_train,y_train,test_size = 0.2,random_state=1)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(64000, 3)
(16000, 3)
(64000, 1)
(16000, 1)


Para saber qual o melhor modelo, vamos treinar vários tipos e obter qual faz a melhor pontuação.

# Regressão Logística

Implementando o modelo de Regressão Logística.

In [16]:
from sklearn.linear_model import LogisticRegression 

lr = LogisticRegression()

lr.fit(X_train,Y_train)



  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

Obtendo a pontuação do modelo .

In [17]:
print('Logistic Regression Score : ',lr.score(X_test,Y_test))

Logistic Regression Score :  0.8360625


# K-Nearest Neighbors 

Implementando o modelo de KNN.

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 7)

knn.fit(X_train,Y_train)

  """


KNeighborsClassifier(n_neighbors=7)

In [19]:
print('K-Nearest Neighbors Score : ',knn.score(X_test,Y_test))

K-Nearest Neighbors Score :  0.826625


# Support Vector Machine

Aplicando o modelo de SVM.

In [20]:
from sklearn.svm import SVC

svm = SVC(random_state = 1)

svm.fit(X_train,Y_train)

  return f(**kwargs)


SVC(random_state=1)

In [21]:
print('Super Vector Machine Score : ',svm.score(X_test,Y_test))

Super Vector Machine Score :  0.8360625


# Naive Bayes

Aplicando o modelo de Naive Bayes.

In [22]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()

nb.fit(X_train,Y_train)

  return f(**kwargs)


GaussianNB()

In [23]:
print('Naive Bayes Score : ',nb.score(X_test,Y_test))

Naive Bayes Score :  0.822375


# Decision Tree

Aplicando o modelo de Decision Tree.

In [24]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dt.fit(X_train,Y_train)

DecisionTreeClassifier()

In [25]:
print('Decision Tree Score : ',dt.score(X_test,Y_test))

Decision Tree Score :  0.832


# Random Forest

Aplicando o modelo Random Forest.

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 22,random_state = 40)

rf.fit(X_train,Y_train)

  """


RandomForestClassifier(n_estimators=22, random_state=40)

In [27]:
print('Random Forest Score : ',rf.score(X_test,Y_test))

Random Forest Score :  0.829875


# Prevendo nossos dados usando o melhor modelo

Vemos que os modelos de Regressão Logística e SVM obtiveram os scores mais altos.
Como o modelo de Regressão Logística faz os cálculos de maneira mais rápida que o SVM, vamos escolher o modelo de Regressão Logística para prever nossos dados.

In [28]:
from sklearn.linear_model import LogisticRegression 

lr = LogisticRegression()

lr.fit(x_train,y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [29]:
# Prevendo o modelo obtido.
prediction = lr.predict(x_test)

In [30]:
prediction

array([0, 0, 0, ..., 0, 0, 0])

Passando o resultado obtido na coluna 'dano_na_plantacao' e adicionando essa coluna nos dados de teste.

In [31]:
df_resposta = df_test

In [32]:
df_resposta['dano_na_plantacao'] = prediction

Obtendo a previsão da safra de 2020.

In [33]:
df_resposta

Unnamed: 0.1,Unnamed: 0,Identificador_Agricultor,Estimativa_de_Insetos,Tipo_de_Cultivo,Tipo_de_Solo,Categoria_Pesticida,Doses_Semana,Semanas_Utilizando,Semanas_Sem_Uso,Temporada,dano_na_plantacao
0,80000,SCROP80001,256,0,1,2,35,11.0,9,1,0
1,80001,SCROP80002,283,0,1,2,5,5.0,6,1,0
2,80002,SCROP80003,283,0,1,2,15,15.0,5,2,0
3,80003,SCROP80004,283,0,1,2,15,,7,2,0
4,80004,SCROP80005,283,0,1,2,15,19.0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...
8853,88853,SCROP88854,3337,1,0,2,10,12.0,44,3,0
8854,88854,SCROP88855,3516,1,0,2,10,20.0,38,1,0
8855,88855,SCROP88856,3516,1,0,2,15,40.0,8,2,0
8856,88856,SCROP88857,3702,1,0,2,10,25.0,18,3,0


Salvando o dataframe de resposta.

In [34]:
df_resposta.to_csv('answer.csv', index=False, header=True)