In [1]:
# Import modules
import numpy as np # Numeric operations
import pandas as pd # Data manipulation
import matplotlib.pyplot as plt # Plots
import seaborn as sns
import pickle

import warnings
warnings.filterwarnings("ignore")

## Location to models in api.

path='WEB MachineLearning model deployed with Flask/static/modelos/'

In [2]:
# Read the initial dataset
apy = pd.read_csv('apy.csv')
"""
Context
Historical data of Indian agricultural
production on various location acquired from the Indian government web page.
https://data.gov.in""";

In [3]:
# A Look in the dataframe
apy.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165


In [4]:
# Check Shape
apy.shape

(246091, 7)

In [41]:
# Check Variable types
apy.dtypes

State_Name        object
District_Name     object
Crop_Year          int64
Season            object
Crop              object
Area             float64
Production        object
dtype: object

In [6]:
# check for null values
apy.isnull().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [91]:
apy[apy['Production'] != '='].head(2)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1


In [7]:
#Years in apy dataset
#sorted((apy.Crop_Year.value_counts().index))

In [8]:
gdp = pd.read_csv('API_NY.GDP.MKTP.CD_DS2_en_csv_v2_247793.csv', skiprows=4)

In [9]:
gdp.head(3)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2390503000.0,2549721000.0,2534637000.0,2581564000.0,2649721000.0,2691620000.0,2646927000.0,2700559000.0,,
1,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,...,15856570000.0,17804280000.0,20001620000.0,20561050000.0,20484870000.0,19907110000.0,19362640000.0,20191760000.0,19362970000.0,
2,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,83799500000.0,111789700000.0,128052900000.0,136709900000.0,145712200000.0,116193600000.0,101123900000.0,122123800000.0,105751000000.0,


In [10]:
# Locate India
gdp.loc[gdp['Country Name'] == 'India']

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
107,India,IND,GDP (current US$),NY.GDP.MKTP.CD,37029880000.0,39232440000.0,42161480000.0,48421920000.0,56480290000.0,59554860000.0,...,1675615000000.0,1823050000000.0,1827638000000.0,1856722000000.0,2039127000000.0,2103588000000.0,2290432000000.0,2652551000000.0,2726323000000.0,


In [11]:
# Select only 1997-2015
start_year = gdp.columns.get_loc('1997')
end_year = gdp.columns.get_loc('2015')
# India is row 107
gdp_9715 = gdp.iloc[107, start_year:end_year+1]

In [12]:
# Create a Dataframe for gdp
gdp_9715_idx = gdp_9715.index[:]
gdp_dict = {'Year': gdp_9715_idx, 'GDP': gdp_9715}
gdp_df = pd.DataFrame(gdp_dict)
gdp_df = gdp_df.reset_index(drop=True)
gdp_df.head()

Unnamed: 0,Year,GDP
0,1997,415868000000.0
1,1998,421351000000.0
2,1999,458820000000.0
3,2000,468395000000.0
4,2001,485441000000.0


In [13]:
# Check if Crop_Year have all values from 1997 to 2015
apy_crop_year = apy['Crop_Year']
apy_crop_year.unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2010, 1997, 1998, 1999,
       2007, 2008, 2009, 2011, 2012, 2013, 2014, 2015], dtype=int64)

In [14]:
# Check Datatypes
gdp_df.dtypes

Year    object
GDP     object
dtype: object

In [15]:
# Convert year to int
gdp_df['Year'] = gdp_df['Year'].astype(int)

In [16]:
# Map all GDP values to the respective year
rename_dict = gdp_df.set_index('Year').to_dict()['GDP']
apy_crop_year = apy_crop_year.replace(rename_dict)
# Create the definitive GDP Dataframe
apy_crop_year_dict = {'GDP': apy_crop_year}
gdp_final = pd.DataFrame(apy_crop_year_dict)

In [17]:
# See the data type
gdp_final['GDP'].dtype

dtype('float64')

In [18]:
# Concatenate the DataFrames
india_crop_gdp_1997_2015 = pd.concat([apy, gdp_final], axis=1, sort=False)

In [19]:
# A look into DataFrame
india_crop_gdp_1997_2015.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,GDP
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000,468394900000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1,468394900000.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321,468394900000.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641,468394900000.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165,468394900000.0


In [20]:
# Save as .csv
india_crop_gdp_1997_2015.to_csv('india_crop_gdp_1997_2015.csv')

In [96]:
india_crop_gdp_1997_2015= pd.read_csv('india_crop_gdp_1997_2015.csv')
india_crop_gdp_1997_2015.columns

Index(['Unnamed: 0', 'State_Name', 'District_Name', 'Crop_Year', 'Season',
       'Crop', 'Area', 'Production', 'GDP'],
      dtype='object')

In [97]:
# Label Enconding
from sklearn import preprocessing
RelationItens ={}
for f in india_crop_gdp_1997_2015.columns:
    if f == 'Production':
        continue
    if india_crop_gdp_1997_2015[f].dtype =='object': 
        vet = []
        le = preprocessing.LabelEncoder()
        itens = (sorted(list(india_crop_gdp_1997_2015[f].value_counts().index)))
        le.fit(list(india_crop_gdp_1997_2015[f].values))
        itens_edited = le.transform(itens)
        vet.append([[a,b] for a,b in zip (itens,itens_edited)])
        india_crop_gdp_1997_2015[f] = le.transform(list(india_crop_gdp_1997_2015[f].values))
        RelationItens[f]=vet
#del RelationItens['Production']
RelationItens.keys()

dict_keys(['State_Name', 'District_Name', 'Season', 'Crop'])

In [23]:
#for i in RelationItens['Season'][0]:
#    print ('        <option value="{}">{}</option>'.format(i[1],i[0]))

# Now we can train the data!

## For the first algorithm (Classification problem)

In [24]:
# Select variables (Features)
X1 = india_crop_gdp_1997_2015[['Area', 'District_Name', 'Season']]
y1 = india_crop_gdp_1997_2015['Crop']

# Model 1 - Classification (Decision Tree Classifier)

- Algoritmo 1:  Objetivo: descobrir a CROP (Cultivo). A partir de 3 dados aleatórios (AREA, STATE, DISTRICT) inseridos pelo usuário, seja calculado o resultado de CROP

In [25]:
# Split Train and Test
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

x1_train, x1_test, y1_train, y1_test = train_test_split(X1, y1, 
                                                    test_size = 0.15,
                                                   random_state = 42)

model1= DecisionTreeClassifier(random_state=42)
model1.fit(x1_train, y1_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [26]:
# Predict and calculate scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

y1_pred = model1.predict(x1_test)
acc = accuracy_score(y1_test, y1_pred)
pre = precision_score(y1_test, y1_pred, average='micro')
rec = recall_score(y1_test, y1_pred, average='micro')
f1 = f1_score(y1_test, y1_pred, average='micro')


print('Accuracy: ', acc)
print('Precision: ', pre)
print('Recall: ', rec)
print('F1-Score: ', f1)

Accuracy:  0.408219103863033
Precision:  0.408219103863033
Recall:  0.408219103863033
F1-Score:  0.408219103863033


In [27]:
#------   Saving the model with pickle -----------------
# Define o nome do arquivo em disco que irá guardar o nosso modelo
filename = 'model_1_FindCrop.sav'
# salva o modelo no disco
pickle.dump(model1, open(path+filename, 'wb'))

# Model 2 - Regression (Decision Tree Regressor)

- Algoritmo 2: Objetivo: descobrir a PRODUCTION (Produção agrícola de um cultivo). A partir de 3 dados aleatórios (AREA, CROP e GDP) inseridos pelo usuário, seja calculado o resultado de PRODUCTION

In [103]:
# Select variables (Features)
india_crop_gdp_1997_2015_2 = india_crop_gdp_1997_2015[india_crop_gdp_1997_2015['Production'] != '=']
x2 = india_crop_gdp_1997_2015_2[['Area', 'Crop', 'GDP']]
y2 = india_crop_gdp_1997_2015_2['Production']

In [104]:
# Split Train and Test
x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, 
                                                    test_size = 0.30,
                                                   random_state = 42)

In [105]:
# Train the Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

model2 = DecisionTreeRegressor(random_state=42)
model2.fit(x2_train, y2_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [106]:
# Define o nome do arquivo em disco que irá guardar o nosso modelo
model_2_product = 'model2_P.sav'
# salva o modelo no disco
pickle.dump(model2, open(path+model_2_product, 'wb'))

In [107]:
# Calculate the metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import median_absolute_error


y2_pred = model2.predict(x2_test)
mae = mean_absolute_error(y2_test, y2_pred)
mse = mean_squared_error(y2_test, y2_pred)
r2 = r2_score(y2_test, y2_pred)
mad = median_absolute_error(y2_test, y2_pred)
print('MAE: ', mae)
print('MSE: ', mse)
print('r2: ', r2)
print('MAD: ', mad)

MAE:  315142.46064639505
MSE:  107231610858959.11
r2:  0.6209694669836765
MAD:  300.0


Metricas:
- MAE (Mean absolute error) represents the difference between the original and predicted values extracted by averaged the absolute difference over the data set.
- MSE (Mean Squared Error) represents the difference between the original and predicted values extracted by squared the average difference over the data set.
- RMSE (Root Mean Squared Error) is the error rate by the square root of MSE.
- R-squared (Coefficient of determination) represents the coefficient of how well the values fit compared to the original values. The value from 0 to 1 interpreted as percentages. The higher the value is, the better the model is.