In [35]:
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [36]:
# Column names as they appear in the header row of apy.csv.
# NOTE(review): col_names is not passed to read_csv below; the CSV's own
# header supplies the column labels.
col_names = (
    'State_Name',
    'District_Name',
    'Crop_Year',
    'Season',
    'Crop',
    'Area',
    'Production',
)

# Load the crop production records and preview the first five rows.
data = pd.read_csv("apy.csv")
data.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [37]:
# Summary statistics for the numeric columns (Crop_Year, Area, Production).
data.describe()

Unnamed: 0,Crop_Year,Area,Production
count,246091.0,246091.0,242361.0
mean,2005.643018,12002.82,582503.4
std,4.952164,50523.4,17065810.0
min,1997.0,0.04,0.0
25%,2002.0,80.0,88.0
50%,2006.0,582.0,729.0
75%,2010.0,4392.0,7023.0
max,2015.0,8580100.0,1250800000.0


In [38]:
# Count the missing entries in each column.
data.isna().sum()


State_Name          0
District_Name       0
Crop_Year           0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [39]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints a two-line report (total number of columns, number of columns with
    at least one null) and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column name,
        with the columns 'Missing Values' (null count) and '% of Total Values'
        (null count as a percentage of the row count). Rows are sorted by
        percentage, highest first, and values are rounded to one decimal.
    """
    # Count nulls once and reuse the result for both output columns.
    null_counts = df.isnull().sum()

    total_rows = len(df)
    if total_rows:
        null_percent = 100 * null_counts / total_rows
    else:
        # A frame without rows has no missing values; avoid 0/0 -> NaN.
        null_percent = null_counts.astype(float)

    summary = pd.concat([null_counts, null_percent], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Select on the count rather than the percentage: a NaN percentage
    # compares unequal to 0 and would otherwise be kept.
    summary = (
        summary[summary['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {summary.shape[0]} columns that have missing values.")
    return summary

In [40]:
# Report which columns of the loaded frame contain nulls and what share of rows they affect.
missing_values_table(data)

Your selected dataframe has 7 columns.
There are 1 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
Production,3730,1.5


In [41]:
# Discard every row that contains a null (only Production has any), renumber
# the index from zero, then confirm that no column has nulls left.
data = data.dropna().reset_index(drop=True)
data.isna().sum()


State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [42]:
# Replace the absolute Production figure with production per unit of Area;
# this ratio is the last column and becomes the regression target.
data = (
    data
    .assign(P_in_tonnes_per_hectar=data['Production'] / data['Area'])
    .drop(columns=['Production'])
)

In [44]:
# Inspect the frame now that Production has been replaced by the per-area yield column.
data

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,P_in_tonnes_per_hectar
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,1.594896
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,0.500000
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,3.147059
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,3.642045
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,0.229167
...,...,...,...,...,...,...,...
242356,West Bengal,PURULIA,2014,Summer,Rice,306.0,2.617647
242357,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,0.738437
242358,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0,50.154321
242359,West Bengal,PURULIA,2014,Winter,Rice,279151.0,2.141848


In [45]:
# Turn the five categorical columns ('State_Name', 'District_Name', 'Crop_Year',
# 'Season', 'Crop') into integer codes. Each column gets its own LabelEncoder so
# that the mapping can be saved and reapplied later.
from sklearn.preprocessing import LabelEncoder

stateEncoder, districtEncoder, yearEncoder, seasonEncoder, cropNameEncoder = (
    LabelEncoder() for _ in range(5)
)

# assign() overwrites existing columns in place, so the column order is unchanged.
data = data.assign(**{
    column: encoder.fit_transform(data[column])
    for column, encoder in (
        ('State_Name', stateEncoder),
        ('District_Name', districtEncoder),
        ('Crop_Year', yearEncoder),
        ('Season', seasonEncoder),
        ('Crop', cropNameEncoder),
    )
})

In [46]:
# Inspect the frame after label encoding: the five categorical columns now hold integer codes.
data

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,P_in_tonnes_per_hectar
0,0,427,3,1,2,1254.0,1.594896
1,0,427,3,1,74,2.0,0.500000
2,0,427,3,1,95,102.0,3.147059
3,0,427,3,4,7,176.0,3.642045
4,0,427,3,4,22,720.0,0.229167
...,...,...,...,...,...,...,...
242356,32,471,17,3,95,306.0,2.617647
242357,32,471,17,3,102,627.0,0.738437
242358,32,471,17,4,106,324.0,50.154321
242359,32,471,17,5,95,279151.0,2.141848


In [59]:
# Persist the classes learned by each encoder so the same label -> integer
# mapping can be rebuilt in a later phase (e.g. at prediction time).
import numpy as np

# To restore an encoder later:
#   encoder = LabelEncoder()
#   encoder.classes_ = np.load('stateEncoderClasses.npy', allow_pickle=True)
# allow_pickle=True is needed because the classes of the string-valued columns
# are stored as an object-dtype array.

# Each file receives the classes_ array of its own encoder.
np.save('stateEncoderClasses.npy', stateEncoder.classes_)
np.save('districtEncoderClasses.npy', districtEncoder.classes_)
np.save('yearEncoderClasses.npy', yearEncoder.classes_)
np.save('seasonEncoderClasses.npy', seasonEncoder.classes_)
np.save('cropNameEncoderClasses.npy', cropNameEncoder.classes_)

In [60]:
# Separate the features (X) from the regression target (Y).

# Work on the underlying ndarray: the first six columns are the inputs and
# the seventh column (index 6) is the yield to predict.
dataset = data.values
X, Y = dataset[:, :6], dataset[:, 6]

In [61]:
# Confirm the split: X should have six feature columns and Y should be one-dimensional.
print("X shape = ",X.shape)
print("Y shape = ",Y.shape)

X shape =  (242361, 6)
Y shape =  (242361,)


In [64]:
# Import the keras modules
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from keras.layers import Input, Dense
from keras.models import Model


def build_model(input_dim=6, learning_rate=0.001):
    """Build and compile a small fully connected regression network.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(1). The model is
    compiled with mean-squared-error loss and the RMSprop optimizer, and tracks
    MAE and MSE as metrics.

    Parameters
    ----------
    input_dim : int, default 6
        Number of feature values in each input row.
    learning_rate : float, default 0.001
        Learning rate passed to the RMSprop optimizer.

    Returns
    -------
    The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(input_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1),  # single output unit, no activation specified
    ])

    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(learning_rate),
        metrics=['mae', 'mse'],
    )
    return model

# Instantiate the network with its default settings and print the layer-by-layer summary.
model = build_model()
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 128)               896       
_________________________________________________________________
dense_11 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
Total params: 9,217
Trainable params: 9,217
Non-trainable params: 0
_________________________________________________________________


In [65]:
# Keras takes the validation_split fraction from the *end* of the arrays,
# before any shuffling. The rows of X/Y keep the CSV's state-by-state order, so
# an unshuffled split would validate only on the last states. Shuffle once with
# a fixed seed so the held-out 20% is a random sample and the split is repeatable.
shuffled_rows = np.random.RandomState(42).permutation(len(X))

# NOTE(review): X still holds raw label-encoded ids and unscaled Area values;
# consider standardising the inputs before training.

# Keep the History object so the per-epoch loss/metric values can be inspected.
history = model.fit(X[shuffled_rows], Y[shuffled_rows], epochs=10, validation_split=0.2)

Train on 193888 samples, validate on 48473 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f0529d0888>