In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open('/content/agriculture_structured_dataset.json', 'r') as f:
    dataset = json.load(f)

In [3]:
dataset[:10]

[{'id': 1,
  'category': 'Government Schemes',
  'question': 'What government scheme is available for rice farmers in India?',
  'answer': 'The Government of India offers schemes like PM-KISAN and crop insurance for rice farmers to provide income support and risk coverage.'},
 {'id': 2,
  'category': 'Government Schemes',
  'question': 'What government scheme is available for wheat farmers in India?',
  'answer': 'The Government of India offers schemes like PM-KISAN and crop insurance for wheat farmers to provide income support and risk coverage.'},
 {'id': 3,
  'category': 'Government Schemes',
  'question': 'What government scheme is available for maize farmers in India?',
  'answer': 'The Government of India offers schemes like PM-KISAN and crop insurance for maize farmers to provide income support and risk coverage.'},
 {'id': 4,
  'category': 'Government Schemes',
  'question': 'What government scheme is available for millets farmers in India?',
  'answer': 'The Government of Indi

In [4]:
dataset = pd.DataFrame(dataset)

In [5]:
dataset.head(3)

Unnamed: 0,id,category,question,answer
0,1,Government Schemes,What government scheme is available for rice f...,The Government of India offers schemes like PM...
1,2,Government Schemes,What government scheme is available for wheat ...,The Government of India offers schemes like PM...
2,3,Government Schemes,What government scheme is available for maize ...,The Government of India offers schemes like PM...


In [6]:
dataset.category.unique()

array(['Government Schemes', 'Fertilizers', 'Pest Control',
       'Storage Tips'], dtype=object)

In [7]:
dataset.isnull().sum()

Unnamed: 0,0
id,0
category,0
question,0
answer,0


In [8]:
dataset.duplicated().sum()

np.int64(0)

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        400 non-null    int64 
 1   category  400 non-null    object
 2   question  400 non-null    object
 3   answer    400 non-null    object
dtypes: int64(1), object(3)
memory usage: 12.6+ KB


In [10]:
dataset.shape

(400, 4)

In [11]:
x=dataset['question']
y=dataset['answer']

In [12]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM,Dense,BatchNormalization,Dropout,Bidirectional,Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
max_len=max([len(i) for i in x])
vocab_size=5000

In [14]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [15]:
encoder=LabelEncoder()
y=encoder.fit_transform(y)
y=to_categorical(y,num_classes=len(encoder.classes_))

In [16]:
tokenizer=Tokenizer(oov_token='nothing',num_words=vocab_size)

tokenizer.fit_on_texts(x)
x=tokenizer.texts_to_sequences(x)

In [17]:
x=pad_sequences(x,padding='pre',maxlen=max_len)

In [24]:
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=128,input_length=max_len))
model.add(Bidirectional(LSTM(256,dropout=0.3,return_sequences=True)))
model.add(Bidirectional(LSTM(256,dropout=0.3)))
model.add(Dense(128,activation='relu'))
model.add(Dense(len(encoder.classes_),activation='softmax'))
model.build(input_shape=(None,max_len))
model.summary()

In [25]:
from tensorflow.keras.callbacks import EarlyStopping

model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

earlystop=EarlyStopping(
    restore_best_weights=True,
    monitor='val_loss',
    patience=10,
    verbose=1
)
history=model.fit(x,y,epochs=100,validation_split=0.2,callbacks=[earlystop])

Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 110ms/step - accuracy: 0.0240 - loss: 4.3844 - val_accuracy: 0.0000e+00 - val_loss: 4.4377
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.0163 - loss: 4.3651 - val_accuracy: 0.0000e+00 - val_loss: 4.6641
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0245 - loss: 4.2700 - val_accuracy: 0.0000e+00 - val_loss: 4.6174
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0418 - loss: 4.0017 - val_accuracy: 0.0000e+00 - val_loss: 4.8253
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0276 - loss: 3.7714 - val_accuracy: 0.0000e+00 - val_loss: 5.2494
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0461 - loss: 3.4231 - val_accuracy: 0.0000e+00 - val_loss: 4.6365
Epo

In [26]:
test = "What are the best storage practices for harvested tomato?"
test_seq = tokenizer.texts_to_sequences([test])
test_pad = pad_sequences(test_seq, padding='pre', maxlen=max_len)

y_pred = model.predict(test_pad)
pred_index = np.argmax(y_pred,axis=1)[0]

predicted_response = encoder.inverse_transform([pred_index])[0]
print("Predicted Response:", predicted_response)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 603ms/step
Predicted Response: After harvesting tomato, ensure proper drying and store in moisture-free containers or cold storage to prevent spoilage.


In [31]:
# model.save("model.keras")

In [32]:
# import joblib

# joblib.dump(tokenizer,"tokenizer.pkl")

In [33]:
# joblib.dump(encoder,'y_encoder.pkl')