# AgriResponse: simulation of RRM1 on the question bank

Developer: Mr. Samarth Godara

In [None]:
#for calculation of LD
!pip install distance

In [None]:
#calculation of response-retrieval time
import datetime
import time

#calculation of LD
import distance
#for handling the knowledge base
import pandas as pd
#turning off the warnings
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
#load the processed KCC knowledge base
dataset = pd.read_csv(
  "/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/kcc_dataset_processed.csv"
)
#list the columns available in the knowledge base
print(dataset.columns)

In [None]:
#string cleaning to remove any extra tabs/spaces at the ends of the crop names
def strip_str(x):
  """Return the row's 'Crop' value with surrounding whitespace removed.

  Args:
    x: a mapping-like row (e.g. a DataFrame row) holding a 'Crop' entry.

  Returns:
    The stripped crop name, or an empty string when the entry is missing
    or is not a string (for example a NaN read from an empty CSV cell).
  """
  try:
    return x['Crop'].strip()
  #only the failures a malformed row can cause are absorbed; interrupts and
  #other unrelated errors are no longer silently swallowed
  except (AttributeError, KeyError, TypeError):
    return ""

#replace the crop column with its cleaned version, one row at a time
dataset['Crop'] = dataset.apply(strip_str, axis="columns")

In [None]:
#code of RRM1
def model_1(crop, problem):
  """Retrieve up to five knowledge-base answers for a crop problem (RRM1).

  Rows of the global ``dataset`` are kept when their 'Crop' equals ``crop``
  and at least one n-gram of their 'QueryText' (n = number of words in
  ``problem``) lies within a Levenshtein distance of 1 of ``problem``,
  compared case-insensitively. From those rows, answers shorter than 100
  characters are returned, longest first.

  Args:
    crop: crop name, compared for exact equality with the 'Crop' column.
    problem: problem/disease phrase to look for in the stored queries.

  Returns:
    An array with at most five 'KccAns' values (possibly empty when every
    matching answer has 100 characters or more), or an empty list when no
    stored query matches; a message is printed in the latter case.

  Raises:
    KeyError: if ``dataset`` lacks the 'QueryText' or 'KccAns' column.
  """

  def no_answer():
    print("No answer found corresponding to the input disease...")
    return []

  #a missing (NaN) or blank problem has no words to build n-grams from;
  #without this guard an empty problem would match every stored query
  if not isinstance(problem, str) or not problem.split():
    return no_answer()

  #parts of the comparison that do not change from row to row
  n_word = len(problem.split())
  problem_str = problem.lower()

  #n-gram splitting function
  def split_query(query):
    words = query.split()
    return [" ".join(words[i:i+n_word]) for i in range(len(words)-n_word+1)]

  #LD filter
  def l_match(query):
    #rows without a text query (e.g. NaN) can never match
    if not isinstance(query, str):
      return False
    return any(
      distance.levenshtein(problem_str, gram.lower())<2
      for gram in split_query(query)
    )

  #crop name-based filter
  crp_dataset = dataset[dataset['Crop']==crop]
  if crp_dataset.empty:
    return no_answer()

  #LD-based filter
  match = crp_dataset['QueryText'].map(l_match)
  #extracting the matching rows; copied so that the helper column added
  #below is never written into a view of the knowledge base
  ans_dataset = crp_dataset[match].copy()

  if ans_dataset.empty:
    return no_answer()

  #length-based filtering
  ans_dataset["AnsLength"] = ans_dataset["KccAns"].str.len()
  ans_dataset = ans_dataset[ans_dataset['AnsLength']<100]
  #length-based sorting
  ans_dataset = ans_dataset.sort_values(by=['AnsLength'], ascending=False)

  #answer output
  return ans_dataset['KccAns'].head(5).values

In [None]:
#load the question bank used to drive the simulation
query_bank = pd.read_csv(
  "/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank.csv"
)

#list the columns available in the question bank
print(query_bank.columns)

In [None]:
#simulation code

#running number of the question being asked, shown in the progress output
count = 1

#ask question from the model, and record the responses
def ask_quest(x):
  """Ask RRM1 one question-bank query and record its answers and timing.

  Args:
    x: question-bank row providing the 'Crop' and 'Problem' entries.

  Returns:
    A list holding the retrieved answers, padded with "No answer" up to
    five slots, followed by the response-retrieval time in seconds.
  """
  global count
  print("\nQuestion #",count)
  count+=1

  #note the starting and ending time - RRT; perf_counter is monotonic, so
  #the measurement is not disturbed by wall-clock adjustments
  start = time.perf_counter()
  answers = model_1(x['Crop'],x['Problem'])
  elapsed = time.perf_counter()-start

  print("Time consumed : ", elapsed)

  record = list(answers)
  #pad to five answer slots when the model returned fewer
  record.extend(["No answer"]*(5-len(record)))
  record.append(elapsed)

  #return the retrieved answers corresponding to the input query
  return record

#run the simulation: every question-bank row is passed to ask_quest
ans_time = query_bank.apply(ask_quest, axis="columns")

In [None]:
#sample record stored during the simulation - a list of 5 answers followed by the RRT in seconds
ans_time[0]

In [None]:
#storing the results in a separate file

#build the answer/time table in one step: DataFrame.append was removed in
#pandas 2.0 and re-copied the whole frame on every loop iteration
dummy = pd.DataFrame(
  ans_time.tolist(),
  columns=['Ans1','Ans2','Ans3','Ans4','Ans5','Time'],
  #keep the question-bank row labels so that concat aligns each record
  #with the query it belongs to
  index=ans_time.index,
)

#question-bank columns followed by the retrieved answers and the RRT
output = pd.concat([query_bank, dummy], axis=1)

In [None]:
#display the combined table: question-bank columns followed by the retrieved answers and the RRT
output

In [None]:
#saving the simulation results
#output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_1.csv",index=False)

In [None]:
#reload the stored simulation results; AP and CWPS are computed from this file
output = pd.read_csv(
  "/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_1.csv"
)

In [None]:
#calculation of crop weights
#number of knowledge-base queries per crop, counted in a single pass
#instead of re-scanning the whole dataset once for every crop
crop_counts = dataset['Crop'].value_counts().to_dict()

#query count of every crop that appears in the simulation results
crp_w = {crop: int(crop_counts.get(crop, 0)) for crop in output.Crop.unique()}
total = sum(crp_w.values())

#normalise the counts into weights; when none of the crops occurs in the
#knowledge base every weight is 0.0 instead of a division by zero
crp_w = {
  crop: (q_count/total if total else 0.0)
  for crop, q_count in crp_w.items()
}

crp_w_dataset = pd.DataFrame.from_dict(crp_w, orient='index', columns=['Weight'])

#storing the weight of each crop in a separate file
crp_w_dataset.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/crop_weightage.csv",index=True)

In [None]:
#calculation of CWPS corresponding to each query
def crop_w_score(x):
  """Return the crop-weighted score contributed by one simulated query.

  Args:
    x: result row providing the 'Ans1' and 'Crop' entries.

  Returns:
    0.0 when the first answer slot holds 'No answer'; otherwise the
    weight stored in ``crp_w`` for the row's crop, divided by 5.
  """
  #unanswered queries contribute nothing
  if x['Ans1'] == 'No answer':
    return 0.0
  return crp_w[x['Crop']] / 5

#attach the per-query score to the results and print the overall CWPS
output['Crop_w_score'] = output.apply(crop_w_score, axis="columns")
print("Crop-weighted score : ", output['Crop_w_score'].sum())

In [None]:
#storing the CWPS corresponding to each query along with the simulation results
#output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_1.csv",index=False)
#print("Crop-weighted score : ", output['Crop_w_score'].sum())

In [None]:
#queries for which the first answer slot holds the 'No answer' placeholder
no_ans = output[output['Ans1'].eq('No answer')]
print("Unanswered Queries : ",len(no_ans))
#percentage of queries that are not in the unanswered set
print("Accuracy : ",(1-(len(no_ans)/len(output)))*100,"%")
#average of the recorded response-retrieval times
print("Mean query response time : ", output['Time'].mean()," Seconds")

In [None]:
#printing the frequency distribution histogram of the RRT
import matplotlib.pyplot as plt

#figure size and resolution used for the plot
plt.rcParams['figure.figsize'] = (7,5)
plt.rcParams['figure.dpi'] = 100

#distribution of the recorded RRT values over 50 bins
plt.hist(output['Time'], bins=50)
#the trailing semicolon keeps the notebook from echoing the return value
plt.gca().set(title='Frequency Histogram', ylabel='Frequency');