# AgriResponse: simulation of RRM2 on the question bank

Developer: Mr. Samarth Godara

In [None]:
#for calculation of LD
!pip install distance

In [None]:
#for handling knowledge base
import pandas as pd
#for calculating LD
import distance
#for computing RRT
import datetime

#turning off warnings
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
#reading the knowledge base
#location of the processed KCC knowledge base on the mounted drive
kb_csv_path = "/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/kcc_dataset_processed.csv"
dataset = pd.read_csv(kb_csv_path)
#listing the available columns as a quick sanity check
print(dataset.columns)

In [None]:
#removing unwanted spaces/tabs at the ends of crop names if any
def strip_str(x):
  """Return the row's crop name with surrounding whitespace removed.

  Args:
    x: A row-like object indexable by ``'Crop'`` (in this notebook it is
      called through ``DataFrame.apply(..., axis=1)``).

  Returns:
    The stripped crop name, or ``""`` when the stored value has no
    ``strip`` method (for example a float NaN or ``None``).

  Raises:
    KeyError: If a mapping or Series row has no ``'Crop'`` entry.
  """
  crop = x['Crop']
  try:
    return crop.strip()
  except AttributeError:
    #non-string cells cannot be stripped; map them to an empty crop name
    return ""

#overwriting the Crop column with the value strip_str returns for each row
dataset['Crop']=dataset.apply(strip_str, axis=1)

In [None]:
#code of RRM2
def model_2(crop, problem, state, max_distance=1, max_ans_length=100, n_answers=5):
  """Retrieve stored answers for a crop problem reported in a given state (RRM2).

  Rows of the module-level ``dataset`` are kept when their ``StateName``
  equals *state*, their ``Crop`` equals *crop*, and at least one n-gram of
  their ``QueryText`` (n = number of words in *problem*) lies within
  *max_distance* Levenshtein edits of *problem*, compared case-insensitively.
  Among the surviving rows, answers (``KccAns``) shorter than
  *max_ans_length* characters are ranked longest first.

  Args:
    crop: Crop name, compared for exact equality with the ``Crop`` column.
    problem: Problem/disease phrase to look for in the stored queries.
    state: State name, compared for exact equality with ``StateName``.
    max_distance: Largest Levenshtein distance still counted as a match.
    max_ans_length: Exclusive upper bound on the answer length in characters.
    n_answers: Maximum number of answers returned.

  Returns:
    A NumPy array with at most *n_answers* answers, longest first. It can be
    empty when every matching answer is too long. When no row passes the
    state, crop and LD filters (or *problem* is not a string), a notice is
    printed and an empty list is returned.
  """
  no_answer_msg = "No answer found corresponding to the input disease..."

  #a non-string problem (e.g. NaN from an empty CSV cell) can never match
  if not isinstance(problem, str):
    print(no_answer_msg)
    return []

  #loop invariants of the LD filter, computed once per query instead of once per row
  problem_str = problem.lower()
  n_word = len(problem.split())

  #n-gram splitting function - RRM1
  def split_query(query_text):
    words = query_text.split()
    return [" ".join(words[i:(i+n_word)]) for i in range(len(words)-(n_word-1))]

  #LD calculation module - RRM1
  def l_match(query_text):
    #non-string query texts (e.g. NaN) are treated as non-matching
    if not isinstance(query_text, str):
      return False
    return any(
      distance.levenshtein(problem_str, gram.lower())<=max_distance
      for gram in split_query(query_text)
    )

  #selecting rows for the input state name (RRM2) and the input crop name
  in_scope = (dataset['StateName']==state) & (dataset['Crop']==crop)
  crp_dataset = dataset[in_scope]
  if crp_dataset.empty:
    #handled explicitly so that the LD filter never runs on an empty selection
    print(no_answer_msg)
    return []

  #LD-based filter
  match = crp_dataset['QueryText'].apply(l_match).astype(bool)
  #extracting the remaining answers
  ans_dataset = crp_dataset[match]

  if ans_dataset.empty:
    print(no_answer_msg)
    return []

  #answer-length based filter; assign/sort_values build new frames, so the
  #filtered slice of the knowledge base is never modified in place
  ans_dataset = ans_dataset.assign(AnsLength=ans_dataset["KccAns"].str.len())
  ans_dataset = ans_dataset[ans_dataset['AnsLength']<max_ans_length]
  ans_dataset = ans_dataset.sort_values(by=['AnsLength'], ascending=False)

  #returning the extracted answers
  return ans_dataset['KccAns'].head(n_answers).values

In [None]:
#reading the question bank for the simulation
#location of the question bank on the mounted drive
question_bank_path = "/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank.csv"
query_bank = pd.read_csv(question_bank_path)
#listing the available columns as a quick sanity check
print(query_bank.columns)

In [None]:
#simulation for RRM2

count = 1

#asking a question to the model and returning the retrieved answers
def ask_quest(x):
  """Run one question-bank row through ``model_2`` and time the call.

  Prints the running question number (module-level ``count``, incremented
  on every call) and the elapsed time.

  Args:
    x: A row providing the ``'Crop'``, ``'Problem'`` and ``'State'`` entries.

  Returns:
    A list of six items: five answer slots (unused slots hold
    ``"No answer"``) followed by the elapsed time in seconds.
  """
  global count
  #perf_counter is a monotonic, high-resolution clock intended for measuring
  #intervals; wall-clock timestamps can jump when the system clock is adjusted
  import time

  n_slots = 5

  print("\nQuestion #",count)
  count+=1

  #note the starting and ending time for RRT calculation
  start = time.perf_counter()
  answers = model_2(x['Crop'],x['Problem'],x['State'])
  elapsed = time.perf_counter()-start

  print("Time consumed : ", elapsed)

  #keep at most n_slots answers and pad the rest, so the time is always item 5
  record = list(answers)[:n_slots]
  record.extend(["No answer"]*(n_slots-len(record)))
  record.append(elapsed)

  #return the answers retrieved in simulation
  return record

#run the simulation: ask_quest is called once per row of query_bank and the
#values it returns are collected in ans_time
ans_time = query_bank.apply(ask_quest, axis=1)

In [None]:
#storing the simulation results in separate file - 5 answers and RRT
#building the result table in one step from the per-question records; growing
#it with DataFrame.append copied the whole frame on every iteration, and that
#method is no longer available in pandas 2.x
result_columns = ['Ans1','Ans2','Ans3','Ans4','Ans5','Time']
dummy = pd.DataFrame(
  [list(item[:len(result_columns)]) for item in ans_time],
  columns=result_columns,
)

#placing the answers and the response time next to the questions; dummy gets a
#default 0..n-1 index, which the column-wise concat aligns with query_bank's index
output = pd.concat([query_bank, dummy], axis=1)

In [None]:
#storing the simulation results
#output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_2.csv",index=False)

In [None]:
#reading the simulation results for AP, CWPS, and ARRT calculation
#NOTE: this rebinds `output` to the contents of the saved file; the to_csv call
#that would write this file is commented out in the preceding cell
output = pd.read_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_2.csv")

In [None]:
#calculating crop-weightages for calculating CWPS
#counting the knowledge-base rows of every crop in a single pass instead of
#filtering the whole dataset once per crop
kb_crop_counts = dataset['Crop'].value_counts()

#number of knowledge-base queries for each crop that appears in the results
#(0 for a crop that does not occur in the knowledge base)
crp_w = {crop: int(kb_crop_counts.get(crop, 0)) for crop in output.Crop.unique()}
total = sum(crp_w.values())

#turning the counts into fractions of the total; all weights are 0.0 when none
#of the crops occurs in the knowledge base, which avoids a division by zero
crp_w = {crop: (q_count/total if total else 0.0) for crop, q_count in crp_w.items()}

#calculating CWPS using crop weightages
def crop_w_score(x):
  """Return the crop-weighted score contributed by one result row.

  A row whose ``'Ans1'`` entry is ``'No answer'`` scores 0.0; any other row
  scores the weight stored for its ``'Crop'`` in the module-level ``crp_w``
  mapping, divided by 5.
  """
  answered = x['Ans1']!='No answer'
  return crp_w[x['Crop']]/5 if answered else 0.0

#storing the CWPS in dataframe: crop_w_score is evaluated for every row and
#the result is kept in a new Crop_w_score column
output['Crop_w_score']=output.apply(crop_w_score, axis=1)

#the reported crop-weighted score is the sum of the per-row scores
print("Crop-weighted score : ", output['Crop_w_score'].sum())

In [None]:
#storing the CWPS along with the simulation results
#(written to a separate "_qws" file, without the dataframe index)
output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_2_qws.csv",index=False)
#print("Crop-weighted score : ", output['Crop_w_score'].sum())

In [None]:
#printing the performance metrics - these figures are not final, as manual
#inspection of the retrieved answers is still required
unanswered_mask = output['Ans1']=='No answer'
no_ans = output[unanswered_mask]
#no_ans['Crop'].unique()
n_unanswered = no_ans.shape[0]
n_questions = output.shape[0]
print("Unanswered Queries : ",n_unanswered)
#share of questions for which the first answer slot is filled
print("Accuracy : ",(1-(n_unanswered/n_questions))*100,"%")
mean_rrt = output['Time'].mean()
print("Mean query response time : ", mean_rrt," Seconds")

In [None]:
#plotting histogram of the frequency distribution of the RRT
import matplotlib.pyplot as plt
#figure size and resolution used for the figures created from here on
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})
#distribution of the Time column of output, split into 50 bins
plt.hist(output['Time'], bins=50)
#setting the title and the y-axis label
#NOTE(review): the trailing ';' presumably suppresses the notebook's echo of the return value - confirm
plt.gca().set(title='Frequency Histogram', ylabel='Frequency');