# AgriResponse: simulation of RRM3 on the question bank

Developer: Mr. Samarth Godara

In [None]:
#for calculation of LD
!pip install distance

In [None]:
#for handling the knowledge base
import pandas as pd
#for calculation of LD
import distance
#for calculation of RRT
import datetime
#calculation of other mathematical operations
import math
#turning off the warnings
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
#skeleton of the geo-matrix
#load the geolocation recorded for every state
state_geo = pd.read_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/state_geo.csv")
#the matrix starts with the state-name column only; one distance column per
#state is added to it afterwards
state_geo_mat = state_geo[['StateName']].copy()

#euclidean distance between two "x,y" geolocation strings
def geo_dist(geo_loc, st_geo):
  """Return the straight-line distance between two coordinate strings.

  geo_loc, st_geo -- strings whose first two comma-separated fields parse
                     as floats; any further fields are ignored
  """
  src = geo_loc.split(',')
  src_x, src_y = float(src[0]), float(src[1])
  dst = st_geo.split(',')
  dst_x, dst_y = float(dst[0]), float(dst[1])
  dx = src_x - dst_x
  dy = src_y - dst_y
  return math.sqrt((dx**2) + (dy**2))

#filling the geo-matrix: one distance column per state
for state in state_geo['StateName']:
  #geolocation of the first row carrying this state name
  st_geo = state_geo.loc[state_geo['StateName']==state, 'Geolocation'].iloc[0]
  #distance from every state (one per row) to the current state
  state_dist = state_geo['Geolocation'].apply(lambda loc: geo_dist(loc, st_geo))
  state_geo_mat[state] = state_dist

#print(state_geo_mat)

In [None]:
#reading the knowledge base (the processed KCC dataset file) into a dataframe
dataset = pd.read_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/kcc_dataset_processed.csv")
#list the columns of the loaded file
print(dataset.columns)

In [None]:
#module to remove unwanted spaces/tabs from the ends of crop names
def strip_str(x):
  """Return the 'Crop' value of a row with surrounding whitespace removed.

  x -- a row-like mapping (e.g. a dataframe row) holding a 'Crop' entry

  Returns "" when the entry is missing or is not a string (e.g. NaN or
  None), so the cleaned column contains strings only.
  """
  try:
    return x['Crop'].strip()
  except (AttributeError, KeyError, TypeError):
    #only the expected "no usable crop name" failures are absorbed here; a
    #bare except would also swallow KeyboardInterrupt and unrelated errors
    return ""

#clean the crop column row by row: stripped crop name, or "" where strip_str finds no usable name
dataset['Crop']=dataset.apply(strip_str, axis=1)

In [None]:
#code of RRM3
def model_3(crop, problem, state):
  """Answer a query with RRM2, switching states until some answers are found.

  crop, problem -- passed unchanged to model_2
  state         -- name of the querying state; used as the geo-matrix column
                   to sort by, so it must be a column of state_geo_mat

  Returns the result of model_2 for the first state that produced at least
  one answer, or an empty result when no state produced any.
  """
  #states in increasing order of distance from the input state, i.e. nearest
  #first (sort_values sorts ascending by default)
  st_names = state_geo_mat.sort_values(state)['StateName']
  #default result, so an empty state list cannot leave 'answers' undefined
  answers = []
  #iteration of state switching
  for st in st_names:
    answers = model_2(crop, problem, st)
    if len(answers)!=0:
      print("Answers found in :", st)
      break
    print("No Answers found in :", st)

  #returning the obtained answers
  return answers

In [None]:
#code of RRM2 - used by RRM3 internally
def model_2(crop, problem, state):
  """Retrieve up to five stored answers for a crop problem within one state.

  crop    -- crop name, compared for equality with the 'Crop' column
  problem -- problem phrase searched for inside the 'QueryText' column
  state   -- state name, compared for equality with the 'StateName' column

  A row matches when some run of consecutive words of its query text (as
  many words as the problem phrase has) is within Levenshtein distance 1 of
  the problem, ignoring case.

  Returns the 'KccAns' values of the matching rows whose answer is shorter
  than 100 characters, longest first and at most five, or an empty list
  when no row matches.
  """
  #a non-text problem (e.g. NaN from a blank cell) cannot match any query
  if not isinstance(problem, str):
    return []

  #invariants of the search, computed once instead of once per row
  problem_str = problem.lower()
  n_word = len(problem.split())

  #n-gram splitting module: every run of n_word consecutive words of the text
  def split_query(text):
    words = text.split()
    return [" ".join(words[i:(i+n_word)]) for i in range(len(words)-(n_word-1))]

  #LD calculating module
  def l_match(x):
    try:
      word_list = split_query(x['QueryText'])
    except (AttributeError, KeyError, TypeError):
      #row without a usable query text (e.g. NaN) - treated as "no match"
      return False
    return any(distance.levenshtein(problem_str, word.lower())<2 for word in word_list)

  #state based filter
  st_dataset = dataset[dataset['StateName']==state]
  #crop based filter
  crp_dataset = st_dataset[st_dataset['Crop']==crop]
  #nothing to search in; returning here also avoids relying on what
  #apply(axis=1) produces for a frame without rows
  if crp_dataset.shape[0]==0:
    return []

  #LD based filter
  match = crp_dataset.apply(l_match, axis=1)
  ans_dataset = crp_dataset[match]

  if ans_dataset.shape[0]==0:
    return []

  #answer-length based filter; assign() and a non-inplace sort build new
  #frames instead of writing into a slice of the knowledge base
  ans_dataset = ans_dataset.assign(AnsLength=ans_dataset["KccAns"].str.len())
  ans_dataset = ans_dataset[ans_dataset['AnsLength']<100]
  ans_dataset = ans_dataset.sort_values(by=['AnsLength'], ascending=False)

  #returning the retrieved answers (at most five)
  return ans_dataset['KccAns'].head(5).values

In [None]:
#reading the question bank for the simulation
query_bank = pd.read_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank.csv")
#list the columns of the bank; the simulation reads 'Crop', 'Problem' and 'State' from each row
print(query_bank.columns)

Index(['Crop', 'Problem', 'State'], dtype='object')


In [None]:
#simulation code

#running number of the question being asked, shared across calls
count = 1

#put one question-bank row to RRM3 and time the retrieval
def ask_quest(x):
  """Ask RRM3 a single question and return its answers with the time taken.

  x -- row with 'Crop', 'Problem' and 'State' entries

  Returns a list holding the retrieved answers, padded with "No answer" up
  to five entries, followed by the elapsed time in seconds.
  """
  global count
  print("\nQuestion #",count)
  count = count + 1

  #timestamps taken right before and after the model call give the RRT
  started = datetime.datetime.now()
  answers = model_3(x['Crop'],x['Problem'],x['State'])
  finished = datetime.datetime.now()
  elapsed = (finished - started).total_seconds()

  print("Time consumed : ", elapsed)

  #answers first, then fillers for the empty slots, then the response time
  record = list(answers)
  record.extend(["No answer"] * (5 - len(answers)))
  record.append(elapsed)

  return record

#run simulation: every row of the question bank is passed to ask_quest in turn;
#ans_time collects the record returned for each question
ans_time = query_bank.apply(ask_quest, axis=1)

In [None]:
#saving the simulation results in a dataframe - 5 answers and the RRT
#each item of ans_time is [Ans1, Ans2, Ans3, Ans4, Ans5, Time]; the table is
#built in a single step because DataFrame.append was removed in pandas 2.0
#(and copied the whole frame on every call before that)
result_columns = ['Ans1','Ans2','Ans3','Ans4','Ans5','Time']
dummy = pd.DataFrame([list(item[:6]) for item in ans_time], columns=result_columns)

#question bank columns side by side with the answers and the response time
output = pd.concat([query_bank, dummy], axis=1)

In [None]:
#saving the simulation results in a file
#output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_3.csv",index=False)

In [None]:
#reading the simulation results for calculation of metrics
output = pd.read_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_3.csv")

#calculate crop weightages: for every crop of the question bank, the number
#of knowledge-base rows carrying that crop
crp_w = {}
for crop in output.Crop.unique():
  crp_w[crop] = dataset[dataset['Crop']==crop].shape[0]
total = sum(crp_w.values())

#turn the counts into shares of the total; when none of the bank's crops
#occurs in the knowledge base every weight becomes 0.0 instead of the
#division raising ZeroDivisionError
crp_w = {crop: (q_count/total if total else 0.0) for crop, q_count in crp_w.items()}

#calculate CWPS for each question in the bank
def crop_w_score(x):
  """Return the crop-weighted score of one simulated question.

  x -- row with 'Crop' and 'Ans1' entries

  A question whose first answer is 'No answer' scores 0.0; any other
  question scores one fifth of the weight stored for its crop in crp_w.
  """
  answered = x['Ans1']!='No answer'
  return crp_w[x['Crop']]/5 if answered else 0.0

#score every question of the bank with crop_w_score
output['Crop_w_score']=output.apply(crop_w_score, axis=1)

#report the total of the per-question scores
print("Crop-weighted score : ", output['Crop_w_score'].sum())

#save simulation results with CWPS
output.to_csv("/content/drive/MyDrive/Research Backups/Project - NLP on KCC/Dataset/Question Bank_model_3_qws.csv",index=False)
#print("Crop-weighted score : ", output['Crop_w_score'].sum())

In [None]:
#accuracy summary of the simulation
#a question counts as unanswered when its first answer slot holds the filler text
no_ans = output[output['Ans1']=='No answer']
print("Unanswered Queries : ",len(no_ans))
#accuracy = percentage of questions that received at least one answer
print("Accuracy : ",(1-(len(no_ans)/len(output)))*100,"%")
#average of the recorded response times
print("Mean query response time : ", output['Time'].mean()," Seconds")

In [None]:
#display the frequency distribution of the RRT
import matplotlib.pyplot as plt
#figure of 7x5 inches at 100 dpi
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})
#histogram of the recorded response times, split into 50 bins
plt.hist(output['Time'], bins=50)
#title and y-axis label of the plot
plt.gca().set(title='Frequency Histogram', ylabel='Frequency');