#Import and Mount Statements

In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, chi2,f_classif, mutual_info_classif, SelectPercentile, SelectFpr, SelectFdr, SelectFwe, GenericUnivariateSelect

In [32]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
data = pd.read_csv("gdrive/My Drive/ASAG Work/All Data/SemEval_till_POSNew.csv",sep=',')

In [0]:
features = ['q_word_count', 'q_char_count',
       'q_avg_word', 'r_word_count', 'r_char_count', 'r_avg_word',
       's_word_count', 's_char_count', 's_avg_word', 'Row_no', 'q_nouns',
       'q_adjectives', 'q_verbs', 'q_nouns_vs_length',
       'q_adjectives_vs_length', 'q_verbs_vs_length', 'q_nouns_vs_words',
       'q_adjectives_vs_words', 'q_verbs_vs_words', 'r_nouns', 'r_adjectives',
       'r_verbs', 'r_nouns_vs_length', 'r_adjectives_vs_length',
       'r_verbs_vs_length', 'r_nouns_vs_words', 'r_adjectives_vs_words',
       'r_verbs_vs_words', 's_nouns', 's_adjectives', 's_verbs',
       's_nouns_vs_length', 's_adjectives_vs_length', 's_verbs_vs_length',
       's_nouns_vs_words', 's_adjectives_vs_words', 's_verbs_vs_words',
       'precision', 'recall', 'F1_score', 'how_flag', 'what_flag', 'why_flag',
       'who_flag', 'which_flag', 'when_flag', 'where_flag', 'whom_flag']

label = 'accuracy'

In [35]:
len(features)

48

In [0]:
def get_req_data(data,features,label,tr_ts,n_class,q_embed=None,r_embed=None,s_embed=None):
    indices = data[(data['train/test'].str.contains(tr_ts)) & (data['train/test'].str.contains(str(n_class)))].index.tolist()
    d = data.loc[indices,:].reset_index(drop=True)
    f = d.loc[:,features]
    y = d.loc[:,label]
    if q_embed == None:
        return f,y
    else:
        q = q_embed.iloc[indices,:].reset_index(drop=True)
        r = r_embed.iloc[indices,:].reset_index(drop=True)
        s = s_embed.iloc[indices,:].reset_index(drop=True)
        return f,q,r,s,y

In [0]:
f_train,y_train = get_req_data(data,features,label,'train',3)
f_test1,y_test1 = get_req_data(data,features,label,'unseen-answers',3)
f_test2,y_test2 = get_req_data(data,features,label,'unseen-questions',3)
f_test3,y_test3 = get_req_data(data,features,label,'unseen-domains',3)

In [0]:
df = pd.DataFrame()

In [0]:
def reorder(array, order):
  new_arr = list(range(len(order)))
  for i in range(len(array)):
    new_arr[order[i]-1] = array[i] 
  return new_arr

In [0]:
grid = pd.read_csv("gdrive/My Drive/ASAG Work/Results/3way_tree_grid_search.csv",sep=',')

#For 3 way, G1

In [0]:
g1 = grid.sort_values(by=['UA_m_f1','UA_w_f1','UQ_m_f1','UQ_w_f1','UD_m_f1','UD_w_f1'],ascending=False)
g1.iloc[:5,[0,1,2,3,6,15,9,18,12,21]]

In [0]:
for i in range(5):
  if(g1.iloc[i,0] == 'RFC'):
    model = RandomForestClassifier(n_estimators = g1.iloc[i,1], criterion = g1.iloc[i,2])
    print("Model Created")
  rfe = RFE(model,1)
  fit = rfe.fit(f_train, y_train)
  print('Model Fitted')
  df[str(g1.iloc[i,0])+ "," + str(g1.iloc[i,1])+ ","+ str(g1.iloc[i,2])] = reorder(features, fit.ranking_)

In [0]:
df

In [0]:
df.to_csv("gdrive/My Drive/ASAG Work/All Data/Feature Selection.csv", sep=',')

#For 3way, G2

In [0]:
g2 = grid.sort_values(by=['UD_m_f1','UD_w_f1','UQ_m_f1','UQ_w_f1','UA_m_f1','UA_w_f1'],ascending=False)
g2.iloc[:5,[0,1,2,3,6,15,9,18,12,21]]

In [0]:
for i in [2,3,4]:
  name = ""
  if(g2.iloc[i,0] == 'RFC'):
    model = RandomForestClassifier(n_estimators = g2.iloc[i,1], criterion = g2.iloc[i,2])
    name = str(g2.iloc[i,0])+ "," + str(g2.iloc[i,1])+ ","+ str(g2.iloc[i,2])
    print("RFC Model Created")
  elif(g2.iloc[i,0] == 'GBC'):
    model = GradientBoostingClassifier(learning_rate=g2.iloc[i,3], n_estimators= g2.iloc[i,1])
    name = str(g2.iloc[i,0])+ "," + str(g2.iloc[i,1])+ ","+ str(g2.iloc[i,3])
    print("GBC Model Created")
  rfe = RFE(model,1)
  fit = rfe.fit(f_train, y_train)
  print('Model Fitted')
  df[name] = reorder(features, fit.ranking_)

RFC Model Created
Model Fitted
GBC Model Created
Model Fitted
GBC Model Created
Model Fitted


In [0]:
df.head()

Unnamed: 0,"RFC,400,gini","RFC,200,entropy","RFC,200,gini","RFC,400,entropy","RFC,100,gini","GBC,100,0.1","GBC,400,0.01","RFC,300,gini","GBC,300,0.01","GBC,500,0.01"
0,Row_no,Row_no,Row_no,Row_no,Row_no,recall,recall,Row_no,recall,recall
1,F1_score,F1_score,F1_score,F1_score,F1_score,Row_no,Row_no,recall,Row_no,Row_no
2,recall,s_char_count,recall,s_char_count,s_char_count,r_avg_word,r_avg_word,s_char_count,r_char_count,r_avg_word
3,s_char_count,recall,s_char_count,recall,recall,r_char_count,r_char_count,F1_score,r_avg_word,r_char_count
4,s_avg_word,s_avg_word,s_avg_word,s_avg_word,s_avg_word,q_nouns_vs_words,F1_score,s_avg_word,F1_score,q_nouns_vs_length


In [0]:
df.to_csv("gdrive/My Drive/ASAG Work/All Data/Feature Selection.csv", sep=',')

#Select KBest

In [0]:
df = pd.read_csv("gdrive/My Drive/ASAG Work/All Data/Feature Selection.csv", sep=',')

In [85]:
score_functions = [f_classif, mutual_info_classif, chi2]
for funct in score_functions:
  model = SelectKBest(score_func=funct, k=48).fit(f_train, y_train)
  name = 'kBest' + str(funct).split(' ')[1]
  for i in range(len(model.scores_)):
    if np.isnan(model.scores_[i]):
      print("NAN")
      model.scores_[i] = 0
  print(model.scores_)
  order = sorted(model.scores_, reverse=True)
  index = [order.index(v)+1 for v in model.scores_]  
  df[name] = reorder(features, index)
  print(str(name), "Created")

  f = msb / msw


NAN
[  59.44834663   60.68569408   23.49878644  103.85285465  138.40070148
   60.88261709   67.1562406    62.25126954   15.80990604  185.06021031
   71.65227785   32.8018779    56.73828305    8.29470436    1.8530389
   27.65891649   12.50101757    4.27005082   45.87637238  142.22873446
   42.26445136   64.76335295    8.58193683    8.65281563   16.15067933
    8.12050943    6.07013842    2.75101287   36.27824742   98.09415726
   56.53581838   28.33672481   27.50726817   24.91413122   28.15766048
   32.51709772   18.69240002  130.93249716 1077.70047302  587.7738289
   55.90516729    8.30121628   11.08768956    8.98627379   22.08162134
    5.65254518    7.30348998    0.        ]
kBestf_classif Created
[0.08723203 0.14077854 0.15005881 0.04633506 0.12986324 0.14058948
 0.02576446 0.01130546 0.02499991 0.21624447 0.0497195  0.02023822
 0.03360305 0.14960457 0.10538416 0.14169419 0.10760673 0.08748445
 0.10411203 0.03721461 0.00218313 0.02153948 0.13453106 0.10995882
 0.13869122 0.07479549 0

In [86]:
df.head()


Unnamed: 0.1,Unnamed: 0,"RFC,400,gini","RFC,200,entropy","RFC,200,gini","RFC,400,entropy","RFC,100,gini","GBC,100,0.1","GBC,400,0.01","RFC,300,gini","GBC,300,0.01","GBC,500,0.01",kBestf_classif,kBestmutual_info_classif,kBestchi2
0,0,Row_no,Row_no,Row_no,Row_no,Row_no,recall,recall,Row_no,recall,recall,recall,Row_no,Row_no
1,1,F1_score,F1_score,F1_score,F1_score,F1_score,Row_no,Row_no,recall,Row_no,Row_no,F1_score,q_avg_word,q_char_count
2,2,recall,s_char_count,recall,s_char_count,s_char_count,r_avg_word,r_avg_word,s_char_count,r_char_count,r_avg_word,Row_no,q_nouns_vs_length,r_char_count
3,3,s_char_count,recall,s_char_count,recall,recall,r_char_count,r_char_count,F1_score,r_avg_word,r_char_count,r_nouns,q_verbs_vs_length,q_word_count
4,4,s_avg_word,s_avg_word,s_avg_word,s_avg_word,s_avg_word,q_nouns_vs_words,F1_score,s_avg_word,F1_score,q_nouns_vs_length,r_char_count,q_char_count,s_char_count


In [60]:
df

Unnamed: 0,"kBest,f_classif"
0,recall
1,F1_score
2,Row_no
3,r_nouns
4,r_char_count
5,precision
6,r_word_count
7,s_adjectives
8,q_nouns
9,s_word_count


In [0]:
index

TypeError: ignored