# Importing libraries and dataset

In [1]:
import pandas as pd
import numpy as np
import re
import math
import nltk
import itertools
from heapq import nlargest
from sklearn.model_selection import KFold

trdf = pd.read_csv("qc_train.txt", delimiter = ":",header=None, names=['Class', 'Question'])

# Removing the fine grained categories label from Questions, and cleaning the questions
ques = []
for i in range(len(trdf)):
    temp = trdf['Question'][i]
    temp = temp.split(" ",1)[1]
    temp = re.sub(r"[^a-zA-Z0-9.'\s]", ' ',temp)
    ques.append(temp)

trdf['Question'] = ques

tsdf = pd.read_csv("qc_test.txt", delimiter = ":",header=None, names=['Class', 'Question'])

# Removing the fine grained categories label from Questions, and cleaning the questions
ques = []
for i in range(len(tsdf)):
    temp = tsdf['Question'][i]
    temp = temp.split(" ",1)[1]
    temp = re.sub(r"[^a-zA-Z0-9.'\s]", ' ',temp)
    ques.append(temp)

tsdf['Question'] = ques

In [2]:
trdf.head()

Unnamed: 0,Class,Question
0,DESC,How did serfdom develop in and then leave Russ...
1,ENTY,What films featured the character Popeye Doyle
2,DESC,How can I find a list of celebrities real names
3,ENTY,What fowl grabs the spotlight after the Chines...
4,ABBR,What is the full form of .com


In [3]:
tsdf.head()

Unnamed: 0,Class,Question
0,NUM,How far is it from Denver to Aspen
1,LOC,What county is Modesto California in
2,HUM,Who was Galileo
3,DESC,What is an atom
4,NUM,When did Hawaii become a state


In [4]:
# Getting total dataset
new_index = [i for i in range(5452,5952)]
ts = tsdf.copy()
ts.index = new_index
frames = [trdf, ts]
total_df = pd.concat(frames)
len(total_df)

5952

## Get class distribution

In [5]:
def get_dist_cls(df):
    my_list = list(df['Class'])
    freq_class = {}
    for item in my_list:
        if (item in freq_class):
            freq_class[item] += 1
        else:
            freq_class[item] = 1

    sm = sum(freq_class.values())
    for key in freq_class:
        freq_class[key] = round(freq_class[key]/sm,2)
    freq_class = sorted(freq_class.items(), key=lambda item: item[0])
    return (freq_class)

print("Training Data class distribution",get_dist_cls(trdf))
print("Testing Data class distribution",get_dist_cls(tsdf))

Training Data class distribution [('ABBR', 0.02), ('DESC', 0.21), ('ENTY', 0.23), ('HUM', 0.22), ('LOC', 0.15), ('NUM', 0.16)]
Testing Data class distribution [('ABBR', 0.02), ('DESC', 0.28), ('ENTY', 0.19), ('HUM', 0.13), ('LOC', 0.16), ('NUM', 0.23)]


# Function to get features and scores

In [6]:
def create_features(trdf):
    # Creating lexical features i.e. ngrams for each question
    lex_f_1gram = []
    lex_f_2gram = []
    lex_f_3gram = []

    # Creating a count hashmap/dict. for all ngrams
    lex_f1_dic = {}
    lex_f2_dic = {}
    lex_f3_dic = {}

    # length feature
    length_q = []

    # syntactical features i.e part of speech for each question
    pos_lol = []

    # Creating a count hashmap/dict. for all pos
    pos_dic = {}


    # Iterating for all training questions

    for i in range(len(trdf)):
        
        tokens = nltk.word_tokenize(trdf.loc[i, "Question"])  # generating tokens from question using the word_tokenize() in nltk
        
        # creating list of ngrams respectively
        
        sequences_1 = [tokens[i:] for i in range(1)]
        grams_1 = zip(*sequences_1)
        
        sequences_2 = [tokens[i:] for i in range(2)]
        grams_2 = zip(*sequences_2)
        
        sequences_3 = [tokens[i:] for i in range(3)]
        grams_3 = zip(*sequences_3)
        
        # appending the list of ngrams in respective list of lists
        lex_f_1gram.append(list(grams_1))
        lex_f_2gram.append(list(grams_2)) 
        lex_f_3gram.append(list(grams_3))
        
        pos_list = nltk.pos_tag(tokens)      # getting the tag for each token using pos_tag() function in nltk library
        pos_lol.append(pos_list)             # appending the list of tags in pos list of lists
        
        length_q.append(len(lex_f_1gram[i])) # appending the length of a each question
        
        
        # Making dictionary of lexical ngrams and syntactic part of speech
        for pos in pos_list:
            if pos[1] in pos_dic: 
                pos_dic[pos[1]] += 1
            else:
                pos_dic[pos[1]] = 1
        
        for word in lex_f_1gram[i]:
            if word in lex_f1_dic.keys():
                lex_f1_dic[word] += 1
            else:
                lex_f1_dic[word] = 1
                
        for word in lex_f_2gram[i]:
            if word in lex_f2_dic.keys():
                lex_f2_dic[word] += 1
            else:
                lex_f2_dic[word] = 1
            
        for word in lex_f_3gram[i]:
            if word in lex_f3_dic.keys():
                lex_f3_dic[word] += 1
            else:
                lex_f3_dic[word] = 1

    # Getting the total count of items in lexical features for each ngram and syntactic features
    values = pos_dic.values() 
    len_synt=sum(values)

    values = lex_f1_dic.values() 
    len_1gram=sum(values)

    values = lex_f2_dic.values() 
    len_2gram=sum(values)

    values = lex_f3_dic.values() 
    len_3gram=sum(values)

    # getting most frequent ngrams (500,300,200) for 1gram, 2gram, 3gram respectively
    lex_f1_500 = nlargest(500, lex_f1_dic, key = lex_f1_dic.get)
    lex_f2_300 = nlargest(300, lex_f2_dic, key = lex_f2_dic.get)
    lex_f3_200 = nlargest(200, lex_f3_dic, key = lex_f3_dic.get)

    # Marking the presence or absence of ngrams in dictionary to make furthur calculations fast
    gram1 = {}
    gram2 = {}
    gram3 = {}

    for key in lex_f1_dic:
        if key in lex_f1_500:
            gram1[key] = 1
        else:
            gram1[key] = 0

    for key in lex_f2_dic:
        if key in lex_f2_300:
            gram2[key] = 1
        else:
            gram2[key] = 0

    for key in lex_f3_dic:
        if key in lex_f3_200:
            gram3[key] = 1
        else:
            gram3[key] = 0

    # list of syntactic tags for 1gram present in most frequent 500 1grams and list of their corresponding scores
    Syntactic = []
    synt_scores = []

    # lists of lexical ngrams which are present in most frequent 500,300,200 ngrams respectively and lists of their scores
    Lexical_1gram = []
    Lexical_2gram = []
    Lexical_3gram = []

    lex_scores_1gram = []
    lex_scores_2gram = []
    lex_scores_3gram = []



    # Iterating for all training questions to compute above stated lists
    for i in range(len(trdf)):
        
        score_si = 0     # var to store syntactic prob score
        score_li = 0     # var to store lexical prob score
        
        # initialising empty lists to store 1gram and pos tags
        temp1 = []
        temp_S =[]
        
        for key in lex_f_1gram[i]:
            
            if gram1[key] != 0:      # checking weather given key(1gram) is present in most frequent 500 or not
                temp1.append(key[0])
                
                # computing probability score for 1gram
                score_li += math.log(lex_f1_dic[key]/len_1gram)
                
                # iterating over each questions part of speech list
                for tup in pos_lol[i]:
                    
                    if tup[0] == key[0]:      # checking wheather pos instance is same as 1gram in consideration
                        temp_S.append(tup[1]) # tup[1] is tag assigned corresponding to 1gram
                        
                        # computing probability score for part of speech tags
                        score_si += math.log(pos_dic[tup[1]]/len_synt)
        
        # appending scores, 1grams and pos tags
        synt_scores.append(abs(score_si))
        lex_scores_1gram.append(abs(score_li))
        
        Lexical_1gram.append(temp1)
        Syntactic.append(temp_S)
        
        
        # initialising empty list to store 2gram
        temp2 = []
        
        score_li = 0
        for key in lex_f_2gram[i]:
            
            if gram2[key] != 0:     # checking weather given key(2gram) is present in most frequent 300 or not
                temp2.append(key)
                
                # computing conditional probability score for 2gram
                p1 = lex_f2_dic[key]/len_2gram
                p2 = lex_f1_dic[(key[0],)]/len_1gram         # (What,is) -> key, P(is|what) = P(What,is)/P(what)
                score_li += math.log(p1/p2)
        
        # appending scores and 2grams
        lex_scores_2gram.append(abs(score_li))
        Lexical_2gram.append(temp2)
        
        
        # initialising empty list to store 3gram
        temp3 = []
        
        score_li = 0
        for key in lex_f_3gram[i]:
            if gram3[key] != 0:     # checking weather given key(3gram) is present in most frequent 200 or not
                temp3.append(key)
                
                # computing conditional probability score for 3gram
                # (what, is, the) -> key ,
                
                p1 = lex_f3_dic[key]/len_3gram              #  p1 = P(what, is, the)
                p2 = lex_f2_dic[(key[0],key[1])]/len_2gram  #  p2 = P(What,is)
                p3 = lex_f1_dic[(key[0],)]/len_1gram        #  p1 = P(what)
              
                # P(is|what) = p2/p1
                
                # final =  P(the|what,is) = P(what, is, the)/P(is|what)
                
                score_li += math.log(p1/(p2/p3))
        
        # appending scores and 3grams
        lex_scores_3gram.append(abs(score_li))
        Lexical_3gram.append(temp3)
    # Creating new columns of features
    trdf['Length'] = length_q
    trdf['Lexical_1gram'] = Lexical_1gram
    trdf['Lex_Prob_Scores_1gram'] = lex_scores_1gram
    trdf['Lexical_2gram'] = Lexical_2gram
    trdf['Lex_Prob_Scores_2gram'] = lex_scores_2gram
    trdf['Lexical_3gram'] = Lexical_3gram
    trdf['Lex_Prob_Scores_3gram'] = lex_scores_3gram
    trdf['Syntactic'] = Syntactic
    trdf['Syntactic_Prob_Scores'] = synt_scores
    return trdf

## defining values for features using create_features

In [7]:
# create features for total dataset
total_df = create_features(total_df)
total_df.head()

Unnamed: 0,Class,Question,Length,Lexical_1gram,Lex_Prob_Scores_1gram,Lexical_2gram,Lex_Prob_Scores_2gram,Lexical_3gram,Lex_Prob_Scores_3gram,Syntactic,Syntactic_Prob_Scores
0,DESC,How did serfdom develop in and then leave Russ...,9,"[How, did, in, and, then]",26.007478,"[(How, did)]",3.226824,[],0.0,"[WRB, VBD, IN, CC, RB]",18.242043
1,ENTY,What films featured the character Popeye Doyle,7,"[What, the, character]",12.625627,[],0.0,[],0.0,"[WP, DT, NN]",6.475343
2,DESC,How can I find a list of celebrities real names,10,"[How, can, I, find, a, list, of, real, names]",53.465525,"[(How, can), (can, I), (I, find), (find, a)]",5.339126,"[(How, can, I), (can, I, find), (I, find, a)]",17.262693,"[WRB, MD, PRP, VB, DT, NN, IN, JJ, NNS]",29.197039
3,ENTY,What fowl grabs the spotlight after the Chines...,12,"[What, the, after, the, Chinese, of, the]",29.855507,"[(of, the)]",1.126969,[],0.0,"[WP, DT, DT, DT, IN, DT, DT, DT, JJ, IN, DT, D...",30.417258
4,ABBR,What is the full form of .com,7,"[What, is, the, full, form, of]",28.578077,"[(What, is), (is, the)]",1.670732,"[(What, is, the)]",3.132457,"[WP, VBZ, DT, JJ, NN, IN]",14.47799


In [16]:
total_df.iloc[0:5452,[0,1,3,5,7,9]]

Unnamed: 0,Class,Question,Lexical_1gram,Lexical_2gram,Lexical_3gram,Syntactic
0,DESC,How did serfdom develop in and then leave Russ...,"[How, did, in, and, then]","[(How, did)]",[],"[WRB, VBD, IN, CC, RB]"
1,ENTY,What films featured the character Popeye Doyle,"[What, the, character]",[],[],"[WP, DT, NN]"
2,DESC,How can I find a list of celebrities real names,"[How, can, I, find, a, list, of, real, names]","[(How, can), (can, I), (I, find), (find, a)]","[(How, can, I), (can, I, find), (I, find, a)]","[WRB, MD, PRP, VB, DT, NN, IN, JJ, NNS]"
3,ENTY,What fowl grabs the spotlight after the Chines...,"[What, the, after, the, Chinese, of, the]","[(of, the)]",[],"[WP, DT, DT, DT, IN, DT, DT, DT, JJ, IN, DT, D..."
4,ABBR,What is the full form of .com,"[What, is, the, full, form, of]","[(What, is), (is, the)]","[(What, is, the)]","[WP, VBZ, DT, JJ, NN, IN]"
...,...,...,...,...,...,...
5447,ENTY,What's the shape of a camel's spine,"[What, 's, the, of, a, 's]","[(What, 's), ('s, the), (of, a)]","[(What, 's, the)]","[WP, VBZ, POS, DT, IN, DT, VBZ, POS]"
5448,ENTY,What type of currency is used in China,"[What, type, of, is, used, in, China]","[(What, type), (type, of), (used, in)]","[(What, type, of)]","[WP, NN, IN, VBZ, VBN, IN, NNP]"
5449,NUM,What is the temperature today,"[What, is, the]","[(What, is), (is, the)]","[(What, is, the)]","[WP, VBZ, DT]"
5450,NUM,What is the temperature for cooking,"[What, is, the, for]","[(What, is), (is, the)]","[(What, is, the)]","[WP, VBZ, DT, IN]"


# Creating Utillity functions and classes for tree nodes and classifier

In [8]:
# exploring distribution of values in features
def explore_features(df):
    cols = list(df.columns)
    cols.pop()
    for col in cols:
        x = df.loc[:,col]
        print("For",col)
        print("\tMin:",np.min(x))
        print("\tMax:",np.max(x))
        print("\tUnique:",len(np.unique(x)))

In [9]:
# to get training and validation set indexes for k-fold
def apply_k_fold(df,n):
    # prepare cross validation
    kfold = KFold(n)
    train_set = []
    validn_set = []

    # enumerate splits
    for train, validn in kfold.split(df):
        train_set.append(train)
        validn_set.append(validn)
    
    return (train_set,validn_set)

In [10]:
# tree node class
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        
        # for leaf node
        self.value = value

In [None]:
# classifier class
class DecisionTreeClassifier():
    def __init__(self, min_samples_split, max_depth):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.itr = 0
        
    def build_tree(self, dataset, model, curr_depth):
        ''' recursive function to build the tree ''' 
        
        X, Y = dataset.iloc[:,:-1], dataset.iloc[:,-1]
        num_samples, num_features = np.shape(X)
        uniq = len(np.unique(Y))
        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and uniq > 1 and curr_depth < self.max_depth:
            
            # find the best split
            print("Splitting for",self.itr,"th node")
            best_split = self.get_best_split(dataset, num_samples, num_features,model)
            
            print("\n\n\n**************************************************\n\n\n")
            
            # check if information gain is positive
            if best_split["info_gain"] > 0:
                
                self.itr += 1
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], model, curr_depth+1)
                
                self.itr += 1
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], model, curr_depth+1)
                
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value = leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features, model):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        best_split["info_gain"] = 0
        max_info_gain = 0

        # loop over all the features
        for feature_index in range(num_features):
            
            # ['Length', 'Lex_Prob_Scores_1gram', 'Lex_Prob_Scores_2gram', 'Lex_Prob_Scores_3gram', 'Syntactic_Prob_Scores']
            cols = list(dataset.columns)
            
            print("\n\n\tCurrently splitting over :",list(dataset.columns)[feature_index])
            
            feature_values = list(dataset.iloc[:, feature_index])
            
            a = int(min(feature_values))
            b = int(max(feature_values))
            if cols[feature_index] in ['Length','Lex_Prob_Scores_2gram','Lex_Prob_Scores_3gram'] :
                div = 1
            else:
                div = 2
            
            possible_thresholds = [i for i in range(a,b,div)]
            print("\tTotal no. of thersholds possible:",len(possible_thresholds))
            
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                
                
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                left_len = len(dataset_left)
                right_len = len(dataset_right)
                
                # check if childs are not null
                
                if left_len>0 and right_len>0:
                    y, left_y, right_y = dataset.iloc[:, -1], dataset_left.iloc[:, -1], dataset_right.iloc[:, -1]
                    
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, model)
                    
                    
                    # update the best split if needed
                    if curr_info_gain > max_info_gain:
                        print("\n\t\tConsidering threshold as:",threshold)
                        print("\t\tleft_len:",left_len,"\tright_len:",right_len)
                        print("\t\tCurrent Gain is: ",curr_info_gain,"\tMaximum gain is: ",max_info_gain)
                        max_info_gain = curr_info_gain
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["info_gain"] = curr_info_gain
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        
        return best_split
    
    # to split dataset into two parts based on given feature and threshold
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        ds_L = []
        ds_R = []
        for i in range(len(dataset)):
            row = dataset.iloc[i,:] 
            if row[feature_index] <= threshold:
                ds_L.append(row)
            else:
                ds_R.append(row)

        dataset_left = pd.DataFrame(ds_L)
        dataset_right = pd.DataFrame(ds_R)
        
        return dataset_left, dataset_right
    
    # calculate info gain based on type of model
    def information_gain(self, parent, l_child, r_child, model):
        ''' function to compute information gain '''
        
        weight_l = len(l_child) / len(parent)     # ni/n
        weight_r = len(r_child) / len(parent)
        if model == "gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        elif model == "cross_entropy":
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        else:
            gain = self.mce(parent) - (weight_l*self.mce(l_child) + weight_r*self.mce(r_child))
        return gain
    
    # calculate gini_index
    def gini_index(self, y):
        
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
    
    # calculate cross-entropy
    def entropy(self, y):
        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    # calculate misclassification error
    def mce(self, y):
        
        class_labels = np.unique(y)
        mce = 0
        max_p = -float("inf")
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            if p_cls > max_p:
                max_p = p_cls
        return 1 - max_p
    
    # compute leaf node class label
    def calculate_leaf_value(self, Y):
        Y = list(Y)
        return max(Y, key=Y.count)
    
    # start training tree with given data and model
    def fit(self,dataset,model):
        
        self.root = self.build_tree(dataset,model,0)
    
    # predict new dataset class labels and return as list
    def predict(self, X):
        
        preditions = []
        for i in range(len(X)):
            row = X.iloc[i,:]
            preditions.append(self.make_prediction(row, self.root))
        return preditions
    
    # predict a single data element's class label and return it 
    def make_prediction(self, row, tree):
        
        if tree.value != None: return tree.value
        
        feature_val = row[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(row, tree.left)
        else:
            return self.make_prediction(row, tree.right)

In [11]:
# getting the classifier object

def get_trained_classfier(tr_df,model):
    
    classifier = DecisionTreeClassifier(min_samples_split=2, max_depth=10)
    classifier.fit(tr_df,model)
    return classifier


# making predictions and creating confusion matrix

def prediction_results(classifier,test_df):
    
    classes = list(np.unique(test_df['Class']))
    class_actual = list(test_df.iloc[:,-1])
    class_predicted = classifier.predict(test_df.iloc[:,:-1])

    total = len(test_df)
    confusion_matrix = pd.DataFrame(0, index = classes, columns = classes)

    for i in range(total):
        confusion_matrix.loc[class_actual[i],class_predicted[i]] += 1
    prec = []
    rec = []
    f1 = []
    sum_all = sum(sum(np.array(confusion_matrix.iloc[:,:])))
    sum_tp = 0
    for cls in classes:
        tp = confusion_matrix.loc[cls,cls]
        sum_tp += tp
        ap = sum(confusion_matrix.loc[cls,:])
        pp = sum(confusion_matrix.loc[:,cls])
        if pp == 0:
            p = 0
        else:
            p = tp/pp
        if ap == 0:
            r = 0
        else:
            r = tp/ap
        if p+r == 0:
            f = 0
        else:
            f = 2*p*r/(p+r)
        prec.append(round(p*100,2))
        rec.append(round(r*100,2))
        f1.append(round(f*100,2))
            
    confusion_matrix.loc['Precision'] = prec
    confusion_matrix.loc['Recall'] = rec
    confusion_matrix.loc['f1_score'] = f1
    accuracy = round(100*sum_tp/sum_all,2)
    return (accuracy,confusion_matrix)

# Making required datsets for training and testing

In [12]:
# Separating training and testing data based on feature combination set chosen
lists = [[2,4,6,8,10,0],[2,4,6,8,0],[2,4,6,0],[2,4,0]]
tr_df_list = []
ts_df_list = []
for i in range(len(lists)):
    temp = total_df.iloc[0:5452,lists[i]].copy()
    tr_df_list.append(temp)
    temp = total_df.iloc[5452:5953,lists[i]].copy()
    ts_df_list.append(temp)

In [13]:
tr_df_list[0]

Unnamed: 0,Length,Lex_Prob_Scores_1gram,Lex_Prob_Scores_2gram,Lex_Prob_Scores_3gram,Syntactic_Prob_Scores,Class
0,9,26.007478,3.226824,0.000000,18.242043,DESC
1,7,12.625627,0.000000,0.000000,6.475343,ENTY
2,10,53.465525,5.339126,17.262693,29.197039,DESC
3,12,29.855507,1.126969,0.000000,30.417258,ENTY
4,7,28.578077,1.670732,3.132457,14.477990,ABBR
...,...,...,...,...,...,...
5447,9,20.895701,7.445157,2.726126,23.981544,ENTY
5448,8,36.319544,7.074357,2.619847,17.918298,ENTY
5449,5,8.542456,1.670732,3.132457,7.550124,NUM
5450,6,13.246662,1.670732,3.132457,9.910685,NUM


In [14]:
explore_features(tr_df_list[0])

For Length
	Min: 2
	Max: 33
	Unique: 28
For Lex_Prob_Scores_1gram
	Min: 0.0
	Max: 145.28172318224554
	Unique: 4437
For Lex_Prob_Scores_2gram
	Min: 0.0
	Max: 22.95301614402827
	Unique: 1857
For Lex_Prob_Scores_3gram
	Min: 0.0
	Max: 32.44418642051975
	Unique: 410
For Syntactic_Prob_Scores
	Min: 0.0
	Max: 132.7605092643911
	Unique: 3290


# Training and testing

In [None]:
# applying kfold for training data and getting evaluation metrics

dataset = tr_df_list[0]
train_set, validn_set = apply_k_fold(dataset,10)

classf_list = []
result_kfolds = []

mod = 'gini'

for i in range(1, 10):
    
    print("\n\n\n********************* Kfold iteration",i,"*********************\n\n")
    train_set_i = dataset.iloc[list(train_set[i]),:]
    validation_set_i = dataset.iloc[list(validn_set[i
                                                    
    c = get_trained_classfier(train_set_i,mod)
    classf_list.append(c)
                                                    
    r = prediction_results(c,validation_set_i)
    result_kfolds.append(r)

In [None]:
# Getting evaluation metrics for all three models over different combination of features

model_set = ['gini','misclsfn_err','cross_entropy']
# model_scores = {}                # (key,value) = (accuracy,confusion matrix)
classifers_model_wise = []
results_model_wise = []
for model in model_set:
    print("\n\n\n\n\n*********************************** For ",model,"index *****************************************\n\n\n\n\n")
    classifers_feature_wise = []
    results_feature_wise = []
#     if model == 'gini':
#         skip = 1
    for i in [0,1,2,3]:
#         if i in [1,2,3] and skip == 1:
#             continue
        print("\n\n********************* SET ",i,"*********************\n\n\n\n\n")
        clsfier = get_trained_classfier(tr_df_list[i],model)
        classifers_feature_wise.append(clsfier)
        
        res = prediction_results(clsfier,ts_df_list[i])
        results_feature_wise.append(res)
    
    classifers_model_wise.append(classifers_feature_wise)
    results_model_wise.append(results_feature_wise)

In [None]:
# creating all possible combinations of feature indexes
li = [2,4,6,8,10]
list_comb = []
for L in range(0, len(li)+1):
    for subset in itertools.combinations(li, L):
        if len(subset) > 1:
            lis = list(subset)
            lis.append(0)
            list_comb.append(lis)

In [None]:
# checking feature ablation results
cols = list(total_df.columns)
feature_ablation_results =[]
for lis in list_comb:
#     print("Considering feature set")
#     for index in lis:
#         print(cols[])
    print("\n******************************* Considering list set",lis,"**************************\n\n")
    trds = total_df.iloc[0:5452,lis].copy()
    tsds = total_df.iloc[5452:5953,lis].copy()
    feature_ablation_results.append(prediction_results(get_trained_classfier(trds,'gini'),tsds))