In [30]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import string
import operator
from copy import deepcopy
from math import log2
from statistics import mean
from collections import Counter

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [31]:
def get_ngrams(data, n):
    tokens = [token for token in data.split(" ") if token != ""]
    return list(ngrams(tokens, n))

def get_postag(txt):
    stop_words = set(stopwords.words('english'))
    tokenized = sent_tokenize(txt)
    words_list = nltk.word_tokenize(tokenized[0]) 
    words_list = [w for w in words_list if not w in stop_words]  
    return nltk.pos_tag(words_list)

def build_data(path_to_data):
    data,uni,bi,tri,pos = [],[],[],[],[]
    
    file = open(path_to_data, encoding = 'unicode_escape')

    for line in file:
        line = line.split(':')
        row = []
        
        _class, _question = line[0], line[1]
        row.append(_class)
        row.append(' '.join(_question.split(' ')[1:]).translate(str.maketrans('', '', string.punctuation)).rstrip())

        length = len(row[1].split(' '))
        row.append(length)

        unigram = get_ngrams(row[1], 1)
        row.append(unigram)
        uni.extend(unigram)

        bigram = get_ngrams(row[1], 2)
        row.append(bigram)
        bi.extend(bigram)
        
        trigram = get_ngrams(row[1], 3)
        row.append(trigram)
        tri.extend(trigram)

        postag = get_postag(row[1])
        row.append(postag)
        pos.extend(postag)
        
        # complete set of features for each text 
        data.append(row)

    return data, uni, bi, tri, pos

# load training data
data, uni, bi, tri, pos = build_data('train_ai09.txt')
print('Loading Training data...')
print('Training Data:')
print(data[0:5])

Loading Training data...
Training Data:
[['DESC', 'How did serfdom develop in and then leave Russia', 9, [('How',), ('did',), ('serfdom',), ('develop',), ('in',), ('and',), ('then',), ('leave',), ('Russia',)], [('How', 'did'), ('did', 'serfdom'), ('serfdom', 'develop'), ('develop', 'in'), ('in', 'and'), ('and', 'then'), ('then', 'leave'), ('leave', 'Russia')], [('How', 'did', 'serfdom'), ('did', 'serfdom', 'develop'), ('serfdom', 'develop', 'in'), ('develop', 'in', 'and'), ('in', 'and', 'then'), ('and', 'then', 'leave'), ('then', 'leave', 'Russia')], [('How', 'WRB'), ('serfdom', 'JJ'), ('develop', 'VB'), ('leave', 'JJ'), ('Russia', 'NNP')]], ['ENTY', 'What films featured the character Popeye Doyle', 7, [('What',), ('films',), ('featured',), ('the',), ('character',), ('Popeye',), ('Doyle',)], [('What', 'films'), ('films', 'featured'), ('featured', 'the'), ('the', 'character'), ('character', 'Popeye'), ('Popeye', 'Doyle')], [('What', 'films', 'featured'), ('films', 'featured', 'the'), ('

In [32]:
def top_grams(grams, top_n):
    return Counter(grams).most_common(top_n)

unigram_counts = top_grams(uni, 500)
bigram_counts = top_grams(bi, 300)
trigram_counts = top_grams(tri, 200)
pos_counts = top_grams(pos, 500)

avg_length = mean([row[2] for row in data])
print('average length:',avg_length)

# Displaying the top features
print('Top features:\n')
print('Unigrams:\n')
print(unigram_counts[0:5])

print('Bigrams:\n')
print(bigram_counts[0:5])

print('Trigrams:\n')
print(trigram_counts[0:5])

#
print('Pos Counts:\n')
print(pos_counts[0:5])

average length: 9.031548055759353
Top features:

Unigrams:

[(('the',), 3589), (('What',), 3245), (('is',), 1669), (('of',), 1540), (('in',), 1131)]
Bigrams:

[(('What', 'is'), 968), (('is', 'the'), 757), (('of', 'the'), 446), (('in', 'the'), 326), (('How', 'many'), 316)]
Trigrams:

[(('What', 'is', 'the'), 551), (('What', 'is', 'a'), 151), (('What', 's', 'the'), 135), (('What', 'are', 'the'), 134), (('What', 'was', 'the'), 130)]
Pos Counts:

[(('What', 'WP'), 3245), (('How', 'WRB'), 763), (('Who', 'WP'), 559), (('many', 'JJ'), 332), (('Where', 'WRB'), 273)]


In [33]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

header = ['Label', 'Text', 'Length', 'Unigram', 'Bigram', 'Trigram', 'POS']

class Question:
    def __init__(self, col, value):
        self.col = col # The column number in the header
        self.value = value # Actual value of the object

    # Matching attributes of current question with the current row
    def match(self, example):
        val = example[self.col]
        if is_numeric(val):
            return val <= self.value
        
        return self.value in val

    # Return the string representation of the object
    def __repr__(self):
        condition = "contains"
        return "Does %s %s %s?" % (
            header[self.col], condition, str(self.value))

In [34]:
def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[0]

        if label not in counts:
            counts[label] = 0
        
        counts[label] += 1
    return counts

def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

def misclassifcation_error(rows):
    counts = class_counts(rows)
    max_prob = 0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        if prob_of_lbl > max_prob:
            max_prob = prob_of_lbl
    return 1 - max_prob

def entropy(rows):
    counts = class_counts(rows)
    impurity = 0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl*log2(prob_of_lbl)
    return impurity

def info_gain(left, right, current_uncertainty, func):
    p = float(len(left))/(len(left)+len(right))
    return current_uncertainty - p*func(left) - (1-p)*func(right)

In [35]:
class Leaf:
    def __init__(self, rows):
        self.predictions = class_counts(rows)

In [36]:
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [37]:
questions = []

for x in unigram_counts:
    questions.append(Question(3, x[0]))

for x in bigram_counts:
    questions.append(Question(4, x[0]))
    
for x in trigram_counts:
    questions.append(Question(5, x[0]))

for x in pos_counts:
    questions.append(Question(6, x[0]))
    
questions.append(Question(2, avg_length))    
    
print(len(questions))
print(questions[0])

1501
Does Unigram contains ('the',)?


In [38]:
# Returns a split of true rows and false rows for a particular question (single feature)
def partition(rows, question):
    rows_true = []
    rows_false = []
    
    for r in rows:
        if question.match(r):
            rows_true.append(r)
        else:
            rows_false.append(r)
    
    return rows_true, rows_false

In [39]:
def find_best_split(rows, questions, func):   
    best_gain = 0
    best_question = None
    current_uncertainty = func(rows)
    
    for q in questions:
        rows_true, rows_false = partition(rows, q)
        if len(rows_true) == 0 or len(rows_false) == 0:
            continue
        
        gain = info_gain(rows_true, rows_false, current_uncertainty, func) # Calculating the information gain
        # Updating best gain
        if gain >= best_gain:
            best_gain, best_question = gain, q
    
    return best_gain, best_question  

In [40]:
# Recursive Function to form the decision tree
# using partitioning (question list is updated periodically)
def form_tree(rows, questions, func):
    # Find the best gain and best question
    gain, question = find_best_split(rows, questions, func)
    if gain == 0:
        return Leaf(rows)
    
    rows_true, rows_false = partition(rows, question)
    questions.remove(question)
    
    true_branch = form_tree(rows_true, questions, func)
    false_branch = form_tree(rows_false, questions, func)
    
    return Decision_Node(question, true_branch, false_branch)

In [41]:
def classify_row(node, row):
    if isinstance(node, Leaf):
        return node.predictions
    
    if node.question.match(row):
        return classify_row(node.true_branch, row)
    else:
        return classify_row(node.false_branch, row)

In [42]:
def train(data, questions, func):
    return form_tree(data, deepcopy(questions), func)

def classify(root, rows):
    predictions = [max(classify_row(root, r).items(), key=operator.itemgetter(1))[0] for r in rows]
    return predictions

In [43]:
def get_data_in_index(data, index):
    l = []
    for i in range(len(data)):
        if i in index:
            l.append(data[i])
    return l

def get_actual_labels(act_data):
    act_labels = []
    
    for d in act_data:
        act_labels.append(d[0])
    
    return act_labels

In [44]:
kfold = KFold(n_splits=10, shuffle=True)

precision,recall,f_score = [],[],[]
i = 0

for trainInd, testInd in kfold.split(data):
  
    train_data = get_data_in_index(data, trainInd)
    test_data = get_data_in_index(data, testInd)
    
    root = train(train_data, questions, gini)
    prediction = classify(root, test_data)
    actual = get_actual_labels(test_data)
    predicted = prediction
    
    precision.append(precision_score(actual, predicted, average='macro'))
    recall.append(recall_score(actual, predicted, average='macro'))
    f_score.append(f1_score(actual, predicted, average='macro'))
     
    print("Training...")

print('\nGini Index')
print("Precision Score = "+str(mean(precision)))
print("Recall Score = "+str(mean(recall)))
print("F Score = "+str(mean(f_score)))

Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...

Gini Index
Precision Score = 0.7948767257883729
Recall Score = 0.7480175478838001
F Score = 0.7638010583721782


In [47]:
classes = ['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']

def getReport(train_data, test_data, uniFlag=True, biFlag=True, triFlag=True, posFlag=True, lenFlag=True, func=gini):
    allQuestions = []
    
    if uniFlag:
        for x in unigram_counts:
            allQuestions.append(Question(3, x[0]))

    if biFlag:
        for x in bigram_counts:
            allQuestions.append(Question(4, x[0]))

    if triFlag:
        for x in trigram_counts:
            allQuestions.append(Question(5, x[0]))

    if posFlag:
        for x in pos_counts:
            allQuestions.append(Question(6, x[0]))

    if lenFlag:
        allQuestions.append(Question(2, avg_length))    

    print("No of questions = " + str(len(allQuestions)))

    print("Training...")
    root = train(train_data, allQuestions, func)
    
    print("Predicting...")
    prediction = classify(root, test_data)        
    actual = get_actual_labels(test_data)
    
    print("Prediction done...")
    matrix = confusion_matrix(actual, prediction)
    class_report = classification_report(actual, prediction)
    acc = matrix.diagonal()/matrix.sum(axis=1)
    accuracy_report = dict(zip(classes, acc))
    
    return accuracy_report, class_report, root, prediction, actual

test_data = build_data('trec_ai09.txt')[0]
print(len(test_data))

500


In [48]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data)
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.723404255319149, 'HUM': 0.8461538461538461, 'LOC': 0.7037037037037037, 'NUM': 0.8141592920353983}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.76      0.97      0.85       138
        ENTY       0.68      0.72      0.70        94
         HUM       0.92      0.85      0.88        65
         LOC       0.89      0.70      0.79        81
         NUM       0.99      0.81      0.89       113

    accuracy                           0.82       500
   macro avg       0.85      0.79      0.81       500
weighted avg       0.84      0.82      0.82       500



In [49]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, func=entropy)
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.5, 'HUM': 0.8615384615384616, 'LOC': 0.7283950617283951, 'NUM': 0.8053097345132744}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.66      0.97      0.79       138
        ENTY       0.69      0.50      0.58        94
         HUM       0.90      0.86      0.88        65
         LOC       0.88      0.73      0.80        81
         NUM       0.98      0.81      0.88       113

    accuracy                           0.79       500
   macro avg       0.83      0.76      0.78       500
weighted avg       0.81      0.79      0.78       500



In [50]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, func=misclassifcation_error)
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8260869565217391, 'ENTY': 0.7978723404255319, 'HUM': 0.8461538461538461, 'LOC': 0.691358024691358, 'NUM': 0.7876106194690266}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.77      0.83      0.79       138
        ENTY       0.57      0.80      0.66        94
         HUM       0.92      0.85      0.88        65
         LOC       0.92      0.69      0.79        81
         NUM       0.98      0.79      0.87       113

    accuracy                           0.79       500
   macro avg       0.83      0.77      0.79       500
weighted avg       0.82      0.79      0.80       500



In [51]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False)
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.723404255319149, 'HUM': 0.8461538461538461, 'LOC': 0.7037037037037037, 'NUM': 0.8141592920353983}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.76      0.97      0.85       138
        ENTY       0.68      0.72      0.70        94
         HUM       0.92      0.85      0.88        65
         LOC       0.89      0.70      0.79        81
         NUM       0.99      0.81      0.89       113

    accuracy                           0.82       500
   macro avg       0.85      0.79      0.81       500
weighted avg       0.84      0.82      0.82       500



In [52]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, func=entropy)
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.5, 'HUM': 0.8615384615384616, 'LOC': 0.7283950617283951, 'NUM': 0.8053097345132744}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.66      0.97      0.79       138
        ENTY       0.69      0.50      0.58        94
         HUM       0.90      0.86      0.88        65
         LOC       0.88      0.73      0.80        81
         NUM       0.98      0.81      0.88       113

    accuracy                           0.79       500
   macro avg       0.83      0.76      0.78       500
weighted avg       0.81      0.79      0.78       500



In [53]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, func=misclassifcation_error)
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8260869565217391, 'ENTY': 0.7978723404255319, 'HUM': 0.8461538461538461, 'LOC': 0.691358024691358, 'NUM': 0.7787610619469026}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.77      0.83      0.79       138
        ENTY       0.56      0.80      0.66        94
         HUM       0.92      0.85      0.88        65
         LOC       0.92      0.69      0.79        81
         NUM       0.98      0.78      0.87       113

    accuracy                           0.79       500
   macro avg       0.83      0.77      0.79       500
weighted avg       0.82      0.79      0.80       500



In [54]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False)
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9782608695652174, 'ENTY': 0.6276595744680851, 'HUM': 0.8461538461538461, 'LOC': 0.654320987654321, 'NUM': 0.7699115044247787}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.73      0.98      0.84       138
        ENTY       0.60      0.63      0.61        94
         HUM       0.87      0.85      0.86        65
         LOC       0.88      0.65      0.75        81
         NUM       1.00      0.77      0.87       113

    accuracy                           0.79       500
   macro avg       0.82      0.76      0.78       500
weighted avg       0.81      0.79      0.79       500



In [55]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False, func=entropy)
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.427536231884058, 'ENTY': 0.648936170212766, 'HUM': 0.8769230769230769, 'LOC': 0.6296296296296297, 'NUM': 0.7699115044247787}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.57      0.43      0.49       138
        ENTY       0.35      0.65      0.45        94
         HUM       0.93      0.88      0.90        65
         LOC       0.82      0.63      0.71        81
         NUM       0.97      0.77      0.86       113

    accuracy                           0.64       500
   macro avg       0.75      0.67      0.69       500
weighted avg       0.71      0.64      0.66       500



In [57]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False, func=misclassifcation_error)
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8188405797101449, 'ENTY': 0.7340425531914894, 'HUM': 0.8, 'LOC': 0.654320987654321, 'NUM': 0.7876106194690266}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.75      0.82      0.78       138
        ENTY       0.50      0.73      0.60        94
         HUM       0.96      0.80      0.87        65
         LOC       0.88      0.65      0.75        81
         NUM       0.98      0.79      0.87       113

    accuracy                           0.76       500
   macro avg       0.82      0.74      0.77       500
weighted avg       0.81      0.76      0.77       500



In [58]:
def get_wrong_prediction(prediction, actual, dataset):
    data_list = [dataset[i] for i in range(len(prediction)) if prediction[i] != actual[i]]
    return data_list

In [59]:
_ , class_matrix, root_gini, prediction_gini, actual_gini  = getReport(train_data=data, test_data=test_data)
wrong_data = get_wrong_prediction(prediction_gini, actual_gini, test_data)

No of questions = 1501
Training...
Predicting...
Prediction done...


In [60]:
# Printing the wrong data length
print('Len of wrong data for gini', len(wrong_data))

Len of wrong data for gini 88


In [61]:
_ , class_matrix, root_entropy, prediction_entropy, actual_entropy  = getReport(train_data=data, test_data=wrong_data, func=entropy)
wrong_data_en = get_wrong_prediction(prediction_entropy, actual_entropy, wrong_data)
print('Len of wrong data for entropy is', len(wrong_data_en))

No of questions = 1501
Training...
Predicting...
Prediction done...
Len of wrong data for entropy is 78


In [62]:
_ , class_matrix, root_mis, prediction_mis, actual_mis  = getReport(train_data=data, test_data=wrong_data, func=misclassifcation_error)
wrong_data_mis = get_wrong_prediction(prediction_entropy, actual_entropy, wrong_data)
print('Len of wrong data for misclassifcation_error is', len(wrong_data_mis))

No of questions = 1501
Training...
Predicting...
Prediction done...
Len of wrong data for misclassifcation_error is 78


In [78]:
print('Entropy correctly classifies', (len(wrong_data) - len(wrong_data_en)), ' more records as compared to GINI metric')
_entropyVsGini = ((len(wrong_data) - len(wrong_data_en)) / len(wrong_data)) * 100
print('Percentage of samples corrected by Entropy over GINI Index = ' + str(_entropyVsGini)  )

Entropy correctly classifies 10  more records as compared to GINI metric
Percentage of samples corrected by Entropy over GINI Index = 11.363636363636363


In [79]:
print('Misclassification error correctly classifies', (len(wrong_data) - len(wrong_data_mis)), ' more records as compared to GINI metric')
_misclassificationVsGini = ((len(wrong_data) - len(wrong_data_mis)) / len(wrong_data)) * 100
print('Percentage of samples corrected by Misclassification over GINI Index = ' + str(_misclassificationVsGini)  )

Misclassification error correctly classifies 10  more records as compared to GINI metric
Percentage of samples corrected by Misclassification over GINI Index = 11.363636363636363


In [None]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data, test_data, uniFlag=False, biFlag=True, triFlag=True, posFlag=True, lenFlag=True, func=gini)
print(accuracy_report)
print(class_report)


In [82]:
accuracy_report, class_report, root, prediction, actual = getReport(train_data, test_data, uniFlag=False, biFlag=True, triFlag=True, posFlag=True, lenFlag=True, func=entropy)
print(accuracy_report)
print(class_report)


No of questions = 1001
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.574468085106383, 'HUM': 0.8, 'LOC': 0.6172839506172839, 'NUM': 0.7079646017699115}
              precision    recall  f1-score   support

        ABBR       0.75      0.67      0.71         9
        DESC       0.69      0.97      0.81       138
        ENTY       0.53      0.57      0.55        94
         HUM       0.90      0.80      0.85        65
         LOC       0.88      0.62      0.72        81
         NUM       0.98      0.71      0.82       113

    accuracy                           0.75       500
   macro avg       0.79      0.72      0.74       500
weighted avg       0.78      0.75      0.75       500

