In [156]:
import pandas as pd
import numpy as np
import os
import math
import queue
import nltk
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

Open Files

In [157]:
# Build the paths with os.path.join instead of hand-written backslashes: the
# original literals contained invalid escape sequences ("\X", "\y", "\D"),
# which newer Python versions warn about, and only worked on Windows.
train_file_x_loc = os.path.join('data_2', 'Dataset2_train', 'X_train.xlsx')
train_file_y_loc = os.path.join('data_2', 'Dataset2_train', 'y_train.xlsx')
test_file_loc = os.path.join('data_2', 'Dataset2_test', 'X_test.xlsx')

# Load the training features, training labels and test features as DataFrames.
train_data_x = pd.read_excel(train_file_x_loc)
train_data_y = pd.read_excel(train_file_y_loc)
test_data = pd.read_excel(test_file_loc)

KeyboardInterrupt: 

Convert to numpy array

In [None]:
# Pull the underlying values out of each DataFrame and collapse them into
# one-dimensional copies in a single step per dataset.
tr_x = train_data_x.values.flatten()
tr_y = train_data_y.values.flatten()
test = test_data.values.flatten()

In [None]:
# The stop-word set does not change between phrases, so build it once
# instead of re-reading the corpus on every iteration.
stops = set(stopwords.words('english'))

# Turn every training phrase into a list of lemmatized, lower-cased,
# alphabetic-only tokens with stop words removed.
reviews = []
for raw in tqdm(train_data_x['Phrase']):
    text = BeautifulSoup(raw).get_text()            # strip any markup
    only_text = re.sub('[^a-zA-Z]', ' ', text)      # keep letters only
    words = word_tokenize(only_text.lower())
    non_stopwords = [word for word in words if word not in stops]
    lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]
    reviews.append(lemma_words)

100%|██████████| 124848/124848 [02:54<00:00, 715.90it/s]


In [None]:
# Collect the vocabulary and the length of the longest token list.
unique_words = set()
len_max = 0
for sent in tqdm(reviews):
    unique_words.update(sent)
    len_max = max(len_max, len(sent))

# Keras word indices start at 1 and the Tokenizer only keeps indices that are
# strictly below num_words, so num_words must be vocabulary size + 1;
# otherwise the least frequent word is silently dropped from every sequence.
tokenizer = Tokenizer(num_words=len(unique_words) + 1)
tokenizer.fit_on_texts(list(reviews))

# Encode each review as a sequence of word indices, padded to len_max.
X_train = tokenizer.texts_to_sequences(reviews)
X_train = sequence.pad_sequences(X_train, maxlen=len_max)



100%|██████████| 124848/124848 [00:00<00:00, 264897.51it/s]


Entropy calculation function

In [None]:
# The five label values used by this notebook.
labels = list(range(5))

# Number of occurrences of each label value in the flattened training targets.
count = np.bincount(tr_y)

# Take bin count of each label value as input
def get_entropy(count):
    """Return the Shannon entropy (in bits) of a label distribution.

    Parameters
    ----------
    count : array-like of non-negative numbers
        Occurrence count per label value, e.g. the output of ``np.bincount``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the labels with a non-zero count. An empty
        or all-zero ``count`` has no uncertainty to measure and yields 0.0
        (instead of the NaN a 0/0 division would produce).
    """
    counts = np.asarray(count, dtype=float)
    num_data = counts.sum()

    # Nothing was counted: avoid dividing by zero.
    if num_data <= 0:
        return 0.0

    # Labels that never occur contribute nothing (lim p->0 of p*log2(p) is 0).
    probs = counts[counts > 0] / num_data
    return float(-np.sum(probs * np.log2(probs)))

124848


Class for the decision tree

In [None]:
class Node:
    """A single node of the binary decision tree.

    Attributes are filled in while the tree is grown:
    ``left`` / ``right`` are the child nodes (``None`` for a leaf),
    ``data`` holds the rows that reached this node (features plus label),
    ``f_in`` is the index of the feature used for the split,
    ``thresh`` is the split threshold on that feature, and
    ``label`` is the most frequent label among ``data``.
    """

    def __init__(self, data=None):
        # A single constructor with an optional argument replaces the two
        # former __init__ definitions; Python keeps only the last one, so the
        # zero-argument form previously raised TypeError.
        self.left = None
        self.right = None
        self.data = data    # Feature values, including labels
        self.f_in = None    # The index of the traversal criteria
        self.thresh = None  # The value threshold of the traversal criteria
        self.label = None   # Label with the max amount

Create Decision Tree:

In [None]:
num_features = len(X_train[0])

# Append the target label as one extra trailing column so that every row
# carries its own label through each split (label lives at index num_features).
cur_data = np.column_stack(
    (X_train, np.asarray(tr_y[:len(X_train)]).astype(int))
)

root = Node(cur_data)
cur_node = root

q = queue.Queue()  # FIFO of nodes still to be processed (breadth-first growth)
q.put(cur_node)


# # # LOOP

while not q.empty():

    cur_node = q.get()
    cur_data = cur_node.data

    # Progress indicator: number of rows that reached this node.
    print(len(cur_data))

    # Nothing reached this node: leave it as an empty leaf.
    if len(cur_data) == 0:
        continue

    cur_label = cur_data[:, num_features].astype(int)
    temp_count = np.bincount(cur_label)

    # Assign the majority label to the current node
    cur_node.label = np.argmax(temp_count)

    # If all rows share one label the node is pure: stop splitting here.
    if np.count_nonzero(temp_count) == 1:
        continue

    # We want the (feature, threshold) pair that maximizes information gain,
    # i.e. minimizes the weighted average entropy of the two children.
    split_feat_in = None
    split_thresh = None
    min_ent = np.inf

    # i = current feature index
    for i in range(num_features):
        feat_col = cur_data[:, i]

        # Candidate thresholds: every distinct value except the largest one
        # (splitting at the maximum would leave the right child empty).
        for cur_thresh in np.unique(feat_col)[:-1]:

            # Rows with value <= threshold go left, the rest go right.
            goes_left = feat_col <= cur_thresh

            count1 = np.bincount(cur_label[goes_left])
            count2 = np.bincount(cur_label[~goes_left])

            num_labels_1 = np.sum(count1)
            num_labels_2 = np.sum(count2)
            num_total = num_labels_1 + num_labels_2

            w_avg_ent = (
                (num_labels_1 / num_total) * get_entropy(count1)
                + (num_labels_2 / num_total) * get_entropy(count2)
            )

            # Strict "<" keeps the first best candidate on ties.
            if w_avg_ent < min_ent:
                min_ent = w_avg_ent
                split_feat_in = i
                split_thresh = cur_thresh

    # No feature has two distinct values (identical rows with conflicting
    # labels): there is no valid split. Keep the node as a leaf; splitting
    # anyway would send every row to one child and re-queue the same data
    # forever.
    if split_feat_in is None:
        continue

    # Split data into two
    goes_left = cur_data[:, split_feat_in] <= split_thresh
    data_left = cur_data[goes_left]
    data_right = cur_data[~goes_left]

    # Update data structure
    cur_node.thresh = split_thresh
    cur_node.f_in = split_feat_in

    cur_node.left = Node(data_left)
    cur_node.right = Node(data_right)

    q.put(cur_node.left)
    q.put(cur_node.right)

124848


KeyboardInterrupt: 

Testing

In [None]:
# The stop-word set does not change between phrases, so build it once
# instead of re-reading the corpus on every iteration.
stops = set(stopwords.words('english'))

# Apply the same cleaning as for the training phrases. Read the phrases from
# the test_data DataFrame: `test` was rebound to a flattened NumPy array
# earlier, which cannot be indexed by the column name 'Phrase'.
test_reviews = []
for raw in tqdm(test_data['Phrase']):
    text = BeautifulSoup(raw).get_text()            # strip any markup
    only_text = re.sub('[^a-zA-Z]', ' ', text)      # keep letters only
    words = word_tokenize(only_text.lower())
    non_stopwords = [word for word in words if word not in stops]
    lemma_words = [lemmatizer.lemmatize(word) for word in non_stopwords]
    test_reviews.append(lemma_words)