In [1]:
import pandas as pd
import numpy as np

app_id = 1190460
data_path = 'data/'

positive_reviews = pd.read_csv(data_path + 'review_%d_positive_all.csv' % app_id)
negative_reviews = pd.read_csv(data_path + 'review_%d_negative_all.csv' % app_id)

# Unlabeled reviews which we will evaluate - a sample extracted from the most recent reviews
evaluation_data = pd.read_csv(data_path + 'review_%d_all_recent.csv' % app_id)

summary = pd.read_csv(data_path + 'summary_%d.csv' % app_id)

In [2]:
from enum import Enum
class Sentiment(Enum):
    POSITIVE = 'positive'
    NEGATIVE = 'negative'
    NEUTRAL = 'neutral'

numeric_features = [
    'votes_up',
    'votes_funny',
    'weighted_vote_score',
    'author_num_games_owned',
    'author_playtime_at_review',
]

# Label our datasets
positive_reviews['label'] = Sentiment.POSITIVE
negative_reviews['label'] = Sentiment.NEGATIVE

# Create the batch of training data
training_data = positive_reviews.append(negative_reviews)

In [3]:
def gini_index(groups, classes):
    total_count = float(sum([len(group) for group in groups]))
    gini = 0.0

    for group in groups:
        if len(group) == 0:
            continue

        size = float(len(group))
        score = 0.0

        for value in classes:
            proportion = [row.label for row in group.itertuples()].count(value) / size
            score += proportion * proportion

        # weight the group score by its relative size
        gini += (1.0 - score) * (size / total_count)

    return gini

# perfectly balanced, so should give a gini index of 0
gini_index([positive_reviews, negative_reviews], Sentiment)

0.0

In [4]:
def split(data, attribute, value):
    left, right = [], []
    for index, row in data.iterrows():
        if row[attribute] < value:
            left.append(index)
        else:
            right.append(index)

    return left, right

# l, r = split(training_data, 'author_playtime_forever', 240)

In [5]:
def get_subset(indices, data):
    """Gets a subset of the Pandas DataFrame in data, based on
    the indices."""
    if not indices:
        return pd.DataFrame()

    if data.empty:
        return pd.DataFrame()

    try:
        return data.iloc[indices, :]
    except:
        print('indices:', indices)
        return pd.DataFrame()

def get_best_split(data):
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for attribute_name in numeric_features:
        for _, row in data.iterrows():
            # get a list of list of indices
            groups = split(data, attribute_name, row[attribute_name])

            gini = gini_index(
                # repopulate the list using the indices and the data set
                [get_subset(group, data) for group in groups],
                Sentiment
            )

            # if we found a better gini index, checkpoint all the best values
            if gini < b_score:
                b_index, b_value, b_score, b_groups = attribute_name, row[attribute_name], gini, groups

    return {
        'index': b_index,
        'value': b_value,
        'groups': b_groups,
    }

# get_best_split(training_data)

In [7]:
def to_terminal(indices, training_data):
    group = get_subset(indices, training_data)
    labels = [row['label'] for _, row in group.iterrows()]
    return max(set(labels), key=labels.count)

to_terminal([1, 2, 3], training_data)

<Sentiment.POSITIVE: 'positive'>

In [8]:
def tree_split(node, max_depth, min_size, depth, training_data):
    left, right = node['groups']
    del node['groups']

    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right, training_data)
        return

    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left, training_data), to_terminal(right, training_data)
        return

    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left, training_data)
    else:
        node['left'] = get_best_split(
            get_subset(left, training_data)
        )
        tree_split(node['left'], max_depth, min_size, depth + 1, training_data)

    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right, training_data)
    else:
        node['right'] = get_best_split(
            get_subset(right, training_data)
        )
        tree_split(node['right'], max_depth, min_size, depth + 1, training_data)

def build_tree(training_data, max_depth, min_size):
    root = get_best_split(training_data)
    tree_split(root, max_depth, min_size, 1, training_data)
    return root

root = build_tree(training_data, 10, 1)
root

indices: [38, 39, 40]
indices: [37, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 1, 2, 3]
indices: [37, 38, 39, 40, 2, 3]
indices: [37, 38, 39, 40]
indices: [37, 38, 39, 40, 2]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 1, 2, 3]
indices: [37, 38, 39, 40, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 2]
indices: [38, 39, 40]
indices: [37, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 0, 1, 2, 3]
indices: [37, 38, 39, 40, 1, 2, 3]
indices: [37, 38, 39, 40, 2, 3]
indices: [37, 38, 39, 40, 3]
indices: [37, 38, 39, 40]
indices: [38, 39, 40, 1, 2, 3]
indices: [37, 0]
indices: [37, 38, 39, 40, 0, 3]
indices: [38, 1, 2]
indices: [37, 39, 40, 0, 3]
indices: [38, 39, 

indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86

indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 20, 23, 24, 25, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [12, 13, 14, 15, 17, 23, 24, 31, 32, 44, 66, 74, 76, 81, 85, 86, 92, 12, 13, 14, 15, 17, 23, 24, 31, 32, 44, 66, 74, 76, 81, 85, 86, 92]
indices: [16, 18, 20, 25, 63, 67, 91, 16, 18, 

indices: [12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92, 12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 86, 91, 92]
indices: [20, 23, 25, 76, 86, 92, 20, 23, 25, 76, 86, 92]
indices: [12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 81, 85, 91, 12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 81, 85, 91]
indices: [20, 23, 25, 86, 20, 23, 25, 86]
indices: [12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 91, 92, 12, 13, 14, 15, 16, 17, 18, 24, 31, 32, 44, 63, 66, 67, 74, 76, 81, 85, 91, 92]
indices: [13, 15, 17, 18, 20, 23, 25, 31, 32, 66, 74, 76, 86, 91, 92, 13, 15, 17, 18, 20, 23, 25, 31, 32, 66, 74, 76, 86, 91, 92]
indices: [12, 14, 16, 24, 44, 63, 67, 81, 85, 12, 14, 16, 24, 44, 63, 67, 81, 85]
indices: [15, 17, 18, 20, 23, 25, 31, 32, 66, 74, 76, 86, 91, 92, 15, 17, 18, 20, 23, 25, 31, 32, 66, 74, 76, 86, 91, 92]
indices: [12, 13, 14, 16, 24, 44, 63, 67, 81, 85, 12, 13, 14, 16

indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [9, 21, 37, 46, 54, 60, 79, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 21, 37, 46, 54, 60, 79, 7, 9, 21, 46, 54, 60, 79]
indices: [7, 9, 

indices: [8, 10, 11, 89, 8, 10, 11, 89]
indices: [6, 19, 34, 59, 6, 19, 34, 59]
indices: [8, 10, 11, 19, 89, 8, 10, 11, 19, 89]
indices: [6, 34, 59, 6, 34, 59]
indices: [8, 10, 11, 19, 34, 89, 8, 10, 11, 19, 34, 89]
indices: [6, 59, 6, 59]
indices: [6, 10, 19, 34, 59, 89, 6, 10, 19, 34, 59, 89]
indices: [10, 11, 19, 34, 89, 10, 11, 19, 34, 89]
indices: [6, 8, 59, 6, 8, 59]
indices: [6, 10, 11, 19, 34, 89, 6, 10, 11, 19, 34, 89]
indices: [8, 59, 8, 59]
indices: [6, 8, 10, 11, 19, 34, 59, 89, 6, 8, 10, 11, 19, 34, 59, 89]
indices: [10, 19, 34, 89, 10, 19, 34, 89]
indices: [6, 8, 11, 59, 6, 8, 11, 59]
indices: [10, 89, 10, 89]
indices: [6, 8, 11, 19, 34, 59, 6, 8, 11, 19, 34, 59]
indices: [10, 19, 89, 10, 19, 89]
indices: [6, 8, 11, 34, 59, 6, 8, 11, 34, 59]
indices: [6, 8, 10, 11, 19, 34, 89, 6, 8, 10, 11, 19, 34, 89]
indices: [59, 59]
indices: [6, 8, 11, 19, 34, 59, 89, 6, 8, 11, 19, 34, 59, 89]
indices: [10, 11, 19, 34, 89, 10, 11, 19, 34, 89]
indices: [6, 8, 59, 6, 8, 59]
indices: [6,

{'index': 'weighted_vote_score',
 'value': 0.5018867850303649,
 'left': {'index': 'votes_up',
  'value': 4,
  'left': {'index': 'votes_up',
   'value': 2,
   'left': <Sentiment.POSITIVE: 'positive'>,
   'right': <Sentiment.POSITIVE: 'positive'>},
  'right': {'index': 'votes_up',
   'value': 4,
   'left': <Sentiment.POSITIVE: 'positive'>,
   'right': <Sentiment.POSITIVE: 'positive'>}},
 'right': {'index': 'votes_up',
  'value': 87,
  'left': {'index': 'votes_up',
   'value': 43,
   'left': {'index': 'votes_up',
    'value': 40,
    'left': {'index': 'votes_up',
     'value': 36,
     'left': {'index': 'votes_up',
      'value': 7,
      'left': {'index': 'votes_up',
       'value': 5,
       'left': {'index': 'votes_up',
        'value': 4,
        'left': {'index': 'votes_up',
         'value': 3,
         'left': {'index': 'votes_up',
          'value': 2,
          'left': <Sentiment.POSITIVE: 'positive'>,
          'right': <Sentiment.POSITIVE: 'positive'>},
         'right': {'inde

In [15]:
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X.%s < %.3f]' % (depth * ' ', node['index'], node['value']))
        print_tree(node['left'], depth + 1)
        print_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % ((depth *' ', node)))

print_tree(root)

[X.weighted_vote_score < 0.502]
 [X.votes_up < 4.000]
  [X.votes_up < 2.000]
   [Sentiment.POSITIVE]
   [Sentiment.POSITIVE]
  [X.votes_up < 4.000]
   [Sentiment.POSITIVE]
   [Sentiment.POSITIVE]
 [X.votes_up < 87.000]
  [X.votes_up < 43.000]
   [X.votes_up < 40.000]
    [X.votes_up < 36.000]
     [X.votes_up < 7.000]
      [X.votes_up < 5.000]
       [X.votes_up < 4.000]
        [X.votes_up < 3.000]
         [X.votes_up < 2.000]
          [Sentiment.POSITIVE]
          [Sentiment.POSITIVE]
         [X.votes_up < 3.000]
          [Sentiment.POSITIVE]
          [Sentiment.POSITIVE]
        [X.votes_up < 4.000]
         [Sentiment.POSITIVE]
         [Sentiment.POSITIVE]
       [X.votes_up < 5.000]
        [Sentiment.POSITIVE]
        [Sentiment.POSITIVE]
      [X.votes_up < 7.000]
       [Sentiment.POSITIVE]
       [Sentiment.POSITIVE]
     [X.votes_up < 36.000]
      [Sentiment.POSITIVE]
      [Sentiment.POSITIVE]
    [X.votes_up < 40.000]
     [Sentiment.POSITIVE]
     [Sentiment.POSIT