In [None]:
require './final_project_lib'
require './metrics.rb'
require './transformers.rb'
require './decision_trees.rb'

In [None]:
# mean
# Arithmetic mean of the numeric collection +x+.
#
# The sum is seeded with 0.0, so the result is always a Float.
# An empty collection yields NaN (0.0 / 0).
def mean(x)
  total = x.sum(0.0)
  total / x.size
end

# stdev
# Sample standard deviation of +x+ (divides by n - 1).
#
# Relies on the sibling +mean+ helper for the centre of the data.
def stdev(x)
  center = mean(x)
  squared_error = x.inject(0.0) { |acc, value| acc + (value - center) ** 2 }
  Math.sqrt(squared_error / (x.size.to_f - 1))
end

# Sparse dot product of two hash-backed vectors.
#
# Only keys present in both +x+ and +w+ contribute; the accumulator starts
# at the Integer 0, so an empty intersection returns 0.
def dot(x, w)
  x.reduce(0) do |acc, (key, value)|
    w.has_key?(key) ? acc + value * w[key] : acc
  end
end

# Euclidean (L2) length of the hash-backed vector +w+,
# computed as the square root of its dot product with itself.
def norm(w)
  squared_length = dot(w, w)
  Math.sqrt(squared_length)
end

In [None]:
require 'digest'
# Transformer that keeps every positive example and a deterministic,
# hash-selected fraction of the negative examples.
class DownsampleNegatives
  attr_reader :sampling_rate

  # sampling_rate: fraction of negatives to keep (compared against a
  # hash-derived value in [0, 1)).
  def initialize(sampling_rate)
    @sampling_rate = sampling_rate
  end

  # Nothing to learn; present so the object can be used where a #train is called.
  def train(dataset); end

  # Sets the sampling rate to (#positives / #negatives) of dataset["data"],
  # where an example is positive when its "label" is > 0.
  # With no negatives there is nothing to drop, so the rate becomes 1.0;
  # with no positives the rate becomes 0.0.
  def update_sampling_rate(dataset)
    positives, negatives = dataset["data"].partition { |e| e["label"] > 0 }
    @sampling_rate = negatives.empty? ? 1.0 : positives.size / negatives.size.to_f
  end

  # Maps an id to a reproducible value in [0, 1) with 1e-5 resolution,
  # derived from the MD5 digest of the salted id.
  def hashprob(id)
    salt = "eifjcchdivlbreckvgndlvkgdtdjnbcnjldelrgefcgt"
    (Digest::MD5.hexdigest(id.to_s + salt).to_i(16) % 100000).abs / 100000.0
  end

  # True for positives, and for negatives whose hashed id falls below the rate.
  def can_keep?(example)
    example["label"] > 0 || hashprob(example["id"]) < @sampling_rate
  end

  # Filters example_batch in place and returns it.
  #
  # keep_if is used instead of select!: select! returns nil when no element
  # is removed, which made callers that assign the result lose the batch.
  def apply(example_batch)
    example_batch.keep_if { |example| can_keep?(example) }
  end
end

In [None]:
# Rescales numeric features to the [0, 1] range observed during training.
class MinMaxTransformer
  attr_reader :mins, :maxs

  # feature_names: names of the features this transformer is responsible for.
  def initialize(feature_names)
    @mins = {}
    @maxs = {}
    @feature_names = feature_names
  end

  # Records the minimum and maximum non-nil value of every configured feature
  # found in dataset["data"]. Features whose first non-nil value is a String
  # are skipped.
  def train(dataset)
    @feature_names.each do |feature_name|
      values = dataset["data"].map { |row| row["features"][feature_name] }.compact
      next if values.first.kind_of?(String)

      @mins[feature_name] = values.min
      @maxs[feature_name] = values.max
    end
  end

  # Rescales the configured features of every example in place and returns
  # the batch. nil and String values are left untouched.
  def apply(example_batch)
    example_batch.each do |example|
      features = example["features"]
      @feature_names.each do |feature_name|
        val = features[feature_name]
        next if val.nil? || val.kind_of?(String)

        low = @mins[feature_name]
        high = @maxs[feature_name]
        # No range was learned for this feature (skipped or never seen
        # during training): leave the raw value as it is.
        next if low.nil? || high.nil?

        # Force float arithmetic so Integer features are not truncated by
        # integer division.
        range = (high - low).to_f
        # A constant feature has no spread; map it to 0.0 rather than
        # dividing by zero.
        features[feature_name] = range.zero? ? 0.0 : (val - low) / range
      end
    end
    example_batch
  end
end

# Replaces missing (nil) feature values with the mean observed in training.
class MeanImputation
  attr_reader :means

  # feature_names: names of the features to impute.
  def initialize(feature_names)
    @means = {}
    @feature_names = feature_names
  end

  # Learns the mean of the non-nil values of every configured feature in
  # dataset["data"]. A feature is skipped when its first non-nil value is
  # not Numeric (which includes the case of no non-nil values at all).
  def train(dataset)
    @feature_names.each do |feature_name|
      observed = dataset["data"].map { |row| row["features"][feature_name] }.compact
      next unless observed.first.is_a?(Numeric)

      @means[feature_name] = mean(observed)
    end
  end

  # Fills nil (or absent) configured features of each example in place with
  # the learned mean and returns the batch. Features without a learned mean
  # are set to nil.
  def apply(example_batch)
    example_batch.each do |example|
      features = example["features"]
      @feature_names.each do |feature_name|
        features[feature_name] = @means[feature_name] if features[feature_name].nil?
      end
    end
    example_batch
  end
end



In [None]:
# Chains several transformers so they can be trained and applied as one.
class FeatureTransformPipeline
  # transformers: objects responding to #train(dataset) and #apply(batch),
  # applied in the order given.
  def initialize(*transformers)
    @transformers = transformers
  end

  # Trains every transformer on the same dataset, in order.
  def train(dataset)
    @transformers.each { |transform| transform.train(dataset) }
  end

  # Feeds the batch through every transformer in order, passing each
  # transformer the output of the previous one, and returns the final batch.
  #
  # A transformer that returns nil is treated as having worked in place, so
  # the batch it was given is carried forward instead of nil.
  def apply(example_batch)
    @transformers.inject(example_batch) do |batch, transform|
      transform.apply(batch) || batch
    end
  end
end



In [None]:
#  class AUCMetric
# Interface for evaluation metrics: #apply takes the scored examples and
# returns the metric value. The default implementation returns nil.
module Metric
  def apply scores
  end
end

# Area under the ROC curve.
#
# `scores` is an array of pairs whose first element is the predicted score
# and whose last element is the label (> 0 means positive; the totals below
# assume labels are 0 or 1).
class AUCMetric
  include Metric

  # Builds the ROC curve by sweeping the score threshold from high to low.
  #
  # Examples sharing the same score are consumed together as a single step.
  # Stepping through them one at a time would make the curve, and therefore
  # the area, depend on the arbitrary order of tied examples.
  #
  # Returns [fp_rates, tp_rates, auc]; both rate arrays start at 0.0 and get
  # one point per distinct score. The area uses the trapezoid rule.
  # If either class is absent the rates are divisions by 0.0 (NaN/Infinity).
  def roc_curve(scores)
    fp_rates = [0.0]
    tp_rates = [0.0]
    auc = 0.0

    np = scores.inject(0.0) { |u, s| u + s.last }
    nn = scores.inject(0.0) { |u, s| u + (1 - s.last) }

    ni_p = 0
    ni_n = 0
    scores.group_by(&:first).sort_by { |score, _| -score }.each do |_, tied|
      positives = tied.count { |s| s.last > 0 }
      ni_p += positives
      ni_n += tied.size - positives

      fpr = ni_n / nn
      tpr = ni_p / np
      auc += 0.5 * (tpr + tp_rates.last) * (fpr - fp_rates.last)
      fp_rates << fpr
      tp_rates << tpr
    end

    [fp_rates, tp_rates, auc]
  end

  # Returns only the area under the ROC curve for `scores`.
  def apply scores
    _fp, _tp, auc = roc_curve scores
    auc
  end
end

# cross_validate
# Splits dataset["data"] into `folds` contiguous folds and yields
# (train_data, test_data, fold_index) once per fold.
#
# Each yielded dataset is a shallow clone of `dataset` with its "data" entry
# replaced, so the caller's dataset keeps its full example list. The fold
# size is examples.size / folds (integer division); any leftover examples at
# the end never appear in a test fold and are always part of the training
# portion. Returns the value of `folds.times`.
def cross_validate dataset, folds, &block
  all_examples = dataset["data"]
  per_fold = all_examples.size / folds

  folds.times do |fold|
    first = fold * per_fold
    past_last = first + per_fold

    # Everything outside [first, past_last) is used for training.
    training = dataset.clone
    training["data"] = all_examples[0, first] + all_examples[past_last..-1]

    # The fold itself is held out for testing.
    held_out = dataset.clone
    held_out["data"] = all_examples[first, per_fold]

    yield training, held_out, fold
  end
end



In [None]:
# Relative frequency of every label among the given examples.
#
# `dataset` is an enumerable of examples, each exposing its class under
# "label". Returns a Hash mapping label => fraction of examples (Float);
# labels that never occur read as 0.0 through the hash default.
def class_distribution(dataset)
  shares = Hash.new(0.0)
  dataset.each { |example| shares[example["label"]] += 1 }

  example_count = shares.each_value.sum(0.0)
  shares.transform_values! { |count| count / example_count }
  shares
end

# Shannon entropy (natural logarithm) of a class distribution.
#
# `dist` maps class => probability. A class with probability 0 contributes
# nothing to the sum (the 0 * log 0 = 0 convention) and is skipped, so it
# never zeroes out the entropy contributed by the other classes.
# Returns a Float; an empty distribution has entropy 0.0.
def entropy dist
  dist.each_value.reduce(0.0) do |total, probability|
    probability == 0 ? total : total - probability * Math.log(probability)
  end
end

# Information gain of a candidate partition.
#
# h0 is the entropy before splitting; `splits` maps a path name to the
# examples that follow that path. The result is h0 minus the entropy of
# each partition weighted by its share of all partitioned examples.
def information_gain(h0, splits)
  partitioned = splits.each_value.sum(0.0) { |part| part.size }

  splits.each_value.reduce(h0) do |gain, part|
    part_entropy = entropy(class_distribution(part))
    gain - part.size / partitioned * part_entropy
  end
end

# Draws `num_features` distinct feature names from dataset["features"],
# using `rng` as the source of randomness.
def random_features_subset(dataset, rng, num_features)
  available = dataset["features"]
  available.sample(num_features, random: rng)
end

# Builds the training set for one tree of a random forest.
#
# A random subset of `num_features` features is chosen first; then as many
# examples as the original dataset holds are drawn with replacement, each
# copied with only the chosen features plus its "id" and "label".
# Returns a new Hash with "features" (the chosen names) and "data".
def random_forest_dataset(dataset, rng, num_features)
  chosen = random_features_subset dataset, rng, num_features
  pool = dataset["data"]

  bootstrap = Array.new(pool.size) do
    drawn = pool.sample(random: rng)
    kept = drawn["features"].select { |name, _value| chosen.include?(name) }
    {"id" => drawn["id"], "features" => kept, "label" => drawn["label"]}
  end

  {"features" => chosen, "data" => bootstrap}
end

# Decision-tree split with one branch per observed value of a feature.
class CategoricalSplit
  attr_reader :feature_name

  # fname: name of the feature this split inspects.
  def initialize(fname)
    @feature_name = fname
    @path_pattern = "%s == '%s'"
  end

  def to_s
    "Categorical[#{@feature_name}]"
  end

  # Groups `examples` by the branch they follow. Examples whose feature is
  # nil (or absent) are left out. Returns a Hash of path name => examples
  # whose default block creates an empty Array for unknown paths.
  def split_on_feature(examples)
    branches = Hash.new { |hash, path| hash[path] = Array.new }

    examples.each do |example|
      value = example["features"][@feature_name]
      next if value.nil?

      branches[format(@path_pattern, @feature_name, value)] << example
    end

    branches
  end

  # Path name the example follows, or nil when its feature value is missing.
  def test(example)
    value = example["features"][@feature_name]
    return nil if value.nil?

    format(@path_pattern, @feature_name, value)
  end
end

# Binary decision-tree split on a numeric feature: "< split_point" versus
# ">= split_point".
class NumericSplit
  attr_reader :feature_name, :split_point, :paths

  # fname: feature to inspect; value: threshold separating the two branches.
  # The threshold is rendered with two significant digits in path names.
  def initialize(fname, value)
    @feature_name = fname
    @split_point = value
    @split_point_str = "%.2g" % @split_point
    below = "#{@feature_name} < #{@split_point_str}"
    at_or_above = "#{@feature_name} >= #{@split_point_str}"
    @paths = [below, at_or_above]
  end

  def to_s
    "Numeric[#{@feature_name} <=> #{@split_point_str}]"
  end

  # Partitions `examples` between the two paths. A missing (nil) feature
  # value is treated as 0 here. Returns a Hash of path name => examples
  # whose default block creates an empty Array for unknown paths.
  def split_on_feature(examples)
    branches = Hash.new { |hash, path| hash[path] = [] }

    examples.each do |example|
      value = example["features"][@feature_name]
      value = 0 if value.nil?
      branch = value >= @split_point ? paths[1] : paths[0]
      branches[branch] << example
    end

    branches
  end

  # Path the example follows, or nil when its feature value is missing.
  def test(example)
    value = example["features"][@feature_name]
    return nil if value.nil?

    value >= @split_point ? paths[1] : paths[0]
  end
end

# Proposes CategoricalSplit candidates for String-valued features.
class CategoricalSplitter
  # True when at least one example carries `feature_name` and every example
  # that carries it holds a String under that key.
  def matches?(examples, feature_name)
    carriers = examples.select { |row| row["features"].has_key?(feature_name) }
    return false if carriers.empty?

    carriers.all? { |row| row["features"][feature_name].is_a?(String) }
  end

  # Builds the categorical split for `feature_name` and scores it against
  # `parent_entropy`. Returns nil when the feature is not categorical,
  # otherwise {"split" => CategoricalSplit, "information_gain" => Float}.
  def create_split(examples, parent_entropy, feature_name)
    return nil unless matches?(examples, feature_name)

    candidate = CategoricalSplit.new(feature_name)
    gain = information_gain(parent_entropy, candidate.split_on_feature(examples))

    {"split" => candidate, "information_gain" => gain}
  end
end

# Proposes the best NumericSplit threshold for Numeric-valued features.
class NumericSplitter
  # True when at least one example carries `feature_name` and every example
  # that carries it holds a Numeric under that key.
  def matches? examples, feature_name
    has_feature = examples.select { |r| r["features"].has_key? feature_name }
    return false if has_feature.empty?

    has_feature.all? { |r| r["features"][feature_name].is_a?(Numeric) }
  end

  # Turns per-class counts `c` into proportions of `total`, leaving out
  # classes whose proportion is 0.
  #
  # Defined once at class level: as a `def` nested inside #create_split it
  # was re-defined on the class every time #create_split ran.
  def get_class_distribution c, total
    res = Hash.new
    c.each do |k, v|
      share = v / total
      res[k] = share if share != 0
    end
    res
  end

  # Sweeps every distinct observed value of `feature_name` as a threshold
  # (left: value < t, right: value >= t) and keeps the one with the highest
  # information gain relative to `parent_entropy`.
  #
  # Examples without the feature are treated as having the value 0, both
  # when ordering and when deciding which side of a threshold they fall on.
  # This mirrors NumericSplit#split_on_feature, so the gain computed here
  # describes the partition that split will actually produce.
  #
  # Returns nil when the feature is not numeric, otherwise
  # {"split" => NumericSplit, "information_gain" => gain}. When no threshold
  # yields a positive gain the split point is 0 and the gain is 0.
  def create_split examples, parent_entropy, feature_name
    return nil unless matches?(examples, feature_name)

    thresholds = examples.map { |r| r["features"][feature_name] }.compact.uniq.sort
    # [effective value, label] pairs in ascending order of value.
    ordered = examples.map { |r| [r["features"][feature_name] || 0, r["label"]] }
                      .sort_by(&:first)

    left = Hash.new(0.0)
    right = Hash.new(0.0)
    examples.each { |row| right[row["label"]] += 1 }

    total = examples.size.to_f
    n_left = 0.0
    cursor = 0 # index of the first example still on the right-hand side

    best_val = 0
    ig_max = 0

    thresholds.each do |t|
      # Move every example that now falls below the threshold to the left.
      while cursor < ordered.size && ordered[cursor].first < t
        label = ordered[cursor].last
        left[label] += 1.0
        right[label] -= 1.0
        n_left += 1.0
        cursor += 1
      end
      n_right = total - n_left

      left_entropy = entropy(get_class_distribution(left, n_left))
      right_entropy = entropy(get_class_distribution(right, n_right))
      ig = parent_entropy - (n_left / total) * left_entropy - (n_right / total) * right_entropy

      if ig > ig_max
        best_val = t
        ig_max = ig
      end
    end

    {"split" => NumericSplit.new(feature_name, best_val), "information_gain" => ig_max}
  end
end

# One node of a decision tree. It starts as a leaf holding its examples and
# becomes an internal node once #split_node! is called.
class DecisionNode
  attr_reader :children, :examples, :split, :node_entropy, :node_class_distribution

  # examples: the training examples that reached this node. Their class
  # distribution and its entropy are computed up front.
  def initialize(examples)
    @examples = examples
    @node_class_distribution = class_distribution(examples)
    @node_entropy = entropy(@node_class_distribution)
    @children = Hash.new
  end

  # A node without children is a leaf.
  def is_leaf?
    children.empty?
  end

  # Share of this node's training examples labelled `positive_class_label`.
  def score(positive_class_label)
    @node_class_distribution[positive_class_label]
  end

  # Asks every splitter for a candidate split on every feature it matches
  # and collects the candidates that have a split and a positive
  # information gain. Features are visited in order, splitters within each.
  def all_possible_splits(feature_names, splitters)
    feature_names.each_with_object([]) do |feature_name, found|
      splitters.each do |splitter|
        next unless splitter.matches?(@examples, feature_name)

        candidate = splitter.create_split(@examples, @node_entropy, feature_name)
        next if candidate["split"].nil? || candidate["information_gain"] <= 0

        found << candidate
      end
    end
  end

  # Applies `split`: creates one child node per resulting path and then
  # releases this node's own example list (examples becomes nil).
  def split_node!(split)
    @split = split

    split.split_on_feature(@examples).each do |path, routed|
      @children[path] = DecisionNode.new(routed)
    end

    @examples = nil
  end
end

# Learner that grows a single decision tree from categorical and numeric
# splits and scores examples with the positive-class share of the leaf
# they end up in.
class DecisionTreeLearner
  include DecisionTreeHelper
  include Learner
  attr_reader :root
  attr_accessor :positive_class_label

  # positive_class_label: label whose share is reported as the score.
  # min_size:  nodes holding this many examples or fewer are not split.
  # max_depth: depth budget handed to #grow_tree for the root.
  def initialize(positive_class_label, min_size: 10, max_depth: 50)
    @splitters = [CategoricalSplitter.new, NumericSplitter.new]
    @parameters = {"min_size" => min_size, "max_depth" => max_depth}
    @positive_class_label = positive_class_label
  end

  # Builds the tree from dataset["data"], considering dataset["features"].
  def train(dataset)
    @feature_names = dataset["features"]
    @root = DecisionNode.new(dataset["data"])
    grow_tree(@root, @parameters["max_depth"])
  end

  # Recursively splits `parent` on its highest-gain candidate split.
  # Stops when the remaining depth budget reaches exactly 1, when the node
  # is at or below the minimum size, or when no candidate split exists.
  def grow_tree(parent, remaining_depth)
    return if remaining_depth == 1 || parent.examples.size <= @parameters["min_size"]

    candidates = parent.all_possible_splits(@feature_names, @splitters)
    return if candidates.empty?

    winner = candidates.max_by { |candidate| candidate["information_gain"] }
    parent.split_node!(winner["split"])

    parent.children.each_value do |child|
      grow_tree(child, remaining_depth - 1)
    end
  end

  # Score of `example`: the positive-class share at the node it reaches.
  def predict(example)
    find_leaf(@root, example).score(@positive_class_label)
  end

  # Returns one [score, label] pair per example of eval_dataset["data"],
  # where label is 1 for the positive class and 0 otherwise.
  def evaluate(eval_dataset)
    eval_dataset["data"].map do |example|
      is_positive = example["label"] == @positive_class_label
      [predict(example), is_positive ? 1 : 0]
    end
  end

  # Walks down from `node` following the path each split assigns to
  # `example`. The walk ends at a leaf, or at the last node reached when
  # the split yields a path with no matching child.
  def find_leaf(node, example)
    current = node
    until current.is_leaf?
      next_node = current.children[current.split.test(example)]
      break if next_node.nil?

      current = next_node
    end
    current
  end
end

# Learner that averages the scores of several decision trees, each trained
# on its own bootstrap sample restricted to a random subset of features.
class RandomForestLearner
  include Learner
  attr_reader :trees
  attr_accessor :positive_class_label
  attr_accessor :num_features

  # positive_class_label: label whose share is reported as the score.
  # num_trees:    number of trees in the forest.
  # min_size, max_depth: passed through to every DecisionTreeLearner.
  # num_features: number of features sampled for each tree.
  #
  # The former `tree_parameters` local was removed: it was never read, and
  # it looked up a Symbol key in a String-keyed hash.
  def initialize positive_class_label, num_trees: 10, min_size: 10, max_depth: 50, num_features: 3
    @parameters = {"num_trees" => num_trees, "min_size" => min_size, "max_depth" => max_depth}
    @positive_class_label = positive_class_label
    @num_features = num_features

    @trees = Array.new(num_trees) do
      DecisionTreeLearner.new @positive_class_label, min_size: min_size, max_depth: max_depth
    end
  end

  # JSON rendering of every tree's summarized root node.
  def to_s
    JSON.pretty_generate(@trees.collect { |t| t.summarize_node t.root })
  end

  # Trains each tree on its own random_forest_dataset sample. A single
  # generator seeded with SEED is shared by all trees, so the samples differ
  # between trees but are reproducible across runs.
  def train dataset
    rng = Random.new SEED

    @trees.each do |tree|
      tree.train(random_forest_dataset(dataset, rng, @num_features))
    end
  end

  # Returns one [score, label] pair per example of eval_dataset["data"],
  # where label is 1 for the positive class and 0 otherwise.
  def evaluate eval_dataset
    eval_dataset["data"].map do |example|
      is_positive = example["label"] == @positive_class_label
      [predict(example), is_positive ? 1 : 0]
    end
  end

  # Average of the individual tree scores for `example`.
  def predict example
    @trees.inject(0.0) do |total, tree|
      total + tree.predict(example) / @parameters["num_trees"]
    end
  end
end
#END YOUR CODE


In [None]:
# Final-project classifier: builds a dataset from the application table
# joined with aggregated installment payments, derives ratio features, and
# trains a random forest on a downsampled, mean-imputed slice of it.
class ClassifierFive
  include FinalProjectClassifier

  # |numerator| / denominator as a Float, or nil when either operand is nil
  # or truncates to zero.
  #
  # NOTE(review): the guard uses to_i, so a non-zero numerator or denominator
  # with magnitude below 1 (e.g. 0.4) is also treated as zero and yields nil.
  # Confirm whether that is intended before changing it, since it alters
  # feature values.
  def divide numerator, denominator
    return nil if numerator.to_i.zero? || denominator.to_i.zero?

    numerator.abs / denominator.to_f
  end

  # a * b as a Float, or nil when either operand is nil or truncates to zero.
  def multiply a, b
    return nil if a.to_i.zero? || b.to_i.zero?

    a.to_f * b.to_f
  end

  # Loads the dataset from `training_db` and adds the derived features
  # credit_term, annuity_income_percent, credit_income_percent and
  # days_employed_percent. days_birth and days_employed are replaced by
  # their values divided by 365.
  def create_training_dataset training_db

    features_1 = "application_train.SK_ID_CURR, application_train.TARGET, "
    features_1 += "application_train.EXT_SOURCE_1, application_train.EXT_SOURCE_2, application_train.EXT_SOURCE_3, application_train.AMT_CREDIT, application_train.AMT_ANNUITY, application_train.DAYS_BIRTH, application_train.DAYS_EMPLOYED, application_train.AMT_INCOME_TOTAL, "
    features_1 += "t1.AMT_PAYMENT_MIN_SUM"

    # Per applicant: sum over previous credits of the smallest payment made.
    sql_1 = "SELECT t.SK_ID_CURR, SUM(t.AMT_PAYMENT_MIN) as AMT_PAYMENT_MIN_SUM FROM "
    sql_1 += "(SELECT SK_ID_CURR, SK_ID_PREV, MIN(AMT_PAYMENT) as AMT_PAYMENT_MIN FROM installments_payments GROUP BY SK_ID_PREV ORDER BY SK_ID_CURR, SK_ID_PREV) t "
    sql_1 += "GROUP BY SK_ID_CURR ORDER BY SK_ID_CURR"

    sql = "SELECT %s FROM application_train LEFT JOIN (%s) t1 ON application_train.SK_ID_CURR = t1.SK_ID_CURR" % [features_1, sql_1]

    dataset = create_dataset training_db, sql
    dataset["data"].each { |example| example["bias"] = 1.0 }

    dataset["features"] += ["credit_term", "annuity_income_percent", "credit_income_percent", "days_employed_percent"]

    day_max = 365243 # outlier

    dataset["data"].each do |example|
      features = example["features"]

      # Ratios are taken from the raw amounts, before any rescaling below.
      features["credit_term"] = divide(features["amt_annuity"], features["amt_credit"])
      features["annuity_income_percent"] = divide(features["amt_annuity"], features["amt_income_total"])
      features["credit_income_percent"] = divide(features["amt_credit"], features["amt_income_total"])

      features["days_birth"] = divide(features["days_birth"], 365)

      # The outlier value is treated as missing.
      features["days_employed"] = nil if features["days_employed"] == day_max
      features["days_employed"] = divide(features["days_employed"], 365)

      # Uses the already rescaled days_employed and days_birth values.
      features["days_employed_percent"] = divide(features["days_employed"], features["days_birth"])
    end

    dataset
  end

  # The evaluation dataset is built exactly like the training dataset.
  def create_evaluation_dataset evaluation_db
    create_training_dataset evaluation_db
  end

  # Restricts `dataset` to its first 35000 examples, downsamples the
  # negatives, and returns a single random-forest learner wrapped with mean
  # imputation. Note that dataset["data"] is replaced as a side effect.
  def create_learners dataset
    dt = RandomForestLearner.new 1, num_trees: 120, min_size: 100, max_depth: 10, num_features: 4

    batch = dataset["data"][0, 35000]
    kept = DownsampleNegatives.new(0.25).apply(batch)
    # An in-place filter may return nil when it removed nothing; fall back
    # to the batch itself so dataset["data"] is never set to nil.
    dataset["data"] = kept || batch

    transformer = FeatureTransformPipeline.new(
      MeanImputation.new(%w(ext_source_1 ext_source_2 ext_source_3 days_birth days_employed amt_credit amt_annuity credit_term amt_payment_min_sum amt_income_total annuity_income_percent credit_income_percent days_employed_percent))
    )

    learner = CopyingTransformingLearner.new(transformer, dt)
    learner.name = "Random_Forest"

    [learner]
  end

end


In [None]:
# Runs the harness's basic checks (test_basics) on the given configuration.
def test_86e342(test_data)
  test_basics(test_data)
end

# Configuration reused by the remaining test cells of this file.
test_data_86e342 = {
  classifier: ClassifierFive.new,
  min_auc: 0.75,
  max_auc: 1.0,
  folds: 5
}
test_86e342(test_data_86e342)

In [None]:
# Runs the cross-validation pass and returns what the harness stored under
# :cross_validation_results in test_data.
def test_7ac09d(test_data)
  run_cross_validation_performance(test_data)
  test_data[:cross_validation_results]
end

test_7ac09d(test_data_86e342)

In [None]:
# Runs the harness's cross-validation performance check on test_data.
def test_994a073(test_data)
  test_cross_validation_performance(test_data)
end

test_994a073(test_data_86e342)

In [None]:
# Points test_data at the development database (with its size and a run
# name) and then runs the harness's evaluation-set performance check.
def test_f77e97(test_data)
  test_data[:db] = dev_db
  test_data[:db_size] = 15334
  test_data[:name] = "dev_eval"
  test_evaluation_set_performance(test_data)
end

test_f77e97(test_data_86e342)