In [2]:
require 'open-uri'
require 'json'
require 'daru'
require 'distribution'
require 'sqlite3'
require 'rbplotly'
require './assignment'

include Assignment

Object

In [3]:
dir = "#{ENV['HOME']}/cs6140/final_project"
db = SQLite3::Database.new "#{dir}/credit_risk_data.db"
db.results_as_hash = true

true

## Step 1. Data Analysis

## Step 1.1 Target 0 vs 1

In [4]:
x = []
y = []
db.execute "select target, count(*) from application_train group by target" do |row|
  x << row[0].to_s
  y << row[1]
end

def plot_top_10 db, column
  x = []
  y = []
  s = "select " + column + ", count(*) from application_train group by " + column + " order by count(*) desc limit 10"
  db.execute(s) do |row|
    x << (row[0].to_s.empty? ? "No Data" : row[0].to_s)
    y << row[1]
  end
  
  df =  Daru::DataFrame.new({x: x, y: y})
  df.to_category :x
  df.plot(type: :bar, x: :x, y: :y) do |plot, diagram|
    plot.x_label column
    plot.y_label "Frequency"
  end
end

:plot_top_10

In [5]:
plot_top_10 db, "target"

## 1.2 Best 20 features sorted by information gain

In [5]:
def pick_feature_all db, table, feature
  query = "select " + feature + ", Target label from " + table + " where " + feature + " IS NOT NULL and " + feature +
    " != '' order by " + feature + " asc limit 10000" 
  data = db.execute(query)
  return data
end

def pick_feature_non_null db, table, feature
  query = "select count(*) from " + table + " where " + feature + " IS NOT NULL and " + feature +
    " != ''" 
  data = db.execute(query)
  return data
end

:pick_feature_non_null

In [6]:
def ig_analyze db, table
  result = []
  cols = db.execute("pragma table_info(" + table + ")").select do |c|
    c[1].upcase  != "TARGET"
  end
  
  data_total_count = db.execute("select count(*) from application_train").length

  cols.each do |col|
    col_name = col["name"]
    col_type = col["type"]
    next if col_name.include? "_ID"
    
    data = pick_feature_all db, table, col_name
    data_non_null = pick_feature_non_null db, table, col_name
    frequency = data_non_null[0].length / data_total_count
    max_ig, best_v = best_ig data, col_name, col_type
    
    result << {"feature" => col_name, "type" => col_type, "max_ig" => max_ig.round(6), "frequency" => frequency.round(6)}
  end
  
  result = result.sort_by{ |r| r["max_ig"] }.reverse  
  return result
end

:ig_analyze

In [7]:
igs = ig_analyze db, "application_train"
best_20_features = igs[0, 20]
df = Daru::DataFrame.new(
  {"Feature Name" => best_20_features.map{ |r| r["feature"] }, 
    "Feature Type" => best_20_features.map{ |r| r["type"] }, 
    "Information Gain" => best_20_features.map{ |r| r["max_ig"] }, 
    "Frequency" => best_20_features.map{ |r| r["frequency"] }
    })

Unnamed: 0,Feature Name,Feature Type,Information Gain,Frequency
0,EXT_SOURCE_1,NUMERIC,0.006476,2.0
1,EXT_SOURCE_2,NUMERIC,0.005679,2.0
2,EXT_SOURCE_3,NUMERIC,0.003064,2.0
3,ORGANIZATION_TYPE,TEXT,0.002866,2.0
4,DAYS_BIRTH,NUMERIC,0.002174,2.0
5,DAYS_EMPLOYED,NUMERIC,0.001852,2.0
6,REGION_POPULATION_RELATIVE,NUMERIC,0.001676,2.0
7,DAYS_REGISTRATION,NUMERIC,0.001131,2.0
8,HOUR_APPR_PROCESS_START,INTEGER,0.000998,2.0
9,AMT_CREDIT,NUMERIC,0.000903,2.0


### 1.3 Feature aggregation and encoding by k-mean clustering

### 1.3.1 Generate a feature space with DAYS_BIRTH and DAYS_EMPLOYED

In [8]:
def plot_clusters data, features
  x1 = []
  x2 = []
  target = []
  data.each do |row|
    x1 << row["features"][features[0]]
    x2 << row["features"][features[1]]
    target << row["cluster"]
  end
  df = Daru::DataFrame.new({x1: x1, x2: x2, target: target})
  df.to_category :target
  df.plot(type: :scatter, x: :x1, y: :x2, categorized: {by: :target, method: :color}) do |plot, diagram|
    plot.x_label features[0]
    plot.y_label features[1]
  end
end

def plot_clusters_labels data, features
  x1 = []
  x2 = []
  target = []
  data.each do |row|
    x1 << row["features"][features[0]]
    x2 << row["features"][features[1]]
    target << row["label"]
  end
  df = Daru::DataFrame.new({x1: x1, x2: x2, target: target})
  df.to_category :target
  df.plot(type: :scatter, x: :x1, y: :x2, categorized: {by: :target, method: :color}) do |plot, diagram|
    plot.x_label features[0]
    plot.y_label features[1]
  end
end

:plot_clusters_labels

In [9]:
features = ["AMT_CREDIT", "DAYS_BIRTH"]

def fetch_log_data db, features, limit = false
  table = "application_train"
  query = "select " + features.join(",") + ", Target label from " + table
  query += " limit 10000" if limit
  data = []
  db.execute(query).each do |r|
    row = Hash.new
    row["features"] = Hash.new
    features.each do |f|
      row["features"][f] = Math.log((r[f]).abs)
    end
    row["label"] = r["label"]
    data << row
  end
  return data
end
1

1

In [10]:
data = fetch_log_data db, features, true
puts data.length

10000


In [11]:
# generate k centers for later clustering
def init_cluster data, k, features
  means = Hash.new {|h,k| h[k] = Hash.new {|h,k| h[k] = 0.0}}
  k.times do |i|
    features.each do |j|
      min, max = data.collect {|r| r["features"][j]}.minmax
      means[i][j] = min + rand * (max - min)
    end
  end
  return means
end

means = init_cluster data, 5, features
puts means

{0=>{"AMT_CREDIT"=>11.429920354437092, "DAYS_BIRTH"=>9.849856852947779}, 1=>{"AMT_CREDIT"=>14.44892104471365, "DAYS_BIRTH"=>9.503790221428924}, 2=>{"AMT_CREDIT"=>11.738292118137927, "DAYS_BIRTH"=>9.385069462515526}, 3=>{"AMT_CREDIT"=>13.807479017724837, "DAYS_BIRTH"=>10.126214200551214}, 4=>{"AMT_CREDIT"=>14.516429643350776, "DAYS_BIRTH"=>9.012556973479672}}


In [12]:
def assign_cluster(data, means)
  clusters = []
  data.each do |row|
    distance = [0.0]*means.length
    row["features"].each do |k, v|
      means.each.with_index do |m, i|
        distance[i] += (m[1][k] - v)**2.0
      end
    end
    min = distance.min
    row["cluster"] = distance.each_with_index.min[1]
    clusters << distance.map { |d| d > min ? 0 : 1 }
  end
  return clusters
end

z = assign_cluster(data, means)
1

1

In [13]:
plot_clusters data[0, 1000], features

In [14]:
def calculate_means z, data
  k = z.first.size
  means = Hash.new {|h,k| h[k] = Hash.new {|h,k| h[k] = 0.0}}
  total = Hash.new {|h,k| h[k] = 0.0}
  
  (0..k-1).each do |i|
    data[0]["features"].each do |k, v|
      means[i][k] = 0.0
    end
  end
  
  data.each do |row|
    row["features"].each do |k, v|
      means[row["cluster"]][k] += v
    end
    total[row["cluster"]] += 1
  end
  means.each{|k,v| v.each {|m, n| means[k][m] = n/total[k] unless total[k].zero? }}
  return means.sort.to_h
end

puts calculate_means(z, data)

{0=>{"AMT_CREDIT"=>11.624824707157149, "DAYS_BIRTH"=>9.897738916834477}, 1=>{"AMT_CREDIT"=>14.088586392299618, "DAYS_BIRTH"=>9.529418924938147}, 2=>{"AMT_CREDIT"=>12.375710289777295, "DAYS_BIRTH"=>9.53997832412168}, 3=>{"AMT_CREDIT"=>13.409463358172019, "DAYS_BIRTH"=>9.711899708625285}, 4=>{"AMT_CREDIT"=>13.801828725921416, "DAYS_BIRTH"=>9.08882984368788}}


In [15]:
def cluster_dist(m1, m2)
  sum = 0.0
  m1.each_key do |i|
    sum += m1[i].keys.reduce(0.0) do |u, k|
      u += (m1[i][k] - m2[i][k])**2
    end
  end
  return sum / m1.size
end

def k_means data, k, tol, features
  last_means = init_cluster data, k, features
  z = assign_cluster data, last_means
  
  dists = []
  1000.times do |n|
    means = calculate_means z, data
    dist = cluster_dist last_means, means
    break if dist <= tol
    last_means = means
    z = assign_cluster data, last_means
    dists << dist
  end
    
  return [dists, last_means]
end

:k_means

In [16]:
k = (Math.log data.length).floor
puts k
dists, means, z = k_means data, k, 1e-5, features
iters = Array.new(dists.size) {|i| i }

iters.shift
dists.shift

df = Daru::DataFrame.new({iters: iters, dists: dists})
df.plot(type: :line, x: :iters, y: :dists) do |plot, diagram|
  plot.x_label "X"
  plot.y_label "Mean Dist"
  diagram.title "Cluster Convergence"
  plot.legend false
end

9


In [17]:
plot_clusters data[0, 1000], features

In [18]:
plot_clusters_labels data[0, 1000], features

In [19]:
d = data.select{ |x| x["cluster"] == 4 }
puts d.select{ |x| x["label"] == 0 }.length
puts d.select{ |x| x["label"] == 1 }.length
d.length

618
27


645

In [20]:
clusters = []
(0..k-1).each do |i|
  d = data.select{ |x| x["cluster"] == i }
  good = d.select{ |x| x["label"] == 0 }.length
  bad = d.select{ |x| x["label"] == 1 }.length
  total = (good + bad) > 0 ? (good + bad) : 1
  clusters << { "cluster" => i, "ratio" => bad.to_f / total}
end

puts clusters.sort_by{ |c| c["ratio"] }.reverse

[{"cluster"=>7, "ratio"=>0.133630289532294}, {"cluster"=>1, "ratio"=>0.1086048454469507}, {"cluster"=>2, "ratio"=>0.08599033816425121}, {"cluster"=>6, "ratio"=>0.07435508345978756}, {"cluster"=>8, "ratio"=>0.07034082668600435}, {"cluster"=>0, "ratio"=>0.07}, {"cluster"=>5, "ratio"=>0.054637865311308764}, {"cluster"=>4, "ratio"=>0.04186046511627907}, {"cluster"=>3, "ratio"=>0.03900414937759336}]


In [21]:
clusters[0]

{"cluster"=>0, "ratio"=>0.07}

## Step 2. Methods

## Step 2.0 Data Preparation

In [22]:
def fetch_data db, features, limit = false
  table = "application_train"
  query = "select " + features.join(",") + ", Target label from " + table
  query += " limit 10000" if limit
  data = []
  db.execute(query).each do |r|
    row = Hash.new
    row["features"] = Hash.new
    features.each do |f|
      row["features"][f] = r[f]
    end
    row["label"] = r["label"]
    data << row
  end
  return data
end

def fetch_data_11 db, features, limit = false
  table = "application_train"
  query = "select SK_ID_CURR, " + features.join(",") + ", (AMT_CREDIT/AMT_ANNUITY) AMT_CREDIT_TO_ANNUITY_RATIO, Target label from " + table
  query += " limit 10000" if limit
  data = []
  db.execute(query).each do |r|
    row = Hash.new
    row["features"] = Hash.new
    (features + ["SK_ID_CURR", "AMT_CREDIT_TO_ANNUITY_RATIO"]).each do |f|
      row["features"][f] = r[f]
    end
    row["label"] = r["label"]
    data << row
  end
  return data
end

def fetch_data_fuzzy db
  table = "application_train"
  query = "select EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3, DAYS_BIRTH, (AMT_CREDIT/AMT_ANNUITY) as AMT_RATIO, 
Target label from " + table + " limit 11000"
  data = []
  db.execute(query).each do |r|
    row = Hash.new
    row["features"] = Hash.new
    ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3", "AMT_RATIO", "DAYS_BIRTH"].each do |f|
      row["features"][f] = r[f]
    end
    row["label"] = r["label"]
    data << row
  end
  return data
end

:fetch_data_fuzzy

In [23]:
# encode categorical features to numerical values
def encode data, features
  data_new = []
  values_map = Hash.new
  
  features.each do |f|
    values_map[f] = data.collect { |x| x["features"][f] }.uniq
  end
  
  data.each do |x|
    row = Hash.new
    row["label"] = x["label"]
    row["features"] = Hash.new
    features.each do |f|
      values = values_map[f]
      values.each do |v|
        row["features"][f.to_s + "_" +v.to_s] = (x["features"][f] == v ? 1.0 : 0.0)
      end
    end
    data_new << row
  end
  return data_new
end

:encode

In [24]:
features_logistic = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
data_logistic = fetch_data db, features_logistic
data_logistic_test = data_logistic[0, data_logistic.length/10]
data_logistic_train = data_logistic[data_logistic.length/10, data_logistic.length]
1

1

In [25]:
features_categorical = igs.select{ |f| f["type"] == "TEXT"  }.map { |f| f["feature"] }
data_categorical = fetch_data db, features_categorical, true
data_categorical_encoded = encode data_categorical, features_categorical
data_categorical_test = data_categorical_encoded[0, data_categorical.length/10]
data_categorical_train = data_categorical_encoded[data_categorical.length/10, data_categorical.length]
1

1

In [26]:
features_best_10 = best_20_features.select{ |f| f["type"] != "TEXT"  }[0, 10].map { |f| f["feature"] }
features_best_11_info = (best_20_features.select{ |f| f["type"] != "TEXT"  }[0, 10]) + [{"feature"=>"AMT_CREDIT_TO_ANNUITY_RATIO", "type"=>"NUMERIC" }]
data_best_11 = (fetch_data_11 db, features_best_10, true).shuffle
data_best_11_test = data_best_11[0, data_best_11.length/10]
data_best_11_train = data_best_11[data_best_11.length/10, data_best_11.length]
1

1

In [27]:
data_best_11[0]

{"features"=>{"EXT_SOURCE_1"=>"", "EXT_SOURCE_2"=>0.35856849883534514, "EXT_SOURCE_3"=>0.41534714488434, "DAYS_BIRTH"=>-22229, "DAYS_EMPLOYED"=>365243, "REGION_POPULATION_RELATIVE"=>0.035792000000000004, "DAYS_REGISTRATION"=>-7851, "HOUR_APPR_PROCESS_START"=>12, "AMT_CREDIT"=>1046142, "LIVINGAREA_MEDI"=>"", "SK_ID_CURR"=>110404, "AMT_CREDIT_TO_ANNUITY_RATIO"=>34}, "label"=>0}

In [28]:
  data_fuzzy = (fetch_data_fuzzy db).shuffle
  data_fuzzy_test = data_fuzzy[0, data_fuzzy.length/10]
  data_fuzzy_train = data_fuzzy[0, data_fuzzy.length]
1

1

## Step 2.1 Baseline methods

In [29]:
class StochasticGradientDescent
  attr_reader :weights
  attr_reader :model
  attr_reader :eta_z
  
  def initialize model, w, eta_z = 0.01
    @weights = w
    @model = model
    @eta_z = eta_z
  end
  
  # implemented with tolerence
  def update data, b_size, loss_tol = 0.002, max_iter = 30
    eta = @eta_z
    w = @weights
    
    iters = []
    rmses = []
    rmses_cum = []
    norms = []
    rm_prev = 0.0
    diff = 100.0
    
    (1..max_iter).each do |j|
      r = data.shuffle.each_slice(b_size).to_a
      (0..r.length-1).each do |i|
        dw = @model.grad r[i], w
        w = update_weights w, dw, eta
        @model.adjust w
        rm = model.func r[i], w
        iter = i + 1 + (j-1)*r.length
        
        rmses << rm/r[i].length
        rmses_cum << rmses.sum/rmses.length
        iters << iter
        norms << norm(w)

        diff = rm_prev - rmses_cum.last
        rm_prev = rmses_cum.last
        
        eta = @eta_z/Math.sqrt(iter)
      end
      if j > 1
        break if diff.abs <= loss_tol
      end
    end
    
    return [iters, rmses, rmses_cum, norms, w, data]
  end
end

:update

## 2.1.1 Logistic Regression on EXT_*

In [30]:
class LogisticRegressionModel
  def func dataset, w
    e = Math::E
    u = 0.0
    dataset.each do |row|
      y = row["label"]
      z = dot row, w
      u += Math.log(1.0 + e**(-z)) + z * (1.0 - y)
    end
    return u
  end
  
  def grad dataset, w
    e = Math::E
    g = Hash.new {|h,k| h[k] = 0.0}
    dataset.each do |row|
      y = row["label"]
      z = dot row, w
      row["features"].each do |k, v|
        next if v == "" or v.nil?
        g[k] += v * (1.0/(1.0 + e**(-z)) - y)
      end
    end
    return g
  end
  
  # For prediction, simply calculate the logistic function
  def predict row, w
    e = Math::E
    z = dot row, w
    y = (1.0/(1.0 + e**(-z))) > 0.5 ? 1 : 0
    return { y => (1.0/(1.0 + e**(-z)))}  
  end
  
  ## Adjusts the parameter to be within the allowable range
  def adjust w
  end
end

:adjust

In [31]:
model = LogisticRegressionModel.new

# Initialize SGD
w = Hash.new {|h,k| h[k] = 1}
s = StochasticGradientDescent.new model, w, 0.1
iters, lik, lik_cum, norms, w, data = s.update data_logistic_train, 1000

# lik is based on the average func
plot(iters, lik)

In [32]:
predictions = data_logistic_test.collect {|row| model.predict row, w}
scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores.select.with_index {|x, i| i % 100 == 0}
1

1

In [33]:
fprs, tprs = roc_curve data_logistic_test, predictions, scores_limited
plot(fprs, tprs)

In [34]:
auc fprs, tprs 

0.693136767428845

## 2.1.2 Mutlinomial Naive Bayes Classifier on all categorical features

In [35]:
class NaiveBayesModel
  # Negative log likelihood
  def func dataset, w
    # suppose p_ck follows multinomial, ignore the constant gamma header for multi-combination coefficient
    u = 0.0
    dataset.each do |r|
      s = r["features"]
      k = r["label"]
      u -= Math.log(w[k])
      s.keys.each do |j|
        p_jk = j.to_s + "_" + k.to_s
        u -= s[j] * Math.log(w[p_jk])
      end
    end
    return u
  end
  
  def grad dataset, w
    g = Hash.new {|h,k| h[k] = 0.0}
    
    dataset.each do |r|
      s = r["features"]
      k = r["label"]
      g[k] -= 1 / w[k]
      s.keys.each do |j|
        p_jk = j.to_s + "_" + k.to_s
        g[p_jk] -= s[j] / w[p_jk]
      end
    end
    
    return g
  end
  
  def predict row, w
    s = Hash.new {|h,k| h[k] = 0.0}

    ["0", "1"].each do |k|
      s[k] += Math.log(w[k])
      row["features"].keys.each do |j|
        p_jk = j.to_s + "_" + k.to_s
        s[k] += row["features"][j] * Math.log(w[p_jk])
      end
    end
    max = s.values.max
    return { s.key(max) => max }
  end
  
  def adjust w
    w.each_key do |fname|
      w[fname] = [[0.001, w[fname]].max, 0.999].min
    end
  end
end

:adjust

In [36]:
model = NaiveBayesModel.new

# Initialize SGD
w = Hash.new {|h,k| h[k] = 0.5}
s = StochasticGradientDescent.new model, w, 1e-3
iters, lik, lik_cum, norms, w, data = s.update data_categorical_train, 1000

# lik is based on the average func
plot(iters, lik)

In [37]:
predictions = data_categorical_test.collect {|row| model.predict row, w}
scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores.select.with_index {|x, i| i % 1 == 0}
1

1

In [38]:
fprs, tprs = roc_curve data_categorical_test, predictions, scores_limited
plot(fprs, tprs)

In [39]:
auc fprs, tprs 

0.485706605222734

## 2.1.3 Dummy Classifier predicts 1 always

In [40]:
class SimpleOneModel
  def train data
  end
  
  def predict row
    return { 1 => 1 }
  end
end

:predict

In [41]:
model = SimpleOneModel.new
model.train data_logistic_train
predictions = data_logistic_test.collect {|row| model.predict row}
scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores

[1]

In [42]:
fprs, tprs = roc_curve data_logistic_test, predictions, scores_limited
plot(fprs, tprs)

In [43]:
auc fprs, tprs 

0.5

## 2.1.4 Random Classifier predicts label randomly based on class distribution

In [44]:
class SimpleRatioModel
  @mu
  def train data
    @mu = data.map { |x| x["label"]==1 ? 1 : 0 }.sum/data.length.to_f
  end
  
  def predict row
    random = rand(10000)
    vote = random <= @mu*10000 ? 1 : 0
    return { vote => random.to_f/10000 }
  end
end

:predict

In [45]:
model = SimpleRatioModel.new
model.train data_logistic_train
predictions = data_logistic_test.collect {|row| model.predict row}
scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores.select.with_index {|x, i| i % 100 == 0}
1

1

In [46]:
fprs, tprs = roc_curve data_logistic_test, predictions, scores_limited
plot(fprs, tprs)

## 2.1.5 AdaBoost Decision Stamp

In [47]:
class RandomSplitWeakClassifier
  @splits = []
  @f_name = ""
  @f_type = ""
  @split_v = 0.0
  
  def initialize
  end
  
  def train data, f_name, f_type    
    groups = data.group_by{|h| h["features"][f_name] }
    # Use middle groups to provide better results
    i = 0
    v = 0
    index = 0
    if f_type == "TEXT"
      index = rand(groups.keys.length)
    else
      index = 0.25 * groups.keys.length + 0.5 * rand(groups.keys.length)
    end
    
    groups.each do |p|
      v = p[0]
      i += 1
      break if i>index
    end
    
    @f_name = f_name
    @f_type = f_type
    @split_v = v
  end
  
  def predict data
    left_votes = 1
    right_votes = 0
    errors = {}
    
    # Assume data is sorted in asc
    if @f_type == "TEXT"
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] == @split_v ? left_votes : right_votes
        errors[x["features"]["SK_ID_CURR"]] = x["label"] == votes ? 0.0 : 1.0
      end
    else
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] != "" && x["features"][@f_name] <= @split_v ? left_votes : right_votes
        errors[x["features"]["SK_ID_CURR"]] = x["label"] == votes ? 0.0 : 1.0
      end
    end
    
    return errors
  end
  
  def predict_outputs data
    left_votes = 1
    right_votes = 0
    outputs = {}
    
    # Assume data is sorted in asc
    if @f_type == "TEXT"
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] == @split_v ? left_votes : right_votes
        outputs[x["features"]["SK_ID_CURR"]] = votes == 1 ? 1.0 : -1.0
      end
    else
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] != "" && x["features"][@f_name].to_f <= @split_v.to_f ? left_votes : right_votes
        outputs[x["features"]["SK_ID_CURR"]] = votes == 1 ? 1.0 : -1.0
      end
    end
    return outputs
  end
end

:predict_outputs

In [48]:
def eplison_from_error errors, w
  return errors.keys.inject(0.0) do |u, k|
    u += errors[k] * w[k]
  end
end

def update_weights w, beta, ys, outputs
  wt = Hash.new
  ys.keys.each do |k|
    wt[k] = w[k] * Math.exp(-beta * ys[k] * outputs[k])
  end
  zt = 1.0 / wt.values.sum
  wt.each { |k, v| wt[k] = v*zt }
  return wt
end

def wy_init db, data
  w = Hash.new
  ys = Hash.new
  data.each do |row|
    ys[row["features"]["SK_ID_CURR"]] = row["label"] == 1 ? 1.0 : -1.0
    w[row["features"]["SK_ID_CURR"]] = 1.0 / 10000
  end
  return w, ys
end

:wy_init

In [49]:
et = 0.0
et_prev = 0.0
et_cum = 0.0
max_iter = 20
loss_tol = 1e-4
iters = []
lik = []
weak_cls = []
outputs = []
betas = []
datas = []

features = features_best_11_info
data_total_count = 10000
w, ys = wy_init db, data_best_11
  
(1..max_iter).each do |i|
  # randomly pick a feature
  f = features[rand features.length]
  f_name = f["feature"]
  f_type = f["type"]
  
  classifier = RandomSplitWeakClassifier.new
  data = data_best_11
  datas << data
  classifier.train data, f_name, f_type
  weak_cls << classifier
  
  errors = classifier.predict data
  outputs = classifier.predict_outputs data

  # calculate the actual loss and beta
  et = eplison_from_error errors, w
  beta = Math.log((1-et).to_f/et)
  betas << beta
  w = update_weights w, beta, ys, outputs
  
  puts et
  et_cum += et
  iters << i
  lik << et_cum / i
  if i > 1
    break if (lik.last - lik[lik.length-2]).abs <= loss_tol
    et_prev = et
  end
end
1

0.19679999999999465
0.337569359958536
0.48392375503839563
0.49395178911934606
0.47973824908737367
0.7665845058374015
0.44589604621252643
0.6106663166520602
0.4445528310356197
0.4985411083227288
0.52001768900257
0.4853849368944955
0.5087376303578641
0.5406797096196473
0.5210713585805204
0.6200655996155899
0.525522344879113
0.375519602796585
0.39718942643902067
0.46543127618842517


1

In [50]:
# now it's the time to do actual predictions
predictions = []

(0..data_best_11.length-1).each do |j|
  total = 0.0
  weak_cls.each.with_index do |model, i|
    p = model.predict_outputs [datas[i][j]]
    total += p.values[0] * betas[i]
  end
  predictions << { "something" => total }
end

scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores.select.with_index {|x, i| i % 10 == 0}
1

1

In [51]:
predictions.length

10000

In [52]:
fprs, tprs = roc_curve data_best_11, predictions, scores_limited
puts fprs[0, 10]
puts tprs[0, 10]
plot(fprs, tprs)

[0.0, 0.0, 0.000975609756097561, 0.002818428184281843, 0.004336043360433604, 0.005420054200542005, 0.006720867208672087, 0.008672086720867209, 0.01084010840108401, 0.013333333333333334]
[0.0, 0.0, 0.003870967741935484, 0.005161290322580645, 0.005161290322580645, 0.005161290322580645, 0.0064516129032258064, 0.007741935483870968, 0.01032258064516129, 0.01806451612903226]


In [53]:
auc fprs, tprs

0.5460943788792727

## 2.1.6 AdaBoost Decision Tree

In [None]:
class DecisionWeakClassifier
  @splits = []
  @f_name = ""
  @f_type = ""
  @split_v = 0.0
  
  def initialize
  end
  
  def train data, f_name, f_type    
    groups = data.group_by{|h| h["features"][f_name] }
    # Use middle groups to provide better results
    i = 0
    v = 0
    index = 0
    if f_type == "TEXT"
      index = rand(groups.keys.length)
    else
      index = 0.25 * groups.keys.length + 0.5 * rand(groups.keys.length)
    end
    
    groups.each do |p|
      v = p[0]
      i += 1
      break if i>index
    end
    
    @f_name = f_name
    @f_type = f_type
    @split_v = v
  end
  
  def predict data
    left_votes = 1
    right_votes = 0
    errors = {}
    
    # Assume data is sorted in asc
    if @f_type == "TEXT"
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] == @split_v ? left_votes : right_votes
        errors[x["features"]["SK_ID_CURR"]] = x["label"] == votes ? 0.0 : 1.0
      end
    else
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] != "" && x["features"][@f_name] <= @split_v ? left_votes : right_votes
        errors[x["features"]["SK_ID_CURR"]] = x["label"] == votes ? 0.0 : 1.0
      end
    end
    
    return errors
  end
  
  def predict_outputs data
    left_votes = 1
    right_votes = 0
    outputs = {}
    
    # Assume data is sorted in asc
    if @f_type == "TEXT"
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] == @split_v ? left_votes : right_votes
        outputs[x["features"]["SK_ID_CURR"]] = votes == 1 ? 1.0 : -1.0
      end
    else
      data.each.with_index do |x, i|
        votes = x["features"][@f_name] != "" && x["features"][@f_name].to_f <= @split_v.to_f ? left_votes : right_votes
        outputs[x["features"]["SK_ID_CURR"]] = votes == 1 ? 1.0 : -1.0
      end
    end
    return outputs
  end
end

## 2.1.7 IVFS ARC

In [54]:
# return the minimum support in order to generate a rule
def min_supp data, label, minsups
  x_class = parse_class data
  return minsups[label] * x_class[label].to_f / x_class.values.sum
end

:min_supp

In [55]:
# calculate the minimum support
minsups = [0.1, 0.12]
min_supps = Hash.new
min_supps[0] = min_supp data, 0, minsups
min_supps[1] = min_supp data, 1, minsups
puts min_supps

{0=>0.09225, 1=>0.0093}


In [56]:
# Generate the n-label linguistic fuzzy sets based on all possible features
def feature_fuzzy_sets data, n
  fs = []
  h = data[0]["features"].keys
  h.each do |f|
    scores = data.collect {|x| x["features"][f]}.select{ |x| x && x!= "" }.
      sort.uniq
    # 5-label fuzzy set
    (0..(n-1)).each do |i|
      max = scores.max
      min = scores.min
      # since min is 0 and max is 1, there will be only 4 gaps
      base = (max - min).to_f / (n-1)
      r = Hash.new
      r["feature"] = f
      r["set"] = i
      r["mid"] = min + base * i
      r["base"] = base * 2
      fs << r
    end
  end
  return fs
end

:feature_fuzzy_sets

In [57]:
def support data, antes, cl, mu_table
  u = 0.0
  data.each do |x|
    next if x["label"] != cl
    mus = []
    antes.each do |ante|
      v = x["features"][ante["feature"]]
      next if v.nil? or v == ""
      header = ante["feature"].to_s + "_" + ante["set"].to_s
      mu = mu_table[header][v]
      mus << mu
      # skip some iterations if we found one mu as 0.
      break if mu == 0
    end
    u += t_norm mus if mus.length > 0
  end
  return u/data.length
end

def confidence data, antes, cl, mu_table
  u = 0.0
  d = 0.0
  data.each do |x|
    mus = []
    antes.each do |ante|
      v = x["features"][ante["feature"]]
      next if v.nil? or v == ""
      header = ante["feature"].to_s + "_" + ante["set"].to_s
      mu = mu_table[header][v]
      mus << mu
      # skip some iterations if we found one mu as 0.
      break if mu == 0
    end
    u += t_norm mus if x["label"] == cl && mus.length > 0
    d += t_norm mus if mus.length > 0
  end
  return u/d
end

def mu x, ante, w, negative_enabled = false
  f = ante["feature"]
  mid = ante["mid"]
  base = ante["base"]
  v = x["features"][f]
  puts "gaga" if v == "" or v.nil?
  mem = 1.0 - ((v - (mid + w * ante["base"])).to_f * 2 / base).abs
  return mem if negative_enabled
  return [0.0, mem].max
end

# Calulate the t-normalization of a vecotr x, use either min or product t-norm
def t_norm x
  return x.min
end

def t_norm x
  return x.reduce(:*)
end

:t_norm

In [58]:
def supp_prune rules, min_supp
  return rules.select { |r| r["support"] > min_supp}
end

def confi_prune rules
  return rules.select { |r| r["confidence"] > 0.0 and r["confidence"] < 1.0 }
end

def generate_leaves data, last_leaves, mu_table, cl
  leaves = []
  last_leaves.each do |rule1|
    f1 = rule1["antecedent"].map { |r| r["feature"] }
    last_leaves.each do |rule2|
      f2 = rule2["antecedent"].map { |r| r["feature"] }
      total = (f1 + f2).uniq.length
      next if total < f1.length + f2.length
      rule = Hash.new
      rule["antecedent"] = (rule1["antecedent"] + rule2["antecedent"])
      rule["support"] = support data, rule["antecedent"], cl, mu_table
      rule["confidence"] = confidence data, rule["antecedent"], cl, mu_table
      rule["class"] = cl
      leaves << rule
    end
  end
  return leaves
end

def generate_mu_table data, ffs, w
  mu_table = Hash.new

  # Initialize the rules
  ffs.each do |fs|
    (0..1).each do |cl|
      header = fs["feature"].to_s + "_" + fs["set"].to_s
      mu_table[header] = Hash.new
      data.each do |x|
        v = x["features"][fs["feature"]]
        mu_table[header][v] = (mu x, fs, w[header]) if !mu_table[header].key?(v) and v != "" and !v.nil?
      end
    end
  end
  return mu_table
end

:generate_mu_table

In [59]:
# Build a search tree based on five-label fuzzy set
def build_search_tree data, min_supps, max_lvl, n, w
  # Generate the first layer of rules, getting all possible fuzzy sets by the 5-label linguistic rule
  last_leaves = Hash.new { |h, k| h[k] = []}
  ffs = feature_fuzzy_sets data, n
  
  puts "Initialize mu table"
  
  mu_table = generate_mu_table data, ffs, w
  
  puts "Mu table initialization complete"
  puts "Generate first lvl of leaves"
  
  ffs.each do |fs|
    (0..1).each do |cl|
      rule = Hash.new
      rule["antecedent"] = [fs]
#       rule["support"], rule["confidence"], = support_confidence data, rule["antecedent"], cl, mu_table
      rule["support"] = support data, rule["antecedent"], cl, mu_table
      rule["confidence"] = confidence data, rule["antecedent"], cl, mu_table
      rule["class"] = cl
      last_leaves[cl] << rule
    end
  end
  
  rules = Hash.new { |h, k| h[k] = []}
 
  (0..1).each do |cl|
    curr_lvl = 0
    print "Genearte search tree for label ", cl, "\n"
    while true
      curr_lvl += 1
      print "\t lvl ", curr_lvl, "\n"
      last_leaves[cl] = (supp_prune last_leaves[cl], min_supps[cl])
      rules[cl] += last_leaves[cl]
      break if curr_lvl >= max_lvl || last_leaves[cl].length <= 2
      last_leaves[cl] = (confi_prune last_leaves[cl])
      last_leaves[cl] = generate_leaves data, last_leaves[cl], mu_table, cl
    end
  end
  
  return rules
end

:build_search_tree

In [60]:
w0 = Hash.new { |h,k| h[k] = 0.0 }
w = w0
rb = []
# max lvl of the tree
max_lvl = 3

# print the parameters
puts min_supps, max_lvl

rb1 = []
rb1 = build_search_tree data_fuzzy_train, min_supps, 5, 5, w
1

{0=>0.09225, 1=>0.0093}
3
Initialize mu table
Mu table initialization complete
Generate first lvl of leaves
Genearte search tree for label 0
	 lvl 1
	 lvl 2
	 lvl 3
Genearte search tree for label 1
	 lvl 1
	 lvl 2
	 lvl 3


1

In [61]:
# Combine two rule bases together
rbs = rb1[0] + rb1[1]
1

1

In [62]:
# initialize the mu table, which will be used heavily later on
ffs = feature_fuzzy_sets data_fuzzy_train, 5
  
mu_table = generate_mu_table data_fuzzy_train, ffs, w
puts "Mu table initialization complete"

Mu table initialization complete


In [63]:
def mu_iv x, ante, w
  f = ante["feature"]
  mid = ante["mid"]
  base = ante["base"]
  v = x["features"][f]
  puts "gaga" if v == ""  
  mem = 1.0 - ((v - (mid + w * ante["base"])).to_f * 2 / base).abs
  mem_up = 1.0 - ((v - (mid + w * ante["base"])).to_f * 2 / (base*1.5)).abs
  return [0.0, mem].max, [0.0, mem_up].max
end

def generate_mu_iv_table data, ffs, w
  mu_table = Hash.new
  # Initialize the rules
  ffs.each do |fs|
    (0..1).each do |cl|
      header = fs["feature"].to_s + "_" + fs["set"].to_s
      mu_table[header] = Hash.new
      data.each do |x|
        v = x["features"][fs["feature"]]
        if !mu_table[header].key?(v) and v != "" and !v.nil?
          mu_table[header][v] = Hash.new         
          mu_table[header][v]["lo"], mu_table[header][v]["hi"] = (mu_iv x, fs, w[header])
        end
      end
    end
  end
  return mu_table
end

:generate_mu_iv_table

In [64]:
ffs = feature_fuzzy_sets data_fuzzy_train, 5
  
puts "Initialize mu table"
mu_iv_table = generate_mu_iv_table data_fuzzy_train, ffs, w0
puts "Mu table initialization complete"

Initialize mu table
Mu table initialization complete


In [65]:
def predict x, rb, mu_table
  votes = Hash.new { |h,k| h[k] = 0.0 }
  (0..1).each do |cl|
    rb.each do |r|
      next if r["class"] != cl
      r["antecedent"].each do |ante|
        v = x["features"][ante["feature"]]
        next if v.nil? or v == ""
        header = ante["feature"].to_s + "_" + ante["set"].to_s
        votes[cl] += r["support"] * r["confidence"] * mu_table[header][v]
      end
    end
  end
  return votes
end

def predict_iv x, rb, mu_table
  votes = Hash.new { |h,k| h[k] = 0.0 }
  (0..1).each do |cl|
    rb.each do |r|
      next if r["class"] != cl
      r["antecedent"].each do |ante|
        v = x["features"][ante["feature"]]
        next if v.nil? or v == ""
        header = ante["feature"].to_s + "_" + ante["set"].to_s
        votes[cl] += r["support"] * r["confidence"] * 
        ( 1 - ( mu_table[header][v]["hi"] - mu_table[header][v]["lo"] ) ) *  mu_table[header][v]["hi"]
      end
    end
  end
  return votes
end

:predict_iv

In [66]:
# now it's the time to do actual predictions
predictions = []
# l = 20

data_fuzzy_test.each.with_index do |x, i|
  total = 0.0
  s = predict_iv x, rbs, mu_iv_table
  s[0] = s[0] * 0.015
  max = [s[0], s[1]].max
  predictions << { s.key(max) => s[0] == 0.0 ? 100.0 : s[1] / s[0] }
end

m = confusion_matrix(%w(0 1), data_fuzzy_test, predictions)
puts m
scores = predictions.collect{ |k| k.values[0] }.uniq.sort.reverse
scores_limited = scores.select.with_index {|x, i| i % 1 == 0}
puts scores[0, 10]
1

{"0"=>{"0"=>1014, "1"=>86}, "1"=>{"0"=>0, "1"=>0}}
[0.891528322121394, 0.8722385195292188, 0.8584171487492415, 0.8413026367303726, 0.8155128680333366, 0.805794965029805, 0.8046200204450492, 0.7704896006984879, 0.7603763852547478, 0.7499110086693351]


1

In [67]:
fprs, tprs = roc_curve data_fuzzy_test, predictions, scores_limited
puts fprs[0, 10]
puts tprs[0, 10]
plot(fprs, tprs)

[0.0, 0.0, 0.0, 0.0009861932938856016, 0.0019723865877712033, 0.0029585798816568047, 0.0029585798816568047, 0.0029585798816568047, 0.0039447731755424065, 0.004930966469428008]
[0.0, 0.0, 0.011627906976744186, 0.011627906976744186, 0.011627906976744186, 0.011627906976744186, 0.023255813953488372, 0.03488372093023256, 0.03488372093023256, 0.03488372093023256]


In [68]:
auc fprs, tprs 

0.7090156414843312

In [None]:
def print_rules
end