In [122]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [123]:
# Load the credit-card dataset into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific -- confirm, and
# consider reading it from a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv('/content/AER_credit_card_data.csv')

In [124]:
# Print column names, dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


In [125]:
# Preview the first rows to see what the raw values look like.
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [126]:
# Summary statistics of the numeric columns (ranges differ widely between features).
df.describe()

Unnamed: 0,reports,age,income,share,expenditure,dependents,months,majorcards,active
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,0.456406,33.213103,3.365376,0.068732,185.057071,0.993935,55.267627,0.817286,6.996967
std,1.345267,10.142783,1.693902,0.094656,272.218917,1.247745,66.271746,0.386579,6.305812
min,0.0,0.166667,0.21,0.000109,0.0,0.0,0.0,0.0,0.0
25%,0.0,25.41667,2.24375,0.002316,4.583333,0.0,12.0,1.0,2.0
50%,0.0,31.25,2.9,0.038827,101.2983,1.0,30.0,1.0,6.0
75%,0.0,39.41667,4.0,0.093617,249.0358,2.0,72.0,1.0,11.0
max,14.0,83.5,13.5,0.90632,3099.505,6.0,540.0,1.0,46.0


In [127]:
# Class balance of the target column 'card'.
df['card'].value_counts()

Unnamed: 0_level_0,count
card,Unnamed: 1_level_1
yes,1023
no,296


In [128]:
# Encode the yes/no flag columns as integers: 1 for "yes", 0 for anything else.
# The comparison is vectorized over the whole column, so it does not depend on the
# index labels being 0..n-1 (as a row-by-row df.loc[i, ...] loop does) and it leaves
# the columns with a numeric dtype instead of object.
for flag_column in ("owner", "selfemp"):
  df[flag_column] = (df[flag_column] == "yes").astype(int)

df

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.5200,0.033270,124.983300,1,0,3,54,1,12
1,yes,0,33.25000,2.4200,0.005217,9.854167,0,0,3,34,1,13
2,yes,0,33.66667,4.5000,0.004156,15.000000,1,0,4,58,1,5
3,yes,0,30.50000,2.5400,0.065214,137.869200,0,0,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.503300,1,0,2,64,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1314,yes,0,33.58333,4.5660,0.002146,7.333333,1,0,0,94,1,19
1315,no,5,23.91667,3.1920,0.000376,0.000000,0,0,3,12,1,5
1316,yes,0,40.58333,4.6000,0.026513,101.298300,1,0,2,1,1,2
1317,yes,0,32.83333,3.7000,0.008999,26.996670,0,1,0,60,1,7


In [129]:
# Separate the target from the features without mutating `df`.
# drop() returns a new frame, so this cell can be re-run safely; an in-place drop
# raises KeyError on the second run because 'card' has already been removed.
y = df['card']
x = df.drop(columns='card')

In [130]:
# Standardize every feature column, keeping the original row index and column names.
X_scaled = pd.DataFrame(
    data=StandardScaler().fit_transform(x),
    index=x.index,
    columns=x.columns,
)

In [131]:
# Positional 80/20 split: the first 80% of rows train the tree, the rest test it.
# The rows are taken in their existing order; nothing is shuffled.
split = int(len(x) * 0.8)

# X_scaled is a 2-D DataFrame and y is a 1-D Series; a row slice alone is enough
# for .iloc on either of them.
X_train, X_test = X_scaled.iloc[:split], X_scaled.iloc[split:]
Y_train, Y_test = y.iloc[:split], y.iloc[split:]

In [132]:
import math

def entropy(leaf):
  """Return the Shannon entropy, in bits, of the class labels in ``leaf``.

  Parameters
  ----------
  leaf : sized iterable of hashable labels
      For example a pandas Series, a NumPy array or a plain list.

  Returns
  -------
  float
      ``-sum(p * log2(p))`` over the distinct labels; ``0`` when ``leaf`` is
      empty or holds a single class.
  """
  total = len(leaf)
  if total == 0:
    return 0

  # One pass tallies every class. Counter keeps first-seen order, so the
  # floating-point summation order is the same on every run (iterating a set
  # of strings is not), and it also works for inputs that do not support
  # element-wise ``==`` such as plain lists.
  class_counts = Counter(leaf)

  result = 0
  for count in class_counts.values():
    prob = count / total
    result += -prob * math.log2(prob)

  return result

In [133]:
def ig(parent, left, right):
  """Return the information gain of splitting ``parent`` into ``left`` and ``right``.

  The gain is the parent's entropy minus the size-weighted average entropy of
  the two children, where each weight is the child's share of the parent's rows.
  """
  n_parent = len(parent)
  children_entropy = (len(left) / n_parent) * entropy(left) + (len(right) / n_parent) * entropy(right)

  return entropy(parent) - children_entropy

In [134]:
def divide(x, y, feature, value):
    """Split ``x`` and ``y`` on the threshold ``value`` of column ``feature``.

    Rows with ``x[feature] <= value`` go to the left half and rows with
    ``x[feature] > value`` go to the right half. A row for which neither
    comparison holds (e.g. a missing value) ends up in neither half.

    Returns the tuple ``(x_left, y_left, x_right, y_right)``.
    """
    column = x[feature]
    goes_left = column <= value
    goes_right = column > value

    return x[goes_left], y[goes_left], x[goes_right], y[goes_right]

In [135]:
def best(x, y):
  """Search every column and every distinct value for the split with the highest gain.

  Each unique value of each column of ``x`` is tried as a ``<=`` threshold.
  Candidates that leave one side empty are ignored. On ties the first
  candidate encountered wins, because only a strictly larger gain replaces
  the current best.

  Returns ``(feature, value)``, or ``(None, None)`` if no candidate produced
  two non-empty sides.
  """
  top_gain = -float('inf')
  top_feature, top_value = None, None

  for column in x.columns:
    for threshold in x[column].unique():
      x_left, y_left, x_right, y_right = divide(x, y, column, threshold)

      # Only score splits that actually separate the rows into two groups.
      if len(x_left) != 0 and len(x_right) != 0:
        gain = ig(y, y_left, y_right)

        if gain > top_gain:
          top_gain, top_feature, top_value = gain, column, threshold

  return top_feature, top_value

In [136]:
def prediction(leaf):
  """Return the most frequent label in ``leaf`` (the first entry of ``leaf.mode()``)."""
  modes = leaf.mode()
  return modes[0]

In [137]:
def build(x, y, max_depth, cur_depth, min_elements):
  """Recursively grow a decision tree from features ``x`` and labels ``y``.

  A leaf (the majority label of ``y``) is returned when the node is pure,
  when ``cur_depth`` has reached ``max_depth``, when fewer than
  ``min_elements`` rows remain, or when no usable split exists. Otherwise
  the result is a dict with the keys ``"feature"``, ``"value"``,
  ``"left_subtree"`` and ``"right_subtree"``.
  """
  node_is_pure = len(set(y)) == 1
  if node_is_pure or cur_depth >= max_depth or len(y) < min_elements:
    return prediction(y)

  split_feature, split_value = best(x, y)
  if split_feature is None:
    return prediction(y)

  x_left, y_left, x_right, y_right = divide(x, y, split_feature, split_value)
  next_depth = cur_depth + 1

  return {
      "feature" : split_feature,
      "value" : split_value,
      "left_subtree" : build(x_left, y_left, max_depth, next_depth, min_elements),
      "right_subtree" : build(x_right, y_right, max_depth, next_depth, min_elements)
  }

In [138]:
def predictor(input, tree, columns):
    """Classify a single row by walking ``tree`` from the root down to a leaf.

    ``tree`` is either a leaf value or a dict with the keys ``"feature"``,
    ``"value"``, ``"left_subtree"`` and ``"right_subtree"``. If ``input`` is
    not already a pandas Series it is wrapped in one indexed by ``columns``
    so that features can be looked up by name.
    """
    node = tree
    row = input

    while isinstance(node, dict):
        if not isinstance(row, pd.Series):
            row = pd.Series(row, index=columns)

        # Same rule as at training time: <= threshold goes left, otherwise right.
        if row[node['feature']] <= node['value']:
            node = node['left_subtree']
        else:
            node = node['right_subtree']

    return node

In [139]:
def predict(x, tree, columns):
    """Return a list with the tree's prediction for every row of ``x``, in row order."""
    predictions = []
    for row in x.values:
        predictions.append(predictor(row, tree, columns))
    return predictions

In [140]:
# Grow the tree on the training rows, starting at depth 0.
tree = build(X_train, Y_train, max_depth=3, cur_depth=0, min_elements=3)

In [141]:
# Predict a label for every held-out row.
y_pred = predict(X_test, tree, columns=X_test.columns)

In [142]:
def accuracy(y_test, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_test``."""
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    matches = actual == predicted
    return np.mean(matches)

# Share of held-out rows the tree classifies correctly.
# NOTE(review): a score this close to 1.0 from a depth-3 tree may point to target
# leakage -- e.g. `expenditure`/`share` might only be non-zero for card holders; verify.
print(accuracy(Y_test, y_pred))

0.9886363636363636
