In [1]:
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import pandas as pd
import random

# for tree building
import os, math
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

ModuleNotFoundError: No module named 'sklearn'

In [None]:
from tree import *

reader = ZippedCSVReader("loans.zip")
b = Bank(None, reader)

columns = ["amount", "purpose", "income",
           "American Indian or Alaska Native", "Asian",
           "Black or African American",
           "Native Hawaiian or Other Pacific Islander", "White",
           "decision"]
rows = []
missing_race = set()
for i, loan in enumerate(b.loan_iter()):
    rows.append({c: loan[c] for c in columns})
    if not loan["race"] in columns:
        missing_race.add(loan["race"])
missing_race

# Tree Building

In [None]:
def build_tree(df, outfile, depth=7):
    df = df.copy()

    # convert 'devision' to int
    df['decision'] = df['decision'].apply(lambda x: 1 if x=='approve' else 0)

    # convert amount and income to int
    def convert(x):
        try:
            return int(x)
        except ValueError:
            return np.nan

    # will impute after split
    if "amount" in df.columns:
        df['amount'] = df['amount'].apply(lambda x: convert(x) ) 

    if "income" in df.columns:
        df['income'] = df['income'].apply(lambda x: convert(x) )

    # OneHot encode 'purpose'
    # could also remove this cell and add 'Home improvement', 'Home purchase', 
    #       and 'Refinancing' to column list in place of 'purpose'
    if "purpose" in df.columns:
        enc = OneHotEncoder(handle_unknown='ignore')
        _df = pd.DataFrame(enc.fit_transform(df[['purpose']]).toarray(), columns=enc.categories_)
        _df.columns = [item[0] for item in _df.columns]
        df = df.join(_df)
        df = df.drop(['purpose'], axis=1)

    # split into X, y, and then split into train and test sets
    X, y = df[[col for col in df.columns.tolist() if col!= 'decision']], df['decision']
    X_train, X_test, y_train, y_test = train_test_split(X.copy(), y, test_size=0.2)

    # impute income and split
    if "income" in df.columns:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        X_train.loc[:, 'income'] = imp.fit_transform(X_train[['income']])
        X_test['income'] = imp.transform(X_test[['income']])

    if "amount" in df.columns:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        X_train['amount'] = imp.fit_transform(X_train[['amount']])
        X_test['amount'] = imp.transform(X_test[['amount']])

    dct1 = tree.DecisionTreeClassifier(max_depth=depth, class_weight="balanced")
    dct1.fit(X_train, y_train)

    # should look normal once y isn't all the same class
    dct_text = tree.export_text(dct1, feature_names=X_train.columns.tolist())

    TextIOWrapper(outfile).write(dct_text)

In [2]:
simple = """
|--- amount <= 200
|   |--- income <= 35
|   |   |--- class: 0
|   |--- income >  35
|   |   |--- class: 1
|--- amount >  200
|   |--- income <= 70
|   |   |--- class: 0
|   |--- income >  70
|   |   |--- class: 1
""".strip()

with ZipFile("trees-new.zip", "w") as zf:
    with zf.open("simple.txt", "w") as f:
        TextIOWrapper(f).write(simple)

    df = pd.DataFrame(random.sample(rows, 10000))

    with zf.open("good.txt", "w") as f:
        build_tree(df[["amount", "purpose", "income", "decision"]], f, 5)

    with zf.open("bad.txt", "w") as f:
        build_tree(df, f, 5)
        
    for i in range(7):
        df = pd.DataFrame(random.sample(rows, 1000))
        with zf.open(f"tree{i+1}.txt", "w") as f:
            cols = random.sample(["amount", "purpose", "income"], 2) + ["decision"]
            print(cols)
            build_tree(df[cols], f, 3)

NameError: name 'rows' is not defined

In [3]:
df.head()

NameError: name 'df' is not defined

In [13]:
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import pandas as pd
import random
import os, math


# In[3]:


class ZippedCSVReader:
    def __init__(self,name):
        self.paths=[]
        self.filename=name
        with ZipFile(name) as zf:
            for info in zf.infolist():
                self.paths.append(info.filename)
        self.paths.sort()
    def lines(self, name):
        with ZipFile(self.filename) as zf:
            with zf.open(name) as f:
                for line in TextIOWrapper(f):
                    yield line
        
    def csv_iter(self,CSVname=None):
        if CSVname!=None:
            with ZipFile(self.filename) as zf:
                with zf.open(CSVname) as f:
                    tio=TextIOWrapper(f)
                    reader=csv.DictReader(tio)
                    for row in reader:
                        yield row
        else:
            for CSVname in self.paths:
                with ZipFile(self.filename) as zf:
                    with zf.open(CSVname,"r") as f:
                        tio=TextIOWrapper(f)
                        reader=csv.DictReader(tio)
                        for row in reader:
                            yield row


# In[4]:


data_reader = ZippedCSVReader("mini.zip")
tree_reader = ZippedCSVReader("trees.zip")


# In[5]:


class Loan:
    def __init__(self, amount, purpose, race, income, decision):
        if amount=='' or amount==None:
            self.amount=0
        else:
            self.amount=float(amount)
        self.purpose=purpose
        self.race=race
        if income=='':
            self.income=0
        else:
            self.income=float(income)
        self.decision=decision
        if self.race=="White":
            self.White=1
        else: 
            self.White=0
        if self.race=="Asian":
            self.Asian=1
        else:
            self.Asian=0
        if self.race=="Black or African American":
            self.Black_or_African_American=1
        else:
            self.Black_or_African_American=0
        if self.race=="Native Hawaiian or Other Pacific Islander":
            self.pacific_island=1
        else:
            self.pacific_island=0
        if purpose=="home_purchase":
            self.home_purchase==1
        else:
            self.home_purchase=0
        if purpose=="Refinancing":
            self.refin=1
        else:
            self.refin=0
        if purpose=="Home improvement":
            self.home_improv=1
        else:
            self.home_improv=0
    def __repr__(self):
        return("Loan({}, '{}', '{}', '{}', '{}')".format(self.amount,self.purpose,self.race,self.income,self.decision))

    def __getitem__(self, lookup):
        if lookup=="amount":
            return self.amount
        if lookup=="purpose":
            return self.purpose
        if lookup=="race":
            return self.race
        if lookup=="income":
            return self.income
        if lookup=="decision":
            return self.decision
        if lookup=="White":
            return self.White
        if lookup=="Asian":
            return self.Asian
        if lookup=="Black or African American":
            return self.Black_or_African_American
        if lookup=="Native Hawaiian or Other Pacific Islander":
            return self.pacific_island
        if lookup=="Home purchase":
            return  self.home_purchase
        if lookup=="Refinancing":
            return self.refin
        if lookup=="Home improvement":
            return self.home_improv


# In[6]:


def get_bank_names(reader):
    bank_names=[]
    for row in reader.csv_iter():
        bank_names.append(row["agency_abbr"])
    bank_names=list(set(bank_names))
    bank_names.sort()
    return(bank_names)
class Bank:
    def __init__(self,agency_abbr,reader):
        self.reader=reader
        self.agency_abbr=agency_abbr
        
    def loan_iter(self):
        for row in self.reader.csv_iter():
            if self.agency_abbr==None:
                row['respondent_id']=Loan(row["loan_amount_000s"],row["loan_purpose"], row["applicant_race_name_1"], row["applicant_income_000s"],row["action_taken_name"])
                yield row['respondent_id']
            elif row["agency_abbr"]==self.agency_abbr:
                row['respondent_id']=Loan(row["loan_amount_000s"],row["loan_purpose"], row["applicant_race_name_1"], row["applicant_income_000s"],row["action_taken_name"])
                yield row['respondent_id']
    
    def loan_filter(self, loan_min, loan_max, loan_purpose):
        for row in self.reader.csv_iter():
            if row["loan_amount_000s"]>=loan_min&row["loan_amount_000s"]<=loan_max&row["loan_purpose"]==loan_purpose:
                yield row['respondednt_id']


# In[7]:


class SimplePredictor():
    def __init__(self):
        self.num_approved=0
    
    def predict(self, loan):
        if loan["loan_purpose"]=="Home improvement":
            self.num_approved+=1
            return True
        else:
            return False

    def getApproved(self):
        return self.num_approved


# In[8]:


#class Node copied from https://tyler.caraza-harter.com/cs320/f20/reading/lec-12-search-trees.html
from graphviz import Graph, Digraph

class Node:
    def __init__(self, val=None):
        self.val = val
        self.left = None
        self.right = None
    
    def set_val(self,val):
        self.val=val
    
    def to_graphviz(self, g=None):
        if g == None:
            g = Digraph()
            
        # draw self
        g.node(repr(self.val))
    
        for label, child in [("L", self.left), ("R", self.right)]:
            if child != None:
                # draw child, recursively
                child.to_graphviz(g)
                
                # draw edge from self to child
                g.edge(repr(self.val), repr(child.val), label=label)
        return g
    
    def _repr_svg_(self):
        return self.to_graphviz()._repr_svg_()

class DTree(SimplePredictor):
    def __init__(self):
        SimplePredictor.__init__(self)
        self.root=None
        self.node_depths={}
        self.num_disaproved=0
        self.num_approved = 0
    def readTree(self,reader, path):
        self.root= Node()
        self.node_depths[0]=[self.root]
        
        for line in reader.lines(path):
            line=(line.split("---"))
            depth=(len(line[0].split("   ")))
            line_attributes=line[1].split(" ")
            #reads in lines
            
            parent_name=line[1].split("<=")
            if len(parent_name) == 1:
                parent_name=line[1].split(">")
            parent_n=""
            for i in parent_name:
                parent_n+=i
            #This is the value of the nodes:example is (income 98.50).The values have 2 spaces inbetween variable name and number value
            
            parent=self.node_depths[depth-1][0]
            #This is the path of the nodes:example is root.right.left.left
            
            if parent_n.strip()=="class: 0":
            #if line_attributes[1]+line_attributes[-2].strip()=="class:0":
                parent.set_val("class 0")
            elif parent_n.strip()=="class: 1":
            #elif line_attributes[1]+line_attributes[-2].strip()=="class:1":
                parent.set_val("class 1")   
            #if and elif for cases where node value is "class: 0" or "class: 1"
                
            elif line_attributes[-2]=="<=":
                self.node_depths[depth-1]=[parent, parent.set_val(parent_n.strip())]
                #self.node_depths[depth-1]=[parent, parent.set_val(line_attributes[1]+line_attributes[-2].strip())]
                parent.left=Node()
                self.node_depths[depth]=[parent.left]
            elif line_attributes[-3]==">":
                self.node_depths[depth-1]=[parent, parent.set_val(parent_n.strip())]
                #self.node_depths[depth-1]=[parent, parent.set_val(line_attributes[1]+line_attributes[-2].strip())]
                parent.right=Node()
                self.node_depths[depth]=[parent.right]
            #above 2 elif for creating nodes    
            
        return(self.root)  
        
    def predict(self,data,current_node=None):
        if current_node==None:
            current_node=self.root
        loan_attribute=current_node.val.split("  ")[0]
        loan_att_amount=current_node.val.split("  ")[-1]
        #print(loan_attribute)
        #print(data[loan_attribute])
        #print(type(data[loan_attribute]))
        if loan_attribute=="class 0":
            self.num_disaproved +=1
            return(True)
        elif loan_attribute=="class 1":
            self.num_approved += 1
            return(False)
        elif float(loan_att_amount)>=data[loan_attribute]:
            return(self.predict(data,current_node.left))
        elif float(loan_att_amount)<data[loan_attribute]:
            return(self.predict(data,current_node.right))
    def getDisapproved(self):
        return self.num_disaproved
    
    def getApproved(self):
        return self.num_approved
    


# In[15]:


loan = Loan(40, "Home improvement", "Asian", 120, "approve")
# tree_reader = ZippedCSVReader('trees.zip')
# dtree = DTree()
# dtree.readTree(tree_reader, "bad.txt")
reader = ZippedCSVReader('loans.zip')
b = Bank(None, reader)
li = b.loan_iter()
loans = [next(li) for i in range(100)]
t_count = 0
f_count = 0
for path in ['simple.txt', 'good.txt', 'bad.txt']:
    t_count = 0
    f_count = 0
    print(path)
    tree_reader = ZippedCSVReader('trees.zip')
    dtree = DTree()
    dtree.readTree(tree_reader, path)
    print(path)
    for i, loan in enumerate(loans):
        y = dtree.predict(loan)
        if y == True:
            t_count += 1
        else:
            f_count += 1
    print("num approved: ", dtree.num_approved)
    print("num disapproved: ", dtree.num_disaproved)
    print("t_count: ", t_count)
    print("f_count: ", f_count)

simple.txt
simple.txt
num approved:  78
num disapproved:  22
t_count:  22
f_count:  78
good.txt
good.txt
num approved:  55
num disapproved:  45
t_count:  45
f_count:  55
bad.txt
bad.txt
num approved:  45
num disapproved:  55
t_count:  55
f_count:  45
