In [None]:
"""                          
Test a learner.  (c) 2015 Tucker Balch                          
                          
Copyright 2018, Georgia Institute of Technology (Georgia Tech)                          
Atlanta, Georgia 30332                          
All Rights Reserved                          
                          
Template code for CS 4646/7646                          
                          
Georgia Tech asserts copyright ownership of this template and all derivative                          
works, including solutions to the projects assigned in this course. Students                          
and other users of this template code are advised not to share it with others                          
or to make it available on publicly viewable websites including repositories                          
such as github and gitlab.  This copyright statement should not be removed                          
or edited.                          
                          
We do grant permission to share solutions privately with non-students such                          
as potential employers. However, sharing with other current or future                          
students of CS 7646 is prohibited and subject to being investigated as a                          
GT honor code violation.                          
                          
-----do not edit anything above this line---                          
"""                          
                          
import numpy as np                          
import math                          
import LinRegLearner as lrl                          
import sys
import DTLearner as dl
import RTLearner as rl
import BagLearner as bl
import InsaneLearner as il
import matplotlib.pyplot as plt
import matplotlib
import time
                          
def load_data(path):
    """Load a comma-separated numeric data file into a 2-D float array.

    A path ending in ``Istanbul.csv`` is parsed with ``np.genfromtxt`` and
    its first row and first column are dropped (presumably a header row and
    a date column -- confirm against the data file).  Any other file must
    contain only comma-separated numbers; blank lines are skipped.

    :param path: path of the file to read
    :return: 2-D ``numpy`` array of floats, one row per data line
    :raises ValueError: if a non-Istanbul file holds a non-numeric field
    """
    # The context manager closes the file even when parsing fails.
    with open(path) as inf:
        if path.endswith("Istanbul.csv"):
            return np.genfromtxt(inf, delimiter=",")[1:, 1:]
        # Build real lists of floats: under Python 3 a bare ``map`` is a lazy
        # iterator and np.array would store the iterator objects themselves.
        return np.array(
            [[float(v) for v in line.strip().split(",")]
             for line in inf if line.strip()]
        )


def split_data(data, train_fraction):
    """Split rows into leading training rows and trailing testing rows.

    The last column is treated as Y and all other columns as X.

    :param data: 2-D array
    :param train_fraction: fraction of rows (taken from the top) to train on
    :return: tuple ``(trainX, trainY, testX, testY)``
    """
    train_rows = int(train_fraction * data.shape[0])
    return (
        data[:train_rows, :-1],
        data[:train_rows, -1],
        data[train_rows:, :-1],
        data[train_rows:, -1],
    )


def compute_rmse(predicted, actual):
    """Return the root-mean-square error between two equal-length arrays."""
    return float(np.sqrt(np.mean((predicted - actual) ** 2)))


def train_and_score(learner, trainX, trainY, testX, testY):
    """Train ``learner`` and return ``(in_sample_rmse, out_of_sample_rmse)``.

    ``learner`` must provide ``addEvidence(X, Y)`` and ``query(X)``.
    """
    learner.addEvidence(trainX, trainY)
    in_sample = compute_rmse(learner.query(trainX), trainY)
    out_sample = compute_rmse(learner.query(testX), testY)
    return in_sample, out_sample


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python testlearner.py <filename>")
        sys.exit(1)

    data = load_data(sys.argv[1])

    # Every experiment sweeps the same leaf sizes; plotting against this list
    # (instead of the implicit 0-based list index) keeps the x axis truthful.
    leaf_sizes = list(range(1, 31))

    ### question 1: DTLearner RMSE vs leaf size, 30% of the rows for training
    trainX, trainY, testX, testY = split_data(data, 0.3)
    dt_in_rmse = []
    dt_out_rmse = []
    for leaf_size in leaf_sizes:
        learner = dl.DTLearner(leaf_size=leaf_size, verbose=True)
        in_sample, out_sample = train_and_score(
            learner, trainX, trainY, testX, testY
        )
        dt_in_rmse.append(in_sample)
        dt_out_rmse.append(out_sample)

    plt.figure(1)
    plt.plot(leaf_sizes, dt_in_rmse)
    plt.plot(leaf_sizes, dt_out_rmse)
    # Limits are given high-to-low, so leaf size decreases left to right.
    plt.xlim(30, 0)
    plt.ylim(0, 0.015)
    plt.title("RMSE vs Leaf Size: DTLearner")
    plt.xlabel("Leaf Size")
    plt.ylabel("RMSE")
    plt.legend(["In-Sample", "Out-Sample"])
    plt.savefig("q1.png")

    # The remaining experiments train on 60% of the rows.
    trainX, trainY, testX, testY = split_data(data, 0.6)

    ### question 2: bagged DTLearner RMSE vs leaf size
    bag_in_rmse = []
    bag_out_rmse = []
    for leaf_size in leaf_sizes:
        learner = bl.BagLearner(
            learner=dl.DTLearner,
            kwargs={"leaf_size": leaf_size},
            bags=15,
            boost=False,
            verbose=False,
        )
        in_sample, out_sample = train_and_score(
            learner, trainX, trainY, testX, testY
        )
        bag_in_rmse.append(in_sample)
        bag_out_rmse.append(out_sample)

    plt.figure(2)
    plt.plot(leaf_sizes, bag_in_rmse)
    plt.plot(leaf_sizes, bag_out_rmse)
    plt.xlim(30, 0)
    plt.ylim(0, 0.015)
    plt.title("RMSE vs Leaf Size: Bag Learner")
    plt.xlabel("Leaf Size")
    plt.ylabel("RMSE")
    plt.legend(["In-Sample", "Out-Sample"])
    plt.savefig("q2.png")

    ### question 3: average tree depth
    # Depth is estimated as floor(log2(n)) of whatever num_leafs() reports.
    # Both learners sweep the same leaf sizes so the curves line up.
    dl_depth = []
    rl_depth = []
    for leaf_size in leaf_sizes:
        learner = dl.DTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        dl_depth.append(int(np.log2(learner.num_leafs())))

    for leaf_size in leaf_sizes:
        learner = rl.RTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        rl_depth.append(int(np.log2(learner.num_leafs())))

    plt.figure(3)
    plt.plot(leaf_sizes, dl_depth)
    plt.plot(leaf_sizes, rl_depth)
    plt.xlim(30, 0)
    plt.ylim(0, 10)
    plt.title("Average Tree Depth")
    plt.xlabel("Leaf Size")
    plt.ylabel("Level")
    plt.legend(["DTLearner", "RTLearner"])
    plt.savefig("q3i.png")

    ### question 3: build time
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; each sample covers construction plus addEvidence.
    time_dl = []
    time_rl = []
    for leaf_size in leaf_sizes:
        start = time.perf_counter()
        learner = dl.DTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        time_dl.append(time.perf_counter() - start)

    for leaf_size in leaf_sizes:
        start = time.perf_counter()
        learner = rl.RTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        time_rl.append(time.perf_counter() - start)

    plt.figure(4)
    plt.plot(leaf_sizes, time_dl)
    plt.plot(leaf_sizes, time_rl)
    plt.xlim(30, 0)
    plt.ylim(0, 0.3)
    plt.title("Time to Build Tree: DTLearner vs RTLearner")
    plt.xlabel("Leaf Size")
    plt.ylabel("Time (s)")
    plt.legend(["DTLearner", "RTLearner"])
    plt.savefig("q3ii.png")


In [None]:
"""                           
Test a learner.  (c) 2015 Tucker Balch                           
                           
Copyright 2018, Georgia Institute of Technology (Georgia Tech)                           
Atlanta, Georgia 30332                           
All Rights Reserved                           
                           
Template code for CS 4646/7646                           
                           
Georgia Tech asserts copyright ownership of this template and all derivative                           
works, including solutions to the projects assigned in this course. Students                           
and other users of this template code are advised not to share it with others                           
or to make it available on publicly viewable websites including repositories                           
such as github and gitlab.  This copyright statement should not be removed                           
or edited.                           
                           
We do grant permission to share solutions privately with non-students such                           
as potential employers. However, sharing with other current or future                           
students of CS 7646 is prohibited and subject to being investigated as a                           
GT honor code violation.                           
                           
-----do not edit anything above this line---                           
"""                           
                           
import numpy as np                           
import math
import sys
import DTLearner as dt
import RTLearner as rt
import BagLearner as bl
import InsaneLearner as it
import LinRegLearner as lrl
import matplotlib.pyplot as plt
import time
import matplotlib
matplotlib.use('agg')
                           
def load_data(path):
    """Load a comma-separated numeric data file into a 2-D float array.

    A path ending in ``Istanbul.csv`` is parsed with ``np.genfromtxt`` and
    its first row and first column are dropped (presumably a header row and
    a date column -- confirm against the data file).  Any other file must
    contain only comma-separated numbers; blank lines are skipped.

    :param path: path of the file to read
    :return: 2-D ``numpy`` array of floats, one row per data line
    :raises ValueError: if a non-Istanbul file holds a non-numeric field
    """
    # The context manager closes the file even when parsing fails.
    with open(path) as inf:
        if path.endswith("Istanbul.csv"):
            return np.genfromtxt(inf, delimiter=",")[1:, 1:]
        # Build real lists of floats: under Python 3 a bare ``map`` is a lazy
        # iterator and np.array would store the iterator objects themselves.
        return np.array(
            [[float(v) for v in line.strip().split(",")]
             for line in inf if line.strip()]
        )


def split_data(data, train_fraction):
    """Split rows into leading training rows and trailing testing rows.

    The last column is treated as Y and all other columns as X.

    :param data: 2-D array
    :param train_fraction: fraction of rows (taken from the top) to train on
    :return: tuple ``(trainX, trainY, testX, testY)``
    """
    train_rows = int(train_fraction * data.shape[0])
    return (
        data[:train_rows, :-1],
        data[:train_rows, -1],
        data[train_rows:, :-1],
        data[train_rows:, -1],
    )


def compute_rmse(predicted, actual):
    """Return the root-mean-square error between two equal-length arrays."""
    return math.sqrt(((actual - predicted) ** 2).sum() / actual.shape[0])


def train_and_score(learner, trainX, trainY, testX, testY):
    """Train ``learner`` and return ``(in_sample_rmse, out_of_sample_rmse)``.

    ``learner`` must provide ``addEvidence(X, Y)`` and ``query(X)``.
    """
    learner.addEvidence(trainX, trainY)
    in_sample = compute_rmse(learner.query(trainX), trainY)
    out_sample = compute_rmse(learner.query(testX), testY)
    return in_sample, out_sample


def time_queries(learner, trainX, trainY, testX):
    """Train ``learner``, then return the seconds spent querying both sets.

    Only the two ``query`` calls are timed; training happens before the
    clock starts.
    """
    learner.addEvidence(trainX, trainY)
    start = time.perf_counter()
    learner.query(trainX)
    learner.query(testX)
    return time.perf_counter() - start


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # Tell the user why the script is exiting instead of failing silently.
        print("Usage: python testlearner.py <filename>")
        sys.exit(1)

    data = load_data(sys.argv[1])

    # Every experiment sweeps the same leaf sizes; plotting against this list
    # (instead of the implicit 0-based list index) keeps the x axis truthful.
    leaf_sizes = list(range(1, 51))

    ### q1: DTLearner RMSE vs leaf size, 30% of the rows for training
    trainX, trainY, testX, testY = split_data(data, 0.3)
    r1 = []  # in-sample RMSE per leaf size
    r2 = []  # out-of-sample RMSE per leaf size
    for leaf_size in leaf_sizes:
        learner = dt.DTLearner(leaf_size=leaf_size, verbose=True)
        in_sample, out_sample = train_and_score(
            learner, trainX, trainY, testX, testY
        )
        r1.append(in_sample)
        r2.append(out_sample)

    plt.figure(1)
    plt.title('RMSE metric to assess overfitting in DT Learner')
    plt.xlabel('Leaf Size')
    plt.ylabel('RMSE')
    plt.xlim(0, 50)
    plt.ylim(0, 0.016)
    plt.plot(leaf_sizes, r1)
    plt.plot(leaf_sizes, r2)
    # Both curves come from DTLearner; they differ by evaluation set.
    plt.legend(["In Sample", "Out of Sample"])
    plt.savefig("question1.png")

    # Smoke run: train and query an RTLearner once; the scores are not used.
    train_and_score(
        rt.RTLearner(leaf_size=1, verbose=True), trainX, trainY, testX, testY
    )

    ### q2: bagged DTLearner RMSE vs leaf size, 60% of the rows for training
    trainX, trainY, testX, testY = split_data(data, 0.6)
    r3 = []  # in-sample RMSE per leaf size
    r4 = []  # out-of-sample RMSE per leaf size
    for leaf_size in leaf_sizes:
        learner = bl.BagLearner(
            learner=dt.DTLearner,
            kwargs={"leaf_size": leaf_size},
            bags=10,
            boost=False,
            verbose=False,
        )
        in_sample, out_sample = train_and_score(
            learner, trainX, trainY, testX, testY
        )
        r3.append(in_sample)
        r4.append(out_sample)

    plt.figure(2)
    plt.title('RMSE metric to assess overfitting in BagLearner')
    plt.xlabel('Leaf Size')
    plt.ylabel('RMSE')
    plt.xlim(0, 50)
    plt.ylim(0, 0.016)
    plt.plot(leaf_sizes, r3)
    plt.plot(leaf_sizes, r4)
    plt.legend(["In Sample", "Out of Sample"])
    plt.savefig("question2.png")

    # Smoke run: train and query an InsaneLearner once; scores are not used.
    train_and_score(
        it.InsaneLearner(verbose=False), trainX, trainY, testX, testY
    )

    ### q3: query time, DTLearner vs RTLearner
    t1 = [
        time_queries(
            dt.DTLearner(leaf_size=leaf_size, verbose=True),
            trainX, trainY, testX,
        )
        for leaf_size in leaf_sizes
    ]
    t2 = [
        time_queries(
            rt.RTLearner(leaf_size=leaf_size, verbose=True),
            trainX, trainY, testX,
        )
        for leaf_size in leaf_sizes
    ]

    plt.figure(3)
    plt.title('Time metric to compare DT and RT learner')
    plt.xlabel('Leaf Size')
    plt.ylabel('Time')
    plt.xlim(0, 50)
    plt.ylim(0, 0.4)
    plt.plot(leaf_sizes, t1)
    plt.plot(leaf_sizes, t2)
    plt.legend(["Time_DT Learner", "Time_RT Learner"])
    plt.savefig("question3.png")

    ### q3 (part 2): floor(log2(n)) of whatever tree_size() reports
    dt_count = []
    for leaf_size in leaf_sizes:
        learner = dt.DTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        dt_count.append(int(np.log2(learner.tree_size())))

    rt_count = []
    for leaf_size in leaf_sizes:
        learner = rt.RTLearner(leaf_size=leaf_size, verbose=True)
        learner.addEvidence(trainX, trainY)
        rt_count.append(int(np.log2(learner.tree_size())))

    plt.figure(4)
    plt.title('Root to leaf length metric to compare DT and RT learner')
    plt.xlabel('Leaf Size')
    plt.ylabel('Tree Size')
    plt.xlim(0, 50)
    plt.ylim(0, 10)
    plt.plot(leaf_sizes, dt_count)
    plt.plot(leaf_sizes, rt_count)
    plt.legend(["DT_Learner", "RT_Learner"])
    plt.savefig("question3_2.png")
    print("hello")