In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

In [None]:
# Raw training split, kept as a DataFrame for the exploratory cells below.
data = pd.read_csv("train.csv")
# ax, fig = plt.subplots()


In [None]:
"year,month,toss,day_match,bat_first,format,fow,score,rpo,result"

In [None]:
# Pull individual columns of `data` out as Series for quick inspection.
years = data['year']
tosses = data['toss']
result = data['result']
day_match = data['day_match']
bat_first = data['bat_first']
# NOTE(review): `format` shadows the Python builtin of the same name for the
# rest of the kernel session.
format = data['format']
fow = data['fow']
score = data['score']
rpo = data['rpo']


In [None]:
from collections import Counter,deque
Counter([])

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
def get_np_array(file_name, label_encoder=None):
    """Load a match CSV and return ``(X, y)`` as NumPy arrays.

    The columns in ``need_label_encoding`` are one-hot encoded and placed
    first, followed by the remaining feature columns; ``result`` is the target.

    Args:
        file_name: path of the CSV file to read.
        label_encoder: optional ``OneHotEncoder`` to share between splits. An
            unfitted encoder is fitted on this file; an already fitted one is
            reused unchanged, so passing the encoder fitted on the training
            split gives every split the same column layout. When omitted a
            fresh encoder is fitted on this file (the original behaviour).

    Returns:
        X: array [num_samples, num_features].
        y: array [num_samples, 1] holding the ``result`` column.
    """
    data = pd.read_csv(file_name)

    need_label_encoding = ['team','host','opp','month', 'day_match']
    if label_encoder is None:
        label_encoder = OneHotEncoder(sparse_output = False)
    # scikit-learn encoders only gain `categories_` once fit() has run.
    if not hasattr(label_encoder, "categories_"):
        label_encoder.fit(data[need_label_encoding])
    data_1 = pd.DataFrame(
        label_encoder.transform(data[need_label_encoding]),
        columns = label_encoder.get_feature_names_out(),
    )

    # Append the columns that are used as-is; `result` must stay last because
    # the target is taken from the final column below.
    dont_need_label_encoding =  ["year","toss","bat_first","format" ,"fow","score" ,"rpo" ,"result"]
    data_2 = data[dont_need_label_encoding]
    final_data = pd.concat([data_1, data_2], axis=1)

    X = final_data.iloc[:,:-1]
    y = final_data.iloc[:,-1:]
    return X.to_numpy(), y.to_numpy()

X_train,y_train = get_np_array('train.csv')
X_test, y_test = get_np_array("test.csv")

#only needed in part (c)
X_val, y_val = get_np_array("val.csv")

# Feature kinds for the un-encoded column layout.
types = ['cat','cat','cat',"cat","cat","cont","cat","cat","cat" ,"cont","cont" ,"cont" ]
# Encoding can widen the matrix; every extra leading column is categorical.
# Computing the shortfall once (instead of looping until the lengths match)
# cannot spin forever when `types` is already longer than the feature count.
n_missing = X_train.shape[1] - len(types)
if n_missing < 0:
    raise ValueError(
        "types has %d entries but X_train has only %d features"
        % (len(types), X_train.shape[1])
    )
types = ['cat'] * n_missing + types


In [None]:
len(types)

In [None]:

class DTNode:
    """A single node of the decision tree.

    Internal nodes route a sample to one of ``children`` according to feature
    ``column``; leaves carry the predicted class in ``value``.
    """

    def __init__(self, depth, is_leaf = False, value = 0,threshold=None, column = None):
        """
        Args:
            depth: depth of this node in the tree.
            is_leaf: True when the node predicts ``value`` directly.
            value: predicted class; only kept for leaves.
            threshold: split point of a continuous feature; ``None`` means the
                node splits on a categorical feature.
            column: index of the feature an internal node splits on.
        """
        self.depth = depth
        self.threshold = threshold
        #add children afterwards
        self.children = None
        # Optional list, parallel to `children`, naming the category value each
        # child serves. None selects the positional routing described below.
        self.categories = None

        self.is_leaf = is_leaf
        # Both attributes always exist so that a node can later be converted
        # to a leaf (pruning) without tripping over a missing attribute.
        self.value = value if is_leaf else None
        self.column = None if is_leaf else column

    def get_children(self, X):
        '''
        Args:
            X: A single example np array [num_features]
        Returns:
            child: A DTNode
        '''
        if self.is_leaf:
            return self

        feature_value = X[self.column]

        # The node itself records the split kind (threshold is None for a
        # categorical split), so routing no longer depends on whatever the
        # module-level `types` list happens to hold at prediction time.
        if self.threshold is None:
            if self.categories is not None:
                # Route by category value, not by position.
                for category, child in zip(self.categories, self.children):
                    if category == feature_value:
                        return child
                # Value never seen at this node while fitting.
                return self.children[-1]
            # Positional routing: child i serves category code i.
            if feature_value >= len(self.children):
                return self.children[-1]
            return self.children[int(feature_value)]

        if feature_value <= self.threshold:
            return self.children[0]
        return self.children[1]
        
def entropy(X,y):
    """Shannon entropy (base 2) of the labels in ``y``.

    ``X`` is accepted for signature symmetry with the other helpers and is
    not used. Empty or single-class label sets have entropy 0.
    """
    n_labels = len(y)
    if not n_labels:
        return 0
    total = 0
    for label in np.unique(y):
        share = np.sum(y == label) / n_labels
        # A class holding all (or none) of the samples means a pure set.
        if share in (0, 1):
            return 0
        total = total - share * np.log2(share)
    return total

def information_gain(X,y,feature,iscat:bool):
    """Entropy reduction obtained by splitting ``(X, y)`` on column ``feature``.

    Categorical features are split into one group per distinct value;
    continuous features are split in two at the column median.
    """
    parent_entropy = entropy(X, y)
    column = X[:, feature]

    if iscat:
        children_entropy = 0
        for level in np.unique(column):
            members = column == level
            group_y = y[members]
            children_entropy += len(group_y) / len(y) * entropy(X[members], group_y)
        return parent_entropy - children_entropy

    cut = np.median(column)
    at_or_below = column <= cut
    above = column > cut
    low_y = y[at_or_below]
    high_y = y[above]
    weight = len(low_y) / len(y)
    children_entropy = weight * entropy(X[at_or_below], low_y) + (1 - weight) * entropy(X[above], high_y)
    return parent_entropy - children_entropy
def split(X,y,types):
    """Pick the feature with the highest information gain.

    Returns:
        (feature_index, threshold): ``threshold`` is the column median for a
        continuous feature and None for a categorical one. Both are None when
        no feature beat the initial gain of -1.
    """
    top_gain = -1
    chosen = None
    for index, kind in enumerate(types):
        gain = information_gain(X, y, index, kind == 'cat')
        # Strict comparison: on ties the earliest feature wins.
        if gain > top_gain:
            top_gain = gain
            chosen = index

    if chosen is None or types[chosen] == 'cat':
        return chosen, None
    return chosen, np.median(X[:, chosen])

class DTTree:
    """Decision tree over mixed categorical / continuous features.

    Categorical features get one child per value observed at the node;
    continuous features get a binary split at the median of the samples
    reaching the node. The split feature is chosen by the ``split`` helper.
    """

    def __init__(self):
        #Tree root should be DTNode
        self.root = None

    def fit(self, X, y, types, max_depth = 10):
        '''
        Makes decision tree
        Args:
            X: numpy array of data [num_samples, num_features]
            y: numpy array of classes [num_samples, 1]
            types: list of [num_features] with types as: cat, cont
                eg: if num_features = 4, and last 2 features are continuous then
                    types = ['cat','cat','cont','cont']
            max_depth: maximum depth of tree
        Returns:
            None
        '''
        self.root = self.grow_tree(X, y, types, max_depth, 0)

    def grow_tree(self, X, y, types, max_depth, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        A leaf is returned when ``max_depth`` is reached, the labels are pure,
        no feature could be selected, or the selected feature cannot separate
        the samples. Otherwise an internal node with its children is returned.
        """
        # Most frequent label here: the prediction if this node is a leaf, and
        # the label it takes if pruning later collapses it.
        majority = np.bincount(y.flatten()).argmax()
        if depth == max_depth or len(np.unique(y)) == 1:
            return DTNode(depth, is_leaf = True, value = majority)

        best_col, best_split = split(X, y, types)
        if best_col is None:
            print("best col is none", depth)
            return DTNode(depth, is_leaf = True, value = majority)

        column = X[:, best_col]
        values = np.unique(column)
        if values.shape[0] == 1:
            return DTNode(depth, is_leaf = True, value = majority)

        node = DTNode(depth, is_leaf = False, threshold = best_split, column = best_col)
        node.value = majority
        if types[best_col] == "cat":
            node.children = []
            # Remember which value each child serves: deeper nodes usually see
            # only a subset of the categories, so position != category code.
            node.categories = []
            for category in values:
                members = column == category
                if not members.any():
                    # e.g. NaN, which never compares equal to itself.
                    continue
                node.categories.append(category)
                node.children.append(
                    self.grow_tree(X[members], y[members], types, max_depth, depth + 1)
                )
        else:
            left = column <= best_split
            right = column > best_split
            # When the median equals the maximum one side is empty; recursing
            # into it would fail, so stop here instead.
            if not left.any() or not right.any():
                return DTNode(depth, is_leaf = True, value = majority)
            node.children = [
                self.grow_tree(X[left], y[left], types, max_depth, depth + 1),
                self.grow_tree(X[right], y[right], types, max_depth, depth + 1),
            ]
        return node

    def __call__(self, X):
        '''
        Predicted classes for X
        Args:
            X: numpy array of data [num_samples, num_features]
        Returns:
            y: list of num_samples predicted classes
        '''
        return [self.predict(x) for x in X]

    def predict(self, x):
        """Predicted class for one example, found by walking down to a leaf."""
        node = self.root
        while not node.is_leaf:
            node = node.get_children(x)
        return node.value

    def get_nodes(self):
        """All nodes still reachable from the root, in breadth-first order."""
        if not self.root:
            return []
        queue = deque([self.root])
        nodes = []
        while queue:
            curr = queue.popleft()
            nodes.append(curr)
            # Children of a leaf (if any remain after pruning) are unreachable.
            if curr.children and not curr.is_leaf:
                queue.extend(curr.children)
        return nodes

    def post_prune(self, X_val, y_val):
        """Greedy post-pruning driven by validation accuracy.

        Each pass tentatively collapses every internal node into a leaf and
        commits the single collapse that raises validation accuracy the most.
        Passes repeat until no collapse gives a strict improvement. Accuracy
        is always measured on the whole tree, so samples are only affected by
        nodes they actually reach.

        Returns:
            Validation accuracy of the pruned tree, or None when there is no
            tree or no validation data.
        """
        y_true = np.asarray(y_val).flatten()
        if self.root is None or len(y_true) == 0:
            return None

        def val_accuracy():
            return np.mean(np.asarray(self(X_val)) == y_true)

        # Label used for nodes that carry no stored majority class.
        labels, counts = np.unique(y_true, return_counts=True)
        fallback = labels[counts.argmax()]

        def leaf_label(node):
            stored = getattr(node, "value", None)
            return fallback if stored is None else stored

        acc = val_accuracy()
        while True:
            best_gain = 0
            best_node = None
            for node in self.get_nodes():
                if node.is_leaf:
                    continue
                previous = getattr(node, "value", None)
                node.value = leaf_label(node)
                node.is_leaf = True
                gain = val_accuracy() - acc
                node.is_leaf = False
                node.value = previous
                if gain > best_gain:
                    best_gain = gain
                    best_node = node
            if best_node is None:
                break
            best_node.value = leaf_label(best_node)
            best_node.is_leaf = True
            best_node.children = []
            acc = val_accuracy()
        return acc
 

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
# from DecisionTree import DecisionTree

X,y = get_np_array("train.csv")
X_test,y_test = get_np_array("test.csv")
X_val,y_val = get_np_array("val.csv")
# The external `DecisionTree` class is not importable here (its import is
# commented out above), so the tree defined in this notebook is used instead.
clf = DTTree()
clf.fit(X_train, y_train, types, max_depth=5)


def accuracy(y_test, y_pred):
    """Fraction of predictions equal to the labels.

    Both inputs are flattened first: labels arrive as [n, 1] and predictions
    as a flat sequence, and comparing those directly broadcasts to n x n.
    """
    y_true = np.asarray(y_test).ravel()
    y_hat = np.asarray(y_pred).ravel()
    return np.sum(y_true == y_hat) / len(y_true)

y_pred = clf(X_train)
acc = accuracy(y_train, y_pred)
print ("Train Accuracy:", acc)
# y_pred = clf(X_test)
# acc = accuracy(y_test, y_pred)
# print ("Test Accuracy:", acc)
y_pred = clf(X_val)
acc = accuracy(y_val, y_pred)
print ("Val Accuracy:", acc)

In [None]:
for max_depth in [15, 25, 35, 45]:
    print("Max Depth: ",max_depth)
    tree = DTTree()
    tree.fit(X_train,y_train,types, max_depth = max_depth)
    # tree.post_prune(X_val, y_val)
    # Labels are [n, 1] while predictions are a flat list of n values; the
    # labels are flattened so == compares element-wise instead of
    # broadcasting to an n x n matrix.
    y_pred = np.asarray(tree(X_train))
    print("Training Accuracy: ",np.mean(y_train.ravel()==y_pred))
    y_pred = np.asarray(tree(X_test))
    print("Testing Accuracy: ",np.mean(y_test.ravel()==y_pred))
    print()

In [None]:
# Reload all three splits through get_np_array2 for the scikit-learn cells.
# NOTE(review): get_np_array2 is defined in a later cell of this notebook, so
# on a fresh kernel this cell raises NameError unless that cell has been run.
X,y = get_np_array2("train.csv")
X_test,y_test = get_np_array2("test.csv")
X_val,y_val = get_np_array2("val.csv")

In [None]:
y_pred=0
def accuracy(y_test, y_pred):
    """Fraction of entries of ``y_pred`` that equal ``y_test``.

    Both inputs are flattened before comparing, so an [n, 1] label column and
    a flat list of n predictions are matched element-wise rather than being
    broadcast into an n x n comparison. A scalar ``y_pred`` is still compared
    against every label.
    """
    y_true = np.asarray(y_test).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if y_hat.size != 1 and y_hat.shape != y_true.shape:
        raise ValueError(
            "y_test and y_pred hold %d and %d values" % (y_true.size, y_hat.size)
        )
    return np.sum(y_true == y_hat) / len(y_true)
print("Train Accuracy:",accuracy(y_val,y_pred))

# Scikit Part

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Validation accuracy per depth, used to choose the final depth.
d={}
# A list rather than a set: depths are tried and printed in ascending order,
# and a tie in validation accuracy resolves to the shallowest tree.
for depth in [15, 25, 35, 45]:
    tree = DecisionTreeClassifier(max_depth=depth, criterion="entropy")
    tree.fit(X, y)
    print("Depth:", depth)

    # Calculate accuracy for the training set
    y_train_pred = tree.predict(X)
    train_accuracy = accuracy_score(y, y_train_pred)
    print("Train Accuracy:", train_accuracy)

    # Calculate accuracy for the test set
    y_test_pred = tree.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # Calculate accuracy for the validation set (if defined)
    if 'X_val' in locals() and 'y_val' in locals():
        y_val_pred = tree.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        print("Val Accuracy:", val_accuracy)
        d[depth]=val_accuracy
    print()
# Without a validation split nothing was recorded and max() would raise.
final_depth=max(d,key=d.get) if d else None
print("Final Depth:",final_depth)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Validation accuracy per pruning strength, used to choose the final alpha.
d={}
# A list rather than a set: alphas are tried and printed in ascending order,
# and a tie in validation accuracy resolves to the smallest alpha.
for ccp in [0.001, 0.01, 0.1, 0.2]:
    tree = DecisionTreeClassifier( criterion="entropy",ccp_alpha=ccp)
    tree.fit(X, y)
    print("CCP:", ccp)

    # Calculate accuracy for the training set
    y_train_pred = tree.predict(X)
    train_accuracy = accuracy_score(y, y_train_pred)
    print("Train Accuracy:", train_accuracy)

    # Calculate accuracy for the test set
    y_test_pred = tree.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test Accuracy:", test_accuracy)

    # Calculate accuracy for the validation set (if defined)
    if 'X_val' in locals() and 'y_val' in locals():
        y_val_pred = tree.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        print("Val Accuracy:", val_accuracy)
        d[ccp]=val_accuracy
    print()
# Without a validation split nothing was recorded and max() would raise.
final_ccp=max(d,key=d.get) if d else None
print("Final CCP:",final_ccp)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
rf_model = RandomForestClassifier(oob_score=True, random_state=42)

param_grid = {
    'n_estimators': range(50, 351, 100),
    'max_features': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'min_samples_split': range(2, 11, 2)
}

# 3-fold cross-validated search over the grid above, scored by accuracy.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
# The loaders return the target as an [n, 1] column; the forest expects a 1-D
# target and warns on every fit otherwise, so it is flattened here.
grid_search.fit(X, np.ravel(y))
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
y_train_pred = best_model.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)
if X_val is not None and y_val is not None:
    y_val_pred = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
else:
    val_accuracy = None

oob_accuracy = best_model.oob_score_
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Best parameters:", best_params)
print("Training set accuracy:", train_accuracy)
if val_accuracy is not None:
    print("Validation set accuracy:", val_accuracy)
print("Out-of-bag accuracy:", oob_accuracy)
print("Test set accuracy:", test_accuracy)


# Gradient boosted Trees

In [None]:

from sklearn.ensemble import GradientBoostingClassifier

# Load the three splits with get_np_array2 for the gradient-boosting sweep.
# NOTE(review): get_np_array2 is defined in a later cell of this notebook, so
# on a fresh kernel this cell raises NameError unless that cell has been run.
X_train,y_train = get_np_array2('train.csv')
X_test, y_test = get_np_array2("test.csv")
X_val, y_val = get_np_array2("val.csv")

In [None]:
# [depth, min_samples, max_feature] of the best model so far, and its
# validation accuracy.
best_model=[]
best_Acc=0
# Lists rather than sets so the grid is walked in a fixed, ascending order.
for depth in [5, 10, 15, 20]:
    for min_samples in [2, 4, 6, 8]:
        for max_featue in [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
            print("Depth:",depth,"min_samples:",min_samples,"max_feature:",max_featue)
            # The target is flattened because the loaders return an [n, 1]
            # column and the classifier expects a 1-D target.
            clf = GradientBoostingClassifier(n_estimators=350, learning_rate=1.0,
                max_depth=depth,min_samples_split=min_samples ,max_features=max_featue,random_state=56).fit(X_train, np.ravel(y_train))
            # print(clf.score(X_train, y_train))
            # print(clf.score(X_test, y_test))
            val_acc=clf.score(X_val, y_val)
            print(val_acc)
            if(val_acc>best_Acc):
                best_Acc=val_acc
                best_model=[depth,min_samples,max_featue]
            print()
# best_model stays empty when no configuration scored above 0; indexing it
# would raise IndexError.
if best_model:
    print("Best model:\n","Depth: ",best_model[0],"\nmin_samples: ",best_model[1],"\nmax_feature: ",best_model[2],"\nAccuracy: ",best_Acc)
else:
    print("Best model:\n","none selected","\nAccuracy: ",best_Acc)

# Decision tree classifier

In [None]:
def entropy(X,y):
    """Shannon entropy (base 2) of the labels in ``y``.

    ``X`` is accepted for signature symmetry with the other helpers and is
    not used. Empty or single-class label sets have entropy 0.
    """
    n_labels = len(y)
    if not n_labels:
        return 0
    total = 0
    for label in np.unique(y):
        share = np.sum(y == label) / n_labels
        # A class holding all (or none) of the samples means a pure set.
        if share in (0, 1):
            return 0
        total = total - share * np.log2(share)
    return total

def information_gain(X,y,feature,iscat:bool):
    """Weighted entropy of the groups produced by splitting on ``feature``.

    Despite its name, this variant returns the entropy that remains after the
    split rather than the gain, so a lower value means a better split.

    Args:
        X: array [num_samples, num_features].
        y: labels aligned with the rows of ``X``.
        feature: index of the column to split on.
        iscat: True to split into one group per distinct value, False to
            split in two at the column median.
    """
    column = X[:, feature]
    n_samples = len(y)

    if not iscat:
        threshold = np.median(column)
        left = column <= threshold
        right = column > threshold
        left_y = y[left]
        right_y = y[right]
        p = len(left_y) / n_samples
        return p*entropy(X[left], left_y) + (1-p)*entropy(X[right], right_y)

    final_entropy = 0
    for i in np.unique(column):
        members = column == i
        child_y = y[members]
        final_entropy += len(child_y) / n_samples * entropy(X[members], child_y)
    return final_entropy

def split(X,y,types):
    """Pick the feature whose split leaves the lowest weighted entropy.

    ``information_gain`` in this cell returns the post-split entropy, so the
    search minimises it.

    Returns:
        (feature_index, threshold): ``threshold`` is the column median for a
        continuous feature and None for a categorical one. Both are None when
        ``types`` is empty.
    """
    best_feature = None
    # Entropy exceeds 1 bit as soon as there are more than two classes, so
    # the starting bound must be infinite or every feature could be rejected.
    best_score = float("inf")
    for i in range(len(types)):
        score = information_gain(X,y,i,types[i]=='cat')
        if score < best_score:
            best_score = score
            best_feature = i

    if best_feature is None or types[best_feature] == 'cat':
        return best_feature, None
    return best_feature, np.median(X[:, best_feature])

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np



def get_np_array(file_name, label_encoder=None):
    """Load a match CSV and return ``(X, y)`` with ordinal-encoded categories.

    The columns in ``need_label_encoding`` are replaced by integer codes and
    placed first, followed by the remaining feature columns; ``result`` is
    the target.

    Args:
        file_name: path of the CSV file to read.
        label_encoder: optional ``OrdinalEncoder`` to share between splits.
            An unfitted encoder is fitted on this file; an already fitted one
            is reused unchanged, so passing the encoder fitted on the training
            split gives every split the same codes. When omitted a fresh
            encoder is fitted on this file (the original behaviour).

    Returns:
        X: array [num_samples, num_features].
        y: array [num_samples, 1] holding the ``result`` column.
    """
    data = pd.read_csv(file_name)

    need_label_encoding = ['team','host','opp','month', 'day_match']
    if label_encoder is None:
        label_encoder = OrdinalEncoder()
    # scikit-learn encoders only gain `categories_` once fit() has run.
    if not hasattr(label_encoder, "categories_"):
        label_encoder.fit(data[need_label_encoding])
    data_1 = pd.DataFrame(
        label_encoder.transform(data[need_label_encoding]),
        columns = label_encoder.get_feature_names_out(),
    )

    # Append the columns that are used as-is; `result` must stay last because
    # the target is taken from the final column below.
    dont_need_label_encoding =  ["year","toss","bat_first","format" ,"fow","score" ,"rpo" ,"result"]
    data_2 = data[dont_need_label_encoding]
    final_data = pd.concat([data_1, data_2], axis=1)

    X = final_data.iloc[:,:-1]
    y = final_data.iloc[:,-1:]
    return X.to_numpy(), y.to_numpy()

def get_np_array2(file_name, label_encoder=None):
    """Load a match CSV and return ``(X, y)`` with one-hot encoded categories.

    The columns in ``need_label_encoding`` are one-hot encoded and placed
    first, followed by the remaining feature columns; ``result`` is the target.

    Args:
        file_name: path of the CSV file to read.
        label_encoder: optional ``OneHotEncoder`` to share between splits. An
            unfitted encoder is fitted on this file; an already fitted one is
            reused unchanged, so passing the encoder fitted on the training
            split gives every split the same column layout. When omitted a
            fresh encoder is fitted on this file (the original behaviour).

    Returns:
        X: array [num_samples, num_features].
        y: array [num_samples, 1] holding the ``result`` column.
    """
    data = pd.read_csv(file_name)

    need_label_encoding = ['team','host','opp','month', 'day_match']
    if label_encoder is None:
        label_encoder = OneHotEncoder(sparse_output = False)
    # scikit-learn encoders only gain `categories_` once fit() has run.
    if not hasattr(label_encoder, "categories_"):
        label_encoder.fit(data[need_label_encoding])
    data_1 = pd.DataFrame(
        label_encoder.transform(data[need_label_encoding]),
        columns = label_encoder.get_feature_names_out(),
    )

    # Append the columns that are used as-is; `result` must stay last because
    # the target is taken from the final column below.
    dont_need_label_encoding =  ["year","toss","bat_first","format" ,"fow","score" ,"rpo" ,"result"]
    data_2 = data[dont_need_label_encoding]
    final_data = pd.concat([data_1, data_2], axis=1)

    X = final_data.iloc[:,:-1]
    y = final_data.iloc[:,-1:]
    return X.to_numpy(), y.to_numpy()



In [None]:
# Feature kinds, one entry per column of the ordinal-encoded matrix.
types = ['cat','cat','cat',"cat","cat","cont","cat","cat","cat" ,"cont","cont" ,"cont" ]
X,y = get_np_array("train.csv")
# X = X[:,:len(types)]
# Shape check: the column count should equal len(types).
print(X.shape)

In [None]:
np.mean(y==1)

In [None]:

# NOTE(review): assigning the module-level `label_encoder` has no effect on
# get_np_array, which creates its own local encoder on every call.
label_encoder = None 
X_train,y_train = get_np_array('train.csv')
label_encoder = None 
# X_train = X_train[:,:len(types)]
X_test, y_test = get_np_array("test.csv")
# Share of class 0 in each split, as a class-balance check.
print(np.mean(y_train==0))
print(np.mean(y_test==0))

In [None]:
from collections import deque
class DTNode:
    """A single node of the decision tree.

    Internal nodes route a sample to one of ``children`` according to feature
    ``column``; leaves carry the predicted class in ``value``.
    """

    def __init__(self, depth, is_leaf = False, value = 0,threshold=None, column = None):
        """
        Args:
            depth: depth of this node in the tree.
            is_leaf: True when the node predicts ``value`` directly.
            value: predicted class; only kept for leaves.
            threshold: split point of a continuous feature; ``None`` means the
                node splits on a categorical feature.
            column: index of the feature an internal node splits on.
        """
        self.depth = depth
        self.threshold = threshold
        #add children afterwards
        self.children = None
        # Optional list, parallel to `children`, naming the category value each
        # child serves. None selects the positional routing described below.
        self.categories = None

        self.is_leaf = is_leaf
        # Both attributes always exist so that a node can later be converted
        # to a leaf (pruning) without tripping over a missing attribute.
        self.value = value if is_leaf else None
        self.column = None if is_leaf else column

    def get_children(self, X):
        '''
        Args:
            X: A single example np array [num_features]
        Returns:
            child: A DTNode
        '''
        if self.is_leaf:
            return self

        feature_value = X[self.column]

        # The node itself records the split kind (threshold is None for a
        # categorical split), so routing no longer depends on whatever the
        # module-level `types` list happens to hold at prediction time.
        if self.threshold is None:
            if self.categories is not None:
                # Route by category value, not by position.
                for category, child in zip(self.categories, self.children):
                    if category == feature_value:
                        return child
                # Value never seen at this node while fitting.
                return self.children[-1]
            # Positional routing: child i serves category code i.
            if feature_value >= len(self.children):
                return self.children[-1]
            return self.children[int(feature_value)]

        if feature_value <= self.threshold:
            return self.children[0]
        return self.children[1]
        
def entropy(X,y):
    """Shannon entropy (base 2) of the labels in ``y``.

    ``X`` is accepted for signature symmetry with the other helpers and is
    not used. Empty or single-class label sets have entropy 0.
    """
    n_labels = len(y)
    if not n_labels:
        return 0
    total = 0
    for label in np.unique(y):
        share = np.sum(y == label) / n_labels
        # A class holding all (or none) of the samples means a pure set.
        if share in (0, 1):
            return 0
        total = total - share * np.log2(share)
    return total

def information_gain(X,y,feature,iscat:bool):
    """Entropy reduction obtained by splitting ``(X, y)`` on column ``feature``.

    Categorical features are split into one group per distinct value;
    continuous features are split in two at the column median.
    """
    parent_entropy = entropy(X, y)
    column = X[:, feature]

    if iscat:
        children_entropy = 0
        for level in np.unique(column):
            members = column == level
            group_y = y[members]
            children_entropy += len(group_y) / len(y) * entropy(X[members], group_y)
        return parent_entropy - children_entropy

    cut = np.median(column)
    at_or_below = column <= cut
    above = column > cut
    low_y = y[at_or_below]
    high_y = y[above]
    weight = len(low_y) / len(y)
    children_entropy = weight * entropy(X[at_or_below], low_y) + (1 - weight) * entropy(X[above], high_y)
    return parent_entropy - children_entropy
def split(X,y,types):
    """Pick the feature with the highest information gain.

    Returns:
        (feature_index, threshold): ``threshold`` is the column median for a
        continuous feature and None for a categorical one. Both are None when
        no feature beat the initial gain of -1.
    """
    top_gain = -1
    chosen = None
    for index, kind in enumerate(types):
        gain = information_gain(X, y, index, kind == 'cat')
        # Strict comparison: on ties the earliest feature wins.
        if gain > top_gain:
            top_gain = gain
            chosen = index

    if chosen is None or types[chosen] == 'cat':
        return chosen, None
    return chosen, np.median(X[:, chosen])

class DTTree:
    """Decision tree over mixed categorical / continuous features.

    Categorical features get one child per value observed at the node;
    continuous features get a binary split at the median of the samples
    reaching the node. The split feature is chosen by the ``split`` helper.
    """

    def __init__(self):
        #Tree root should be DTNode
        self.root = None

    def fit(self, X, y, types, max_depth = 10):
        '''
        Makes decision tree
        Args:
            X: numpy array of data [num_samples, num_features]
            y: numpy array of classes [num_samples, 1]
            types: list of [num_features] with types as: cat, cont
                eg: if num_features = 4, and last 2 features are continuous then
                    types = ['cat','cat','cont','cont']
            max_depth: maximum depth of tree
        Returns:
            None
        '''
        self.root = self.grow_tree(X, y, types, max_depth, 0)

    def grow_tree(self, X, y, types, max_depth, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        A leaf is returned when ``max_depth`` is reached, the labels are pure,
        no feature could be selected, or the selected feature cannot separate
        the samples. Otherwise an internal node with its children is returned.
        """
        # Most frequent label here: the prediction if this node is a leaf, and
        # the label it takes if pruning later collapses it.
        majority = np.bincount(y.flatten()).argmax()
        if depth == max_depth or len(np.unique(y)) == 1:
            return DTNode(depth, is_leaf = True, value = majority)

        best_col, best_split = split(X, y, types)
        if best_col is None:
            print("best col is none", depth)
            return DTNode(depth, is_leaf = True, value = majority)

        column = X[:, best_col]
        values = np.unique(column)
        if values.shape[0] == 1:
            return DTNode(depth, is_leaf = True, value = majority)

        node = DTNode(depth, is_leaf = False, threshold = best_split, column = best_col)
        node.value = majority
        if types[best_col] == "cat":
            node.children = []
            # Remember which value each child serves: deeper nodes usually see
            # only a subset of the categories, so position != category code.
            node.categories = []
            for category in values:
                members = column == category
                if not members.any():
                    # e.g. NaN, which never compares equal to itself.
                    continue
                node.categories.append(category)
                node.children.append(
                    self.grow_tree(X[members], y[members], types, max_depth, depth + 1)
                )
        else:
            left = column <= best_split
            right = column > best_split
            # When the median equals the maximum one side is empty; recursing
            # into it would fail, so stop here instead.
            if not left.any() or not right.any():
                return DTNode(depth, is_leaf = True, value = majority)
            node.children = [
                self.grow_tree(X[left], y[left], types, max_depth, depth + 1),
                self.grow_tree(X[right], y[right], types, max_depth, depth + 1),
            ]
        return node

    def __call__(self, X):
        '''
        Predicted classes for X
        Args:
            X: numpy array of data [num_samples, num_features]
        Returns:
            y: list of num_samples predicted classes
        '''
        return [self.predict(x) for x in X]

    def predict(self, x):
        """Predicted class for one example, found by walking down to a leaf."""
        node = self.root
        while not node.is_leaf:
            node = node.get_children(x)
        return node.value

    def get_nodes(self):
        """All nodes still reachable from the root, in breadth-first order."""
        if not self.root:
            return []
        bfs = deque([self.root])
        nodes = []
        while bfs:
            curr = bfs.popleft()
            nodes.append(curr)
            if curr.children:
                bfs.extend(curr.children)
        return nodes

    def post_prune(self, X_val, y_val):
        """Greedy post-pruning driven by validation accuracy.

        Each pass tentatively collapses every internal node into a leaf and
        commits the single collapse that raises validation accuracy the most.
        Passes repeat until no collapse gives a strict improvement. The best
        gain and candidate are reset on every pass and the running accuracy
        is re-measured after each commit, so the loop always terminates.

        Returns:
            Validation accuracy of the pruned tree, or None when there is no
            tree or no validation data.
        """
        y_true = np.asarray(y_val).flatten()
        if self.root is None or len(y_true) == 0:
            return None

        def val_accuracy():
            return np.mean(np.asarray(self(X_val)) == y_true)

        # Label used for nodes that carry no stored majority class.
        labels, counts = np.unique(y_true, return_counts=True)
        fallback = labels[counts.argmax()]

        def leaf_label(node):
            stored = getattr(node, "value", None)
            return fallback if stored is None else stored

        acc = val_accuracy()
        while True:
            best_gain = 0
            best_node = None
            for node in self.get_nodes():
                if node.is_leaf:
                    continue
                previous = getattr(node, "value", None)
                node.value = leaf_label(node)
                node.is_leaf = True
                gain = val_accuracy() - acc
                node.is_leaf = False
                node.value = previous
                if gain > best_gain:
                    best_gain = gain
                    best_node = node
            if best_node is None:
                break
            best_node.value = leaf_label(best_node)
            best_node.is_leaf = True
            best_node.children = []
            acc = val_accuracy()
        print("Best Accuracy:",acc)
        return acc
                

 


In [None]:
label_encoder = None 
X_train,y_train = get_np_array('train.csv')
label_encoder = None 
# X_train = X_train[:,:len(types)]
X_test, y_test = get_np_array("test.csv")
# The validation split is loaded here with the same loader as train/test, so
# its columns match the tree being fitted instead of whatever encoding an
# earlier cell left in X_val.
X_val, y_val = get_np_array("val.csv")
print(np.mean(y_train==1))
print(np.mean(y_test==1))
# X_test = X_test[:,:len(types)]
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
types = ['cat','cat','cat',"cat","cat","cont","cat","cat","cat" ,"cont","cont" ,"cont" ]
# Pad with leading 'cat' entries up to the feature count. Computing the
# shortfall once cannot loop forever when `types` is already too long.
n_missing = X_train.shape[1] - len(types)
if n_missing < 0:
    raise ValueError(
        "types has %d entries but X_train has only %d features"
        % (len(types), X_train.shape[1])
    )
types = ['cat'] * n_missing + types
for max_depth in [15,25,35,45]:
    print("Max Depth: ",max_depth)
    tree = DTTree()
    tree.fit(X_train,y_train,types, max_depth = max_depth)
    # tree.post_prune(X_val, y_val)
    y_pred = tree(X_train)
    print("Training Accuracy: ",accuracy_score(y_train, y_pred))
    y_pred = tree(X_test)
    print("Testing Accuracy: ",accuracy_score(y_test, y_pred))
    y_pred = tree(X_val)
    print("Validation Accuracy: ",accuracy_score(y_val, y_pred))
    print()

In [None]:
import matplotlib.pyplot as plt

# Accuracies recorded from the depth sweep, one entry per depth.
depth = [15, 25, 35, 45]
train_accuracy = [0.8243, 0.7892, 0.7443, 0.7089]
test_accuracy = [0.5678, 0.6223, 0.6254, 0.6390]
validation_accuracy = [0.6053, 0.6126, 0.6199, 0.6252]

# One figure with a single set of axes.
fig, ax = plt.subplots(figsize=(8, 6))

# One line per split.
ax.plot(depth, train_accuracy, marker='o', label='Train Accuracy')
ax.plot(depth, test_accuracy, marker='o', label='Test Accuracy')
ax.plot(depth, validation_accuracy, marker='o', label='Validation Accuracy')

# Axis labels, title and legend.
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy vs. Depth')
ax.legend()

# Grid on, then render.
ax.grid()
plt.show()


In [None]:
label_encoder = None 
X_train,y_train = get_np_array2('train.csv')
label_encoder = None 
# X_train = X_train[:,:len(types)]
X_test, y_test = get_np_array2("test.csv")
# X_test = X_test[:,:len(types)]
# print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
types = ['cat','cat','cat',"cat","cat","cont","cat","cat","cat" ,"cont","cont" ,"cont" ]
# One-hot encoding widens the matrix; every extra leading column is
# categorical. Computing the shortfall once cannot loop forever when `types`
# is already longer than the feature count.
n_missing = X_train.shape[1] - len(types)
if n_missing < 0:
    raise ValueError(
        "types has %d entries but X_train has only %d features"
        % (len(types), X_train.shape[1])
    )
types = ['cat'] * n_missing + types
for max_depth in [15, 25, 35, 45]:
    print("Max Depth: ",max_depth)
    tree = DTTree()
    tree.fit(X_train,y_train,types, max_depth = max_depth)
    # print(tree.root)
    y_pred = tree(X_train)
    print("Training Accuracy: ",accuracy_score(y_train, y_pred))
    y_pred = tree(X_test)
    print("Testing Accuracy: ",accuracy_score(y_test, y_pred))
    print()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Table 1: accuracies per max_depth, in the order the sweep printed them.
depth = np.array([25, 35, 45, 15])
train_accuracy = np.array([0.9881, 0.9996, 1.0000, 0.8243])
test_accuracy = np.array([0.6246, 0.6184, 0.6070, 0.6070])
val_accuracy = np.array([0.6126, 0.6299, 0.6322, 0.6448])

# Table 2: accuracies per ccp_alpha, in the order the sweep printed them.
ccp_values = np.array([0.1, 0.01, 0.2, 0.001])
train_accuracy_ccp = np.array([0.5034, 0.5344, 0.5034, 0.6946])
test_accuracy_ccp = np.array([0.4964, 0.5181, 0.4964, 0.6319])
val_accuracy_ccp = np.array([0.4736, 0.5, 0.4736, 0.6402])

# Reorder every series by ascending x value; the permutation is computed once
# per table and applied to each array.
depth_order = np.argsort(depth)
depth_sorted = depth[depth_order]
train_accuracy_sorted = train_accuracy[depth_order]
test_accuracy_sorted = test_accuracy[depth_order]
val_accuracy_sorted = val_accuracy[depth_order]

ccp_order = np.argsort(ccp_values)
ccp_values_sorted = ccp_values[ccp_order]
train_accuracy_ccp_sorted = train_accuracy_ccp[ccp_order]
test_accuracy_ccp_sorted = test_accuracy_ccp[ccp_order]
val_accuracy_ccp_sorted = val_accuracy_ccp[ccp_order]

# Figure 1: accuracies against depth.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(depth_sorted, train_accuracy_sorted, marker='o', label='Train Accuracy')
ax.plot(depth_sorted, test_accuracy_sorted, marker='o', label='Test Accuracy')
ax.plot(depth_sorted, val_accuracy_sorted, marker='o', label='Val Accuracy')
ax.set_xlabel('Depth ')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracies vs Depth ')
ax.legend()
ax.grid(True)
plt.show()

# Figure 2: accuracies against the pruning parameter.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ccp_values_sorted, train_accuracy_ccp_sorted, marker='o', label='Train Accuracy')
ax.plot(ccp_values_sorted, test_accuracy_ccp_sorted, marker='o', label='Test Accuracy')
ax.plot(ccp_values_sorted, val_accuracy_ccp_sorted, marker='o', label='Val Accuracy')
ax.set_xlabel('CCP Value ')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracies vs CCP Value')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Part I: recorded accuracies without one-hot encoding.
depth_part1 = np.array([5, 10, 15, 20, 25])
train_accuracy_part1 = np.array([0.5510, 0.5592, 0.5592, 0.5592, 0.5592])
test_accuracy_part1 = np.array([0.5377, 0.5387, 0.5387, 0.5387, 0.5387])

# Part II: recorded accuracies with one-hot encoding.
depth_part2 = np.array([15, 25, 35, 45])
train_accuracy_part2 = np.array([0.7053, 0.8483, 0.9245, 0.9900])
test_accuracy_part2 = np.array([0.5584, 0.6174, 0.6143, 0.6112])

# Figure for Part I.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(depth_part1, train_accuracy_part1, marker='o', label='Training Accuracy ')
ax.plot(depth_part1, test_accuracy_part1, marker='o', label='Test Accuracy ')
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Depth vs Accuracies (Without One-Hot Encoding)')
ax.legend()
ax.grid(True)
plt.show()

# Figure for Part II.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(depth_part2, train_accuracy_part2, marker='o', label='Training Accuracy')
ax.plot(depth_part2, test_accuracy_part2, marker='o', label='Test Accuracy ')
ax.set_xlabel('Depth')
ax.set_ylabel('Accuracy')
ax.set_title('Depth vs Accuracies (With One-Hot Encoding)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Recorded average F1 scores per hidden-layer size.
layer_size = [1,5,10,50,100]
f1_average_train = [0.31,0.72,0.79,0.83,0.83]
f1_average_test = [0.28,0.69,0.77,0.79,0.80]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(layer_size, f1_average_train, marker='o', label='Training F1 Score')
ax.plot(layer_size, f1_average_test, marker='o', label='Test F1 Score')
ax.set_xlabel('Layer Size')
# Sizes span two orders of magnitude, hence the log axis.
ax.set_xscale('log')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Layer Size')
# Put a tick, labelled with the size itself, at every measured size.
ax.set_xticks(layer_size)
ax.set_xticklabels(layer_size)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Architectures compared, and the x position used for each one.
layers = [[512],[512,256],[512,256,128],[512,256,128,64]]
l_x = [1,2,3,4]
f1_average_train = [0.83,0.79,1.00,1.00]
f1_average_test = [0.81,0.78,0.82,0.83]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(l_x, f1_average_train, marker='o', label='Training F1 Score')
ax.plot(l_x, f1_average_test, marker='o', label='Test F1 Score')
ax.set_xlabel('Layers')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Layers (with Sigmoid)')
# Label each position with the architecture it stands for.
ax.set_xticks(l_x)
ax.set_xticklabels(layers)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Architectures compared, and the x position used for each one.
layers = [[512],[512,256],[512,256,128],[512,256,128,64]]
l_x = [1,2,3,4]
f1_average_train = [0.69,0.85,0.87,0.89]
f1_average_test = [0.69,0.80,0.83,0.82]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(l_x, f1_average_train, marker='o', label='Training F1 Score')
ax.plot(l_x, f1_average_test, marker='o', label='Test F1 Score')
ax.set_xlabel('Layers')
ax.set_ylabel('F1 Score')
ax.set_title(' Adaptive Learning')
# Label each position with the architecture it stands for.
ax.set_xticks(l_x)
ax.set_xticklabels(layers)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Architectures compared, and the x position used for each one.
layers = [[512],[512,256],[512,256,128],[512,256,128,64]]
l_x = [1,2,3,4]
f1_average_train = [0.80,0.85,0.81,0.83]
f1_average_test = [0.78,0.82,0.80,0.82]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(l_x, f1_average_train, marker='o', label='Training F1 Score')
ax.plot(l_x, f1_average_test, marker='o', label='Test F1 Score')
ax.set_xlabel('Layers')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Layers (with ReLu)')
# Label each position with the architecture it stands for.
ax.set_xticks(l_x)
ax.set_xticklabels(layers)
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Architectures compared, and the x position used for each one.
layers = [[512],[512,256],[512,256,128],[512,256,128,64]]
l_x = [1,2,3,4]
f1_average_train = [0.56,0.60,0.62,0.63]
f1_average_test = [0.54,0.59,0.61,0.62]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(l_x, f1_average_train, marker='o', label='Training F1 Score')
ax.plot(l_x, f1_average_test, marker='o', label='Test F1 Score')
ax.set_xlabel('Layers')
ax.set_ylabel('F1 Score')
ax.set_title('with MLP')
# Label each position with the architecture it stands for.
ax.set_xticks(l_x)
ax.set_xticklabels(layers)
ax.legend()
ax.grid(True)
plt.show()