In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV




In [3]:
def load_data(file_path):
    data = np.loadtxt(file_path)
    X = data[:, :-1]  # All columns except the last one
    Y = data[:, -1]   # The last column
    return X, Y

X_train, Y_train = load_data('pa2train.txt')
X_val, Y_val = load_data('pa2validation.txt')
X_test, Y_test = load_data('pa2test.txt')

# Load feature names
with open('pa2features.txt', 'r') as f:
    feature_names = f.read().splitlines()





In [4]:

def bagging(X, Y, size):
    # Create random indices for data sampling
    indices = np.random.randint(0, len(X), size=size)
    return X[indices], Y[indices]


In [9]:
def build_random_forest(X, Y, num_trees, num_features):
    forest = []
    for _ in range(num_trees):
        # Select random features
        features = np.random.choice(X.shape[1], num_features, replace=False)
        X_subset = X[:, features]
        
        # Bagging
        X_bag, Y_bag = bagging(X_subset, Y, size=len(X))
        
        # Train a decision tree on the bagged data and random features
        tree = DecisionTreeClassifier(criterion='gini')
        tree.fit(X_bag, Y_bag)
        
        # Store the tree and its corresponding features
        forest.append((tree, features))
    return forest


In [14]:

# Define the parameter values to be searched
param_grid = {'max_depth': list(range(1, 11)), 'min_samples_split': list(range(2, 11))}

grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'), param_grid, cv=10, scoring='accuracy')

grid.fit(X_val, Y_val)

print(grid.cv_results_)

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)


{'mean_fit_time': array([0.00220082, 0.00219777, 0.00229936, 0.00195158, 0.00199845,
       0.00199797, 0.0019002 , 0.00199969, 0.00189939, 0.00315235,
       0.00369914, 0.00355647, 0.00299959, 0.00309734, 0.00312719,
       0.00309863, 0.00301843, 0.00300031, 0.00405231, 0.00410061,
       0.00415425, 0.00419924, 0.00410032, 0.00420103, 0.0042516 ,
       0.00439925, 0.00430143, 0.00529847, 0.00514834, 0.00505407,
       0.00500369, 0.00560853, 0.0062958 , 0.00500271, 0.00505433,
       0.00505493, 0.00599604, 0.0061496 , 0.00611625, 0.00600448,
       0.00624807, 0.00615244, 0.00635128, 0.0060992 , 0.00605233,
       0.00703797, 0.00703769, 0.00720272, 0.00725398, 0.00714405,
       0.00695469, 0.00704803, 0.00699849, 0.00704875, 0.00814013,
       0.00805454, 0.00779297, 0.00814881, 0.00794356, 0.00795448,
       0.00789642, 0.0080476 , 0.00795269, 0.00874124, 0.00885196,
       0.00864861, 0.00865529, 0.00873871, 0.00875413, 0.00873971,
       0.00880129, 0.00884979, 0.00945814, 0

In [15]:

# Build the random forest
forest = build_random_forest(X_train, Y_train, num_trees=10, num_features=10)


forest_predictions = []
for tree, features in forest:
    X_test_subset = X_test[:, features]
    predictions = tree.predict(X_test_subset)
    forest_predictions.append(predictions)

# Average Predictions
forest_predictions = np.mean(forest_predictions, axis=0) > 0.5

tree = DecisionTreeClassifier(criterion='gini')
tree.fit(X_train, Y_train)

tree_predictions = tree.predict(X_test)

# Compare the performance
forest_accuracy = accuracy_score(Y_test, forest_predictions)
tree_accuracy = accuracy_score(Y_test, tree_predictions)

print(f"Random Forest Accuracy: {forest_accuracy}")
print(f"Decision Tree Accuracy: {tree_accuracy}")


Random Forest Accuracy: 0.871
Decision Tree Accuracy: 0.827


In [17]:

# Make predictions with the random forest
forest_predictions = []
for tree, features in forest:
    X_test_subset = X_test[:, features]
    predictions = tree.predict(X_test_subset)
    forest_predictions.append(predictions)

forest_predictions = np.mean(forest_predictions, axis=0) > 0.5

tree_predictions = grid.best_estimator_.predict(X_test)

# Compare the performance
forest_accuracy = accuracy_score(Y_test, forest_predictions)
tree_accuracy = accuracy_score(Y_test, tree_predictions)

print(f"Random Forest Accuracy: {forest_accuracy}")
print(f"Decision Tree Accuracy: {tree_accuracy}")


Random Forest Accuracy: 0.871
Decision Tree Accuracy: 0.897
