In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time
import matplotlib.pyplot as plt

In [None]:
# Read the feature table and the label table from disk.
# Only the "label" column of the label file is kept (as a Series).
features = pd.read_csv("Credit_card.csv")
labels = pd.read_csv("Credit_card_label.csv")["label"]

# Preview the first ten feature rows
features.head(10)

In [None]:
# The data contains many NaN entries, which have to be replaced before the
# data can be worked with. -1 is used as the "no value given" marker.
features = features.fillna(value=-1)

In [None]:
# Inspect every column of `features` and record what it contains.
# value_ranges maps column name -> (min, max) for numeric columns,
# or -> list of distinct values for non-numeric columns.
value_ranges = {}

for column in features.columns:
    column_data = features[column]
    print(column)
    # Decide by dtype instead of sampling one row: a label lookup such as
    # column_data[18] raises KeyError when that label is absent, and after the
    # -1 fill a text column can hold both strings and -1, which makes
    # max()/min() over its values raise TypeError.
    if pd.api.types.is_numeric_dtype(column_data):
        col_min, col_max = column_data.min(), column_data.max()
        value_ranges[column] = (col_min, col_max)
        print(col_max, col_min)
    else:
        distinct_values = column_data.unique()
        value_ranges[column] = list(distinct_values)
        print(distinct_values)

In [None]:
# Map every non-numeric column to integer codes with pandas.factorize:
# https://pandas.pydata.org/docs/reference/api/pandas.factorize.html
def _to_integer_codes(col):
    """Return the integer codes pd.factorize assigns to the values of `col`."""
    codes, _uniques = pd.factorize(col)
    return codes


non_numeric = features.select_dtypes(exclude='number')
features[non_numeric.columns] = non_numeric.apply(_to_integer_codes)
features

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# partition identical between runs.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Decision Tree Classifier: fit a single tree as the baseline, time the fit
# and the prediction, and report accuracy plus a per-class report.
# perf_counter is used for the timings because it is a monotonic,
# high-resolution clock meant for measuring short intervals.
start_train = time.perf_counter()
# Seeded: the tree picks at random between equally good splits, so without a
# fixed random_state the reported numbers change from run to run.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
train_time = time.perf_counter() - start_train
print(f"Decision Tree Classifier training time: {train_time:.4f} seconds")

# Make predictions on the test set
start_test = time.perf_counter()
dt_predictions = dt_classifier.predict(X_test)
test_time = time.perf_counter() - start_test
print(f"Decision Tree Classifier test time: {test_time:.4f} seconds")

# Evaluate Decision Tree Classifier
dt_acc = accuracy_score(y_test, dt_predictions)
print(f"Single Decision Tree Accuracy: {dt_acc:.4f}")
print("Classification Report:\n", classification_report(y_test, dt_predictions))

In [None]:
# Random Forest Classifier: fit a 100-tree forest, time the fit and the
# prediction, and report accuracy plus a per-class report.
# perf_counter is used for the timings because it is a monotonic,
# high-resolution clock meant for measuring short intervals.
start_train = time.perf_counter()
# Seeded: bootstrapping and feature sub-sampling are random, so without a
# fixed random_state the forest (and every score derived from it) changes
# from run to run.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
train_time = time.perf_counter() - start_train
print(f"Random Forest Classifier training time: {train_time:.4f} seconds")

# Make predictions on the test set
start_test = time.perf_counter()
rf_predictions = rf_classifier.predict(X_test)
test_time = time.perf_counter() - start_test
print(f"Random Forest Classifier test time: {test_time:.4f} seconds")

# Evaluate Random Forest Classifier
rf_acc = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Classifier Accuracy: {rf_acc:.4f}")
print("Classification Report:\n", classification_report(y_test, rf_predictions))

In [None]:
# Get each fitted tree from the RF classifier
individual_trees = rf_classifier.estimators_

# The forest trains its sub-estimators on a plain ndarray, with the labels
# encoded as positions in rf_classifier.classes_. So:
#   * pass an ndarray, otherwise every tree warns that X has feature names it
#     was not fitted with;
#   * translate each tree's output back to the original labels before scoring
#     (a no-op only when the labels already happen to be 0..k-1).
X_test_values = X_test.to_numpy()
forest_classes = rf_classifier.classes_

# Evaluate each tree on the test set and store the accuracies in a list
tree_accs = []
for tree in individual_trees:
    encoded_pred = tree.predict(X_test_values).astype(int)
    tree_pred = forest_classes[encoded_pred]
    tree_accs.append(accuracy_score(y_test, tree_pred))

# Accuracies ordered best-first (this holds scores, not tree objects)
sorted_trees = sorted(tree_accs, reverse=True)

In [None]:
# First ten entries of the descending-sorted accuracies, i.e. the best trees
sorted_trees[:10]

In [None]:
# Last ten entries of the descending-sorted accuracies, i.e. the worst trees
sorted_trees[-10:]

In [None]:
# Histogram of the per-tree accuracies, using 10 bins
fig, ax = plt.subplots()
ax.hist(sorted_trees, edgecolor='black', bins=10)

ax.set_xlabel('Accuracy Score')
ax.set_ylabel('Num Trees')
ax.set_title('Accuracy Score for all trees in the Random Forest')
plt.show()

In [None]:
# Sweep random-forest size and depth, recording fit time, predict time and
# test accuracy for every combination. `df` ends up with one row per run.
columns = ["N_estimators", "Max_Depth", "Train_Time", "Test_Time", "Accuracy"]

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto an initially empty frame inside the loop is quadratic and
# leaves the columns with object dtype.
records = []
for x in [10, 20, 50, 100, 200, 400]:
    for z in [10, 20]:
        # Random forest with x trees of maximum depth z. Kept under its own
        # names so the single decision tree's dt_* results are not overwritten.
        # Seeded so that the sweep is repeatable.
        start_train = time.perf_counter()
        grid_rf = RandomForestClassifier(n_estimators=x, max_depth=z, random_state=42)
        grid_rf.fit(X_train, y_train)
        train_time = time.perf_counter() - start_train

        # Make predictions on the test set
        start_test = time.perf_counter()
        grid_predictions = grid_rf.predict(X_test)
        test_time = time.perf_counter() - start_test

        # Evaluate this forest
        grid_acc = accuracy_score(y_test, grid_predictions)

        records.append([x, z, train_time, test_time, grid_acc])

df = pd.DataFrame(records, columns=columns)


In [None]:
# Display the results table: one row per (N_estimators, Max_Depth) run
df

In [None]:
# Write the results table to results.csv, leaving out the row index
df.to_csv('results.csv', index=False)