MODEL TRAINING (Random Forest Model)

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.preprocessing import LabelEncoder  # Label encoding for target variable
from sklearn.metrics import accuracy_score, confusion_matrix  # Evaluation metrics
from sklearn.model_selection import train_test_split  # To split the dataset
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Data visualization library

In [None]:
# Load training data
# A forward slash keeps the relative path valid on Windows, Linux and macOS alike;
# a backslash is only treated as a path separator on Windows.
df = pd.read_csv('Dataset/training_data.csv')  # Read the training dataset
df.head()  # Display the first few rows of the dataset to understand its structure

In [None]:
# Encode target variable
# fit_transform learns the label -> integer mapping from the training labels and
# applies it in one step; the fitted encoder is kept in `encoder` for later reuse.
encoder = LabelEncoder()
df['prognosis'] = encoder.fit_transform(df['prognosis'])

In [None]:
# Drop unnecessary column
# errors='ignore' makes this cell safe to re-run: once the column has been removed,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 133'], errors='ignore')
df.head()  # Display the first few rows after dropping the column to verify the change

In [None]:
# Check correlations
# Pairwise Pearson correlation (the pandas default) between all columns of the frame.
df.corr(method='pearson')

In [None]:
# Check for missing values
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Define features and target variable
# Select by column name rather than by position, so the split between features and
# target stays correct even if the column order of the frame changes.
X = df.drop(columns=['prognosis'])  # Every column except the target is a feature
y = df['prognosis']  # Encoded prognosis label is the target variable

In [None]:
# Split data into training and validation sets
# 20% of the rows are held out for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Initialize lists to store accuracies
max_depth_values = range(1, 21)  # Candidate tree depths 1 through 20
# One accuracy per candidate depth is appended to each of these lists.
training_accuracy_values, validation_accuracy_values = [], []

In [None]:
# Train RandomForestClassifier with different max_depth values
# Empty the result lists first: they are created in a separate cell, so re-running
# only this cell would otherwise append a second sweep and leave the lists longer
# than max_depth_values when they are plotted against it.
training_accuracy_values.clear()
validation_accuracy_values.clear()
for max_depth in max_depth_values:
    rfc = RandomForestClassifier(max_depth=max_depth, random_state=0)  # One forest per candidate depth
    rfc.fit(X_train, y_train)  # Fit the model to the training data to train the classifier
    train_acc = accuracy_score(y_train, rfc.predict(X_train))  # Accuracy on the rows the model was fitted on
    val_acc = accuracy_score(y_val, rfc.predict(X_val))  # Accuracy on the held-out validation rows
    training_accuracy_values.append(train_acc)
    validation_accuracy_values.append(val_acc)

In [None]:
# Plot the relationship between max_depth and accuracy
# Both curves share one axes so the gap between training and validation accuracy
# can be read off directly for each depth.
fig, ax = plt.subplots()
ax.plot(max_depth_values, training_accuracy_values, label="Train Accuracy")
ax.plot(max_depth_values, validation_accuracy_values, label="Validation Accuracy")
ax.set_xlabel('max_depth')
ax.set_ylabel('accuracy')
ax.set_title('Relationship between max_depth and accuracy')
ax.legend()
plt.show()

In [None]:
# Load testing data
# A forward slash keeps the relative path valid on Windows, Linux and macOS alike;
# a backslash is only treated as a path separator on Windows.
test = pd.read_csv('Dataset/test_data.csv')  # Read the testing dataset
test.head()  # Display the first few rows of the testing dataset to understand its structure

In [None]:
# Encode target variable in test data
# transform (not fit_transform) reuses the mapping already learned by `encoder`,
# so a given prognosis label gets the same integer code as in the training frame.
test['prognosis'] = encoder.transform(test['prognosis'])

In [None]:
# Define test features and target
# Select by name instead of by position: the feature frame is built from exactly the
# columns (and column order) of the training features X, and the target is taken
# from the 'prognosis' column wherever it sits in the test file.
testx = test[X.columns]  # Same feature columns, in the same order, as the training data
testy = test['prognosis']  # Encoded prognosis label is the target variable

In [None]:
# Predict using the trained model with optimal max_depth
# max_depth=10 is a fixed choice here; the model is fitted on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
optimal_rfc = RandomForestClassifier(max_depth=10, random_state=0).fit(X_train, y_train)
# Predicted (encoded) prognosis for every row of the test features.
y_pred = optimal_rfc.predict(testx)

In [None]:
# Evaluate the model
# Training accuracy is measured on X_train, the same rows optimal_rfc was fitted on,
# so it is an optimistic figure; test accuracy compares y_pred with the encoded
# labels in testy. Both are printed as percentages with two decimals.
print(f"Accuracy on train data by Random Forest Classifier: {accuracy_score(y_train, optimal_rfc.predict(X_train)) * 100:.2f}%")  # Accuracy on the rows used for fitting
print(f"Accuracy on test data by Random Forest Classifier: {accuracy_score(testy, y_pred) * 100:.2f}%")  # Accuracy on the separate test file

In [None]:
# Plot confusion matrix
# Rows of the matrix are true classes and columns are predicted classes (sklearn's
# convention); counts are drawn as integers in each cell.
cf_matrix = confusion_matrix(testy, y_pred)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cf_matrix, annot=True, fmt='d', ax=ax)
ax.set_title("Confusion Matrix for Random Forest Classifier on Test Data")
plt.show()

Random Forest Classifier Without External Libraries (Python Standard Library Only)

In [1]:
import csv
import random
import math

# Function to load CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV record; each entry is the list of
        that record's fields as strings. The first record (typically the header)
        is returned like any other row.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as file:
        # Blank lines parse to an empty list; skip them so every returned row
        # can be indexed by column.
        return [row for row in csv.reader(file) if row]

# Function to convert string column to float (if applicable)
def str_column_to_float(dataset, column):
    """Convert one column of every row to float, in place.

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column whose string values are converted.

    Raises:
        ValueError: If a value in the column cannot be parsed as a float.
    """
    for record in dataset:
        text = record[column]
        # Surrounding whitespace is removed before parsing.
        record[column] = float(text.strip())

# Function to convert string column to integer (if applicable)
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in order of first appearance in the dataset (the first
    distinct value becomes 0, the next 1, ...), so the mapping is the same on
    every run for the same input.

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column to encode.

    Returns:
        The dict mapping each original value to its integer code.
    """
    class_values = [row[column] for row in dataset]
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a set
    # here would make the codes depend on hash ordering.
    lookup = {value: index for index, value in enumerate(dict.fromkeys(class_values))}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Function to split dataset into training and testing sets
def train_test_split(dataset, split_ratio):
    """Randomly partition a dataset into a training part and a testing part.

    NOTE: this function has the same name as sklearn.model_selection.train_test_split,
    which is imported at the top of this notebook. Once this definition has been
    executed, the name refers to this function, not to the sklearn one.

    Args:
        dataset: List of rows; it is not modified (a shallow copy is consumed).
        split_ratio: Fraction of the rows that go to the training part; the
            training size is int(len(dataset) * split_ratio).

    Returns:
        A (train_set, test_set) tuple. Rows are moved one at a time, at random and
        without replacement, into train_set; whatever is left forms test_set.
    """
    n_train = int(len(dataset) * split_ratio)
    remaining = list(dataset)
    chosen = []
    for _ in range(n_train):
        # Pick a random position among the rows not yet chosen and move that row.
        position = random.randrange(len(remaining))
        chosen.append(remaining.pop(position))
    return chosen, remaining

# Function to calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the prediction equals the actual value.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel with `actual`.

    Returns:
        A float between 0.0 and 100.0. An empty `actual` yields 0.0 rather than a
        division by zero.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0

# Function to evaluate an algorithm using a train/test split
def evaluate_algorithm(dataset, algorithm, split_ratio, *args):
    """Score an algorithm on a single random train/test split.

    Args:
        dataset: List of rows whose last element is the class label.
        algorithm: Callable invoked as algorithm(train_set, test_rows, *args) that
            returns one prediction per test row.
        split_ratio: Fraction of the rows used for training.
        *args: Extra positional arguments forwarded to `algorithm`.

    Returns:
        The accuracy percentage of the predictions on the held-out rows.
    """
    train_set, test_set = train_test_split(dataset, split_ratio)
    actual = [row[-1] for row in test_set]
    # Hand the algorithm copies of the test rows with the label blanked out, so a
    # prediction can never be derived from the true label by accident.
    unlabeled = []
    for row in test_set:
        row_copy = list(row)
        row_copy[-1] = None
        unlabeled.append(row_copy)
    predicted = algorithm(train_set, unlabeled, *args)
    return accuracy_metric(actual, predicted)

# Function to make predictions with a Random Forest classifier
def random_forest(train, test, max_depth, min_size, sample_size, n_trees):
    """Train bagged decision trees and predict a label for every test row.

    NOTE: a second definition of random_forest appears further down in this cell
    and replaces this one when the cell is executed.

    Args:
        train: List of training rows, in the layout build_tree expects.
        test: Rows to classify.
        max_depth: Forwarded to build_tree.
        min_size: Forwarded to build_tree.
        sample_size: Number of rows drawn for each tree. A float in (0, 1] is a
            fraction of len(train) (1.0 means "as many rows as the training set");
            any other value is truncated with int() and used as a row count.
        n_trees: Number of trees to build.

    Returns:
        A list with one predicted label per row of `test`, in order.

    Raises:
        ValueError: If `sample_size` resolves to fewer than one row.
    """
    if isinstance(sample_size, float) and 0.0 < sample_size <= 1.0:
        n_rows = max(1, round(len(train) * sample_size))
    else:
        n_rows = int(sample_size)
    if n_rows < 1:
        raise ValueError('sample_size must select at least one row per tree')

    trees = []
    for _ in range(n_trees):
        # Bootstrap sample: rows are drawn with replacement so that every tree
        # sees a different resampling of the training data.
        sample = random.choices(train, k=n_rows)
        trees.append(build_tree(sample, max_depth, min_size))
    return [bagging_predict(trees, row) for row in test]

# Function to build a decision tree
def build_tree(train, max_depth, min_size):
    """Grow one decision tree from the given training rows.

    Args:
        train: Rows used to grow the tree.
        max_depth: Depth limit handed to split.
        min_size: Minimum group size handed to split.

    Returns:
        The root node dict produced by get_split, after split has expanded it
        in place starting at depth 1.
    """
    tree = get_split(train)
    split(tree, max_depth, min_size, depth=1)
    return tree

# Function to split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = [], []
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Function to calculate the Gini index for a split dataset
def gini_index(groups, classes):
    """Compute the size-weighted Gini impurity of a candidate split.

    Args:
        groups: Iterable of groups, each a list of rows whose last element is the
            class label.
        classes: The class labels to score.

    Returns:
        The sum over non-empty groups of (1 - sum of squared class proportions),
        each weighted by the group's share of all rows. 0.0 means every group is
        pure.
    """
    n_instances = float(sum(len(group) for group in groups))
    gini = 0.0
    for group in groups:
        size = float(len(group))
        if size == 0:
            # An empty group contributes nothing (and would divide by zero).
            continue
        # Count every label once per group instead of rebuilding and scanning the
        # label list for each class.
        counts = {}
        for row in group:
            counts[row[-1]] = counts.get(row[-1], 0) + 1
        score = 0.0
        for class_val in classes:
            p = counts.get(class_val, 0) / size
            score += p * p
        gini += (1.0 - score) * (size / n_instances)
    return gini

# Function to select the best split point for a dataset
def get_split(dataset):
    """Find the (column, threshold) pair with the lowest Gini index for a dataset.

    Every column except the last (the class label) is tried, with each distinct
    value occurring in that column as a threshold.

    Args:
        dataset: Non-empty list of rows whose last element is the class label.

    Returns:
        A dict with keys 'index' (chosen column), 'value' (chosen threshold) and
        'groups' (the (left, right) row lists produced by that split).
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        # Try each distinct value once, in order of first appearance. A repeated
        # value yields the same split and the same Gini score, so it can never win
        # the strict '<' comparison below; skipping repeats leaves the chosen split
        # unchanged while avoiding one full pass over the data per duplicate.
        for value in dict.fromkeys(row[index] for row in dataset):
            groups = test_split(index, value, dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, value, gini, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}

# Function to create a terminal node value
def to_terminal(group):
    """Return the most frequent class label among the rows of a group.

    Args:
        group: Non-empty list of rows whose last element is the class label.

    Returns:
        The label with the highest count in the group.
    """
    labels = [row[-1] for row in group]
    candidates = set(labels)
    return max(candidates, key=labels.count)

# Function to create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    """Expand a node in place into child nodes or terminal labels.

    Args:
        node: Dict with a 'groups' entry holding the (left, right) row lists of the
            node's split; the entry is removed and 'left'/'right' are filled in.
        max_depth: Depth at which both children become terminal labels.
        min_size: A side with at most this many rows becomes a terminal label.
        depth: Depth of `node` (the root is expanded with depth 1).
    """
    left, right = node['groups']
    del node['groups']

    # One side is empty: the split separated nothing, so both children share the
    # majority label of all rows.
    if not (left and right):
        leaf = to_terminal(left + right)
        node['left'] = leaf
        node['right'] = leaf
        return

    # Depth limit reached: stop growing and label both sides.
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return

    # Otherwise handle the left side, then the right side: small sides become
    # labels, larger ones are split again one level deeper.
    for side, rows in (('left', left), ('right', right)):
        if len(rows) <= min_size:
            node[side] = to_terminal(rows)
        else:
            child = get_split(rows)
            node[side] = child
            split(child, max_depth, min_size, depth + 1)

# Function to make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """Predict a label for one row by majority vote over a list of trees.

    Args:
        trees: Non-empty list of decision-tree root nodes.
        row: The row to classify.

    Returns:
        The label predicted by the largest number of trees.
    """
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    return max(set(votes), key=votes.count)

# Function to classify a given row using a decision tree
def predict(node, row):
    """Classify a row by walking a decision tree from the given node.

    Args:
        node: Tree node dict with 'index', 'value', 'left' and 'right' entries;
            a child is either another node dict or a terminal label.
        row: The row to classify.

    Returns:
        The terminal label reached by going left whenever the row's value in the
        node's column is strictly less than the node's threshold, right otherwise.
    """
    while True:
        branch = 'left' if row[node['index']] < node['value'] else 'right'
        child = node[branch]
        if not isinstance(child, dict):
            # Reached a terminal label.
            return child
        node = child

def str_column_to_float(dataset, column):
    """Convert the numeric strings of one column to float, in place.

    Values that are not strings, or that do not parse as a finite number, are left
    unchanged. Surrounding whitespace, a leading sign and exponent notation are
    all accepted (e.g. ' 2 ', '-3', '1e2').

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column to convert.
    """
    for row in dataset:
        value = row[column]
        if not isinstance(value, str):
            continue
        try:
            number = float(value.strip())
        except ValueError:
            # Not a number (e.g. a text label or an empty field): keep the string.
            continue
        # float() also accepts 'nan' and 'inf'; keep those as text so ordinary
        # words are never turned into non-finite numbers.
        if math.isfinite(number):
            row[column] = number

# Function to convert string column to integer (if applicable)
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes follow the order in which distinct values first appear in the dataset
    (0 for the first distinct value, 1 for the next, ...), which makes the
    encoding reproducible from run to run.

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column to encode.

    Returns:
        The dict mapping each original value to its integer code.
    """
    class_values = [row[column] for row in dataset]
    # A set would de-duplicate too, but its iteration order depends on hashing;
    # dict.fromkeys keeps first-seen order instead.
    distinct = dict.fromkeys(class_values)
    lookup = {value: index for index, value in enumerate(distinct)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

def random_forest(train, test, max_depth, min_size, sample_size, n_trees):
    """Train bagged decision trees and predict a label for every test row.

    Args:
        train: List of training rows, in the layout build_tree expects.
        test: Rows to classify.
        max_depth: Forwarded to build_tree.
        min_size: Forwarded to build_tree.
        sample_size: Number of rows drawn for each tree. A float in (0, 1] is a
            fraction of len(train) (1.0 means "as many rows as the training set");
            any other value is truncated with int() and used as a row count.
        n_trees: Number of trees to build.

    Returns:
        A list with one predicted label per row of `test`, in order.

    Raises:
        ValueError: If `sample_size` resolves to fewer than one row.
    """
    # A fractional sample_size is relative to the training set; truncating it with
    # int() would turn 1.0 into a single row per tree.
    if isinstance(sample_size, float) and 0.0 < sample_size <= 1.0:
        n_rows = max(1, round(len(train) * sample_size))
    else:
        n_rows = int(sample_size)
    if n_rows < 1:
        raise ValueError('sample_size must select at least one row per tree')

    trees = []
    for _ in range(n_trees):
        # Bootstrap sample: rows are drawn with replacement so that every tree
        # sees a different resampling of the training data.
        sample = random.choices(train, k=n_rows)
        trees.append(build_tree(sample, max_depth, min_size))
    return [bagging_predict(trees, row) for row in test]

# Main code to run the random forest classifier without using external libraries
if __name__ == '__main__':
    # Seed the generator so the train/test split and the per-tree samples are the
    # same on every run.
    random.seed(0)

    # Load dataset
    filename = 'Dataset/training_data.csv'
    rows = load_csv(filename)

    # The first CSV record holds the column names, not a sample: keep it out of the
    # data. Blank records (if any) are dropped as well.
    header = rows[0] if rows else []
    dataset = [row for row in rows[1:] if row]
    if not dataset:
        raise ValueError('No data rows found in %s' % filename)

    # The pandas part of this notebook drops an 'Unnamed: 133' column from this same
    # file, which points to a trailing empty column. If the last field of every row
    # is blank, remove it so that the last column really is the class label.
    while all(row and row[-1].strip() == '' for row in dataset):
        dataset = [row[:-1] for row in dataset]

    # Convert the feature columns (everything but the last) to numbers
    target_column = len(dataset[0]) - 1
    for i in range(target_column):
        str_column_to_float(dataset, i)

    # Convert class column to integers
    str_column_to_int(dataset, target_column)  # Last column is the target variable

    # Evaluate algorithm
    split_ratio = 0.8   # Fraction of the rows used for training
    max_depth = 10      # Depth limit of each tree
    min_size = 1        # Groups of at most this many rows become terminal nodes
    sample_size = 1.0   # Forwarded to random_forest as its sample_size argument
    n_trees = 10        # Number of trees in the forest
    accuracy = evaluate_algorithm(dataset, random_forest, split_ratio, max_depth, min_size, sample_size, n_trees)
    print('Accuracy: %.2f%%' % accuracy)


Accuracy: 100.00%
