MODEL TRAINING(Random Forest Model)

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.preprocessing import LabelEncoder  # Label encoding for target variable
from sklearn.metrics import accuracy_score, confusion_matrix  # Evaluation metrics
from sklearn.model_selection import train_test_split  # To split the dataset
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Data visualization library

In [None]:
# Load training data
df = pd.read_csv('Dataset\\training_data.csv')  # Read the training dataset
df.head()  # Display the first few rows of the dataset to understand its structure

In [None]:
# Encode target variable
encoder = LabelEncoder()  # Initialize the label encoder
df['prognosis'] = encoder.fit_transform(df.prognosis)  # Encode the 'prognosis' column to convert categorical labels to numeric

In [None]:
# Drop unnecessary column
df = df.drop(['Unnamed: 133'], axis=1)  # Drop the 'Unnamed: 133' column, which is not needed for the analysis
df.head()  # Display the first few rows of the dataset after dropping the column to verify changes

In [None]:
# Check correlations
df.corr()  # Calculate and display the correlation matrix to understand relationships between variables

In [None]:
# Check for missing values
df.isnull().sum()  # Check for missing values in each column to ensure data quality

In [None]:
# Define features and target variable
X = df.iloc[:, :-1]  # Select all columns except the last one as features
y = df.iloc[:, -1]  # Select the last column as the target variable

In [None]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Initialize lists to store accuracies
max_depth_values = range(1, 21)
training_accuracy_values = []
validation_accuracy_values = []

In [None]:
# Train RandomForestClassifier with different max_depth values
for max_depth in max_depth_values:
    rfc = RandomForestClassifier(max_depth=max_depth, random_state=0)  # Initialize the Random Forest Classifier with max_depth
    rfc.fit(X_train, y_train)  # Fit the model to the training data to train the classifier
    train_acc = accuracy_score(y_train, rfc.predict(X_train))  # Calculate training accuracy
    val_acc = accuracy_score(y_val, rfc.predict(X_val))  # Calculate validation accuracy
    training_accuracy_values.append(train_acc)
    validation_accuracy_values.append(val_acc)

In [None]:
# Plot the relationship between max_depth and accuracy
plt.plot(max_depth_values, training_accuracy_values, label="Train Accuracy")
plt.plot(max_depth_values, validation_accuracy_values, label="Validation Accuracy")
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.title('Relationship between max_depth and accuracy')
plt.legend()
plt.show()

In [None]:
# Load testing data
test = pd.read_csv('Dataset\\test_data.csv')  # Read the testing dataset
test.head()  # Display the first few rows of the testing dataset to understand its structure

In [None]:
# Encode target variable in test data
test['prognosis'] = encoder.transform(test.prognosis)  # Encode the 'prognosis' column in the test data to match training data encoding

In [None]:
# Define test features and target
testx = test.iloc[:, :-1]  # Select all columns except the last one as features in the test data
testy = test.iloc[:, -1]  # Select the last column as the target variable in the test data

In [None]:
# Predict using the trained model with optimal max_depth
optimal_rfc = RandomForestClassifier(max_depth=10, random_state=0)  # Using max_depth=10
optimal_rfc.fit(X_train, y_train)
y_pred = optimal_rfc.predict(testx)  # Predict the target variable for the test data using the trained model

In [None]:
# Evaluate the model
print(f"Accuracy on train data by Random Forest Classifier: {accuracy_score(y_train, optimal_rfc.predict(X_train)) * 100:.2f}%")  # Print the accuracy on the training data to evaluate model performance
print(f"Accuracy on test data by Random Forest Classifier: {accuracy_score(testy, y_pred) * 100:.2f}%")  # Print the accuracy on the test data to evaluate model performance

In [None]:
# Plot confusion matrix
cf_matrix = confusion_matrix(testy, y_pred)  # Compute the confusion matrix to understand the performance of the classifier
plt.figure(figsize=(12, 8))  # Set the figure size for the plot
sns.heatmap(cf_matrix, annot=True, fmt='d')  # Plot the confusion matrix with annotations to visualize the results
plt.title("Confusion Matrix for Random Forest Classifier on Test Data")  # Set the title of the plot for clarity
plt.show()  # Display the plot to visualize the confusion matrix

Random Forest Classifier Without Python Packages

In [3]:
import math
import random
from collections import Counter

# Function to calculate the Gini index for a dataset
def calculate_gini_index(df):
    total_instances = len(df)
    if total_instances == 0:
        return 0

    label_counts = Counter(df['label'])
    gini = 1.0
    for count in label_counts.values():
        probability = count / total_instances
        gini -= probability ** 2
    
    return gini

# Function to split a dataset based on a feature and value
def split_dataset(df, feature, value):
    left_split = df[df[feature] < value]
    right_split = df[df[feature] >= value]
    return left_split, right_split

# Function to calculate the best split point for a dataset
def get_best_split(df, sample_size):
    # Ensure sample_size is an integer
    sample_size = int(sample_size)
    
    # Get a random sample of features
    features = random.sample(df.columns[:-1].tolist(), sample_size)
    
    best_gini = float('inf')
    best_feature = None
    best_value = None

    for feature in features:
        values = df[feature].unique()
        for value in values:
            left_split, right_split = split_dataset(df, feature, value)
            gini = (len(left_split) / len(df)) * calculate_gini_index(left_split) + \
                   (len(right_split) / len(df)) * calculate_gini_index(right_split)
            if gini < best_gini:
                best_gini, best_feature, best_value = gini, feature, value
    
    return best_feature, best_value

# Tree node class
class TreeNode:
    def __init__(self, feature=None, value=None, left=None, right=None, prediction=None):
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.prediction = prediction

# Function to build a decision tree
def build_tree(df, max_depth, min_size, sample_size, depth=0):
    labels = df['label'].tolist()
    if len(set(labels)) == 1:
        return TreeNode(prediction=labels[0])
    if depth >= max_depth or len(df) <= min_size:
        most_common_label = Counter(labels).most_common(1)[0][0]
        return TreeNode(prediction=most_common_label)

    feature, value = get_best_split(df, sample_size)
    if feature is None:
        most_common_label = Counter(labels).most_common(1)[0][0]
        return TreeNode(prediction=most_common_label)

    left_split, right_split = split_dataset(df, feature, value)
    left_subtree = build_tree(left_split, max_depth, min_size, sample_size, depth + 1)
    right_subtree = build_tree(right_split, max_depth, min_size, sample_size, depth + 1)

    return TreeNode(feature=feature, value=value, left=left_subtree, right=right_subtree)

# Function to make predictions with a tree
def predict(tree, instance):
    if tree is None or tree.prediction is not None:
        return tree.prediction
    if instance[tree.feature] < tree.value:
        return predict(tree.left, instance)
    else:
        return predict(tree.right, instance)

# Function to make predictions with a forest
def bagging_predict(trees, instance):
    predictions = [predict(tree, instance) for tree in trees]
    return Counter(predictions).most_common(1)[0][0]

# Function to build a random forest
def build_forest(df, n_trees, max_depth, min_size, sample_size):
    trees = []
    for _ in range(n_trees):
        sample = df.sample(frac=sample_size, replace=True)
        tree = build_tree(sample, max_depth, min_size, sample_size)
        trees.append(tree)
    return trees

# Function to make predictions with a forest for a dataframe
def predict_forest(forest, df):
    predictions = df.apply(lambda row: bagging_predict(forest, row), axis=1)
    return predictions

# Main code
if __name__ == '__main__':
    import pandas as pd

    # Example: Load training and test data
    # Replace with your actual dataset loading code
    X_train = pd.DataFrame({'feature1': [1, 0, 1, 0, 1],
                            'feature2': [1, 0, 1, 1, 0],
                            'label': [1, 0, 1, 0, 1]})

    y_train = X_train['label']
    X_train.drop(columns=['label'], inplace=True)

    X_test = pd.DataFrame({'feature1': [1, 0, 1, 0, 1],
                           'feature2': [1, 0, 1, 1, 0]})
    
    y_test = pd.Series([1, 0, 1, 0, 1])

    # Rename the target column to 'label'
    X_train['label'] = y_train

    # Build the random forest
    n_trees = 10
    max_depth = 5
    min_size = 1
    sample_size = 0.5
    forest = build_forest(X_train, n_trees, max_depth, min_size, sample_size)

    # Make predictions
    predictions = predict_forest(forest, X_test)

    # Calculate accuracy
    accuracy = (predictions == y_test).mean()
    print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 60.00%
