MODEL TRAINING (Random Forest Model)

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier  # Random Forest model
from sklearn.preprocessing import LabelEncoder  # Label encoding for target variable
from sklearn.metrics import accuracy_score, confusion_matrix  # Evaluation metrics
from sklearn.model_selection import train_test_split  # To split the dataset
import matplotlib.pyplot as plt  # Plotting library
import seaborn as sns  # Data visualization library

In [None]:
# Load training data
# A forward-slash relative path resolves on Windows, Linux and macOS alike;
# it is interpreted relative to the notebook's working directory.
df = pd.read_csv('Dataset/training_data.csv')  # Read the training dataset
df.head()  # Preview the first rows to check the column layout

In [None]:
# Encode target variable
# The fitted encoder is kept in `encoder` so the same label -> integer mapping
# can be applied to other data later in the notebook.
encoder = LabelEncoder()
df['prognosis'] = encoder.fit_transform(df['prognosis'])  # Replace the text labels with integer codes

In [None]:
# Drop unnecessary column
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 133'], errors='ignore')
df.head()  # Preview the frame after the drop to verify the change

In [None]:
# Check correlations
# Pairwise Pearson correlation between every pair of columns (pandas' default method, spelled out).
df.corr(method='pearson')

In [None]:
# Check for missing values
# Per-column count of missing entries; a column of zeros means nothing needs imputing.
df.isna().sum()

In [None]:
# Define features and target variable
# Select by column name rather than by position so the target stays correct
# even if the column order of the frame changes.
X = df.drop(columns=['prognosis'])  # Every column except the target is a feature
y = df['prognosis']  # Integer-encoded prognosis is the target

In [None]:
# Split data into training and validation sets
# 20% of the rows are held out for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Initialize lists to store accuracies
max_depth_values = range(1, 21)  # Depths 1 through 20 inclusive
# One entry per depth is collected in each list, in the same order as max_depth_values.
training_accuracy_values, validation_accuracy_values = [], []

In [None]:
# Train RandomForestClassifier with different max_depth values
# Start from empty lists inside this cell so that re-running it does not append
# a second sweep and leave the accuracy lists longer than max_depth_values
# (which would make the plot of accuracy vs. depth fail on mismatched lengths).
training_accuracy_values = []
validation_accuracy_values = []
for max_depth in max_depth_values:
    rfc = RandomForestClassifier(max_depth=max_depth, random_state=0)  # Same seed for every depth so only max_depth varies
    rfc.fit(X_train, y_train)  # Fit on the training split only
    train_acc = accuracy_score(y_train, rfc.predict(X_train))  # Accuracy on the rows the model was fitted on
    val_acc = accuracy_score(y_val, rfc.predict(X_val))  # Accuracy on the held-out validation rows
    training_accuracy_values.append(train_acc)
    validation_accuracy_values.append(val_acc)

In [None]:
# Plot the relationship between max_depth and accuracy
# Both curves share one axes so the gap between training and validation accuracy is visible per depth.
fig, ax = plt.subplots()
ax.plot(max_depth_values, training_accuracy_values, label="Train Accuracy")
ax.plot(max_depth_values, validation_accuracy_values, label="Validation Accuracy")
ax.set_xlabel('max_depth')
ax.set_ylabel('accuracy')
ax.set_title('Relationship between max_depth and accuracy')
ax.legend()
plt.show()

In [None]:
# Load testing data
# A forward-slash relative path resolves on Windows, Linux and macOS alike;
# it is interpreted relative to the notebook's working directory.
test = pd.read_csv('Dataset/test_data.csv')  # Read the testing dataset
test.head()  # Preview the first rows to check the column layout

In [None]:
# Encode target variable in test data
# transform (not fit_transform) reuses the mapping already learned by `encoder`,
# so a given prognosis gets the same integer code here as in the training frame.
test['prognosis'] = encoder.transform(test['prognosis'])

In [None]:
# Define test features and target
# Select by column name rather than by position so the target stays correct
# even if the column order of the test file changes.
testx = test.drop(columns=['prognosis'])  # Every column except the target is a feature
testy = test['prognosis']  # Integer-encoded prognosis is the target

In [None]:
# Predict using the trained model with optimal max_depth
# NOTE: max_depth=10 is a fixed literal here; it is not computed from the accuracy lists.
optimal_rfc = RandomForestClassifier(max_depth=10, random_state=0).fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = optimal_rfc.predict(testx)  # Predicted (encoded) prognosis for every test row

In [None]:
# Evaluate the model
train_accuracy = accuracy_score(y_train, optimal_rfc.predict(X_train))  # Fraction of training rows predicted correctly
test_accuracy = accuracy_score(testy, y_pred)  # Fraction of test rows predicted correctly
print(f"Accuracy on train data by Random Forest Classifier: {train_accuracy * 100:.2f}%")
print(f"Accuracy on test data by Random Forest Classifier: {test_accuracy * 100:.2f}%")

In [None]:
# Plot confusion matrix
cf_matrix = confusion_matrix(testy, y_pred)  # Rows are true classes, columns are predicted classes
plt.figure(figsize=(12, 8))  # Larger canvas so the per-cell annotations stay legible
sns.heatmap(cf_matrix, annot=True, fmt='d')  # Annotate each cell with its integer count
plt.xlabel("Predicted label (encoded)")  # Heatmap columns follow the confusion matrix columns
plt.ylabel("True label (encoded)")  # Heatmap rows follow the confusion matrix rows
plt.title("Confusion Matrix for Random Forest Classifier on Test Data")
plt.show()

Decision Tree Classifier Implemented from Scratch (Without a scikit-learn Model)

In [140]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# Load the dataset
# Relative path, resolved against the notebook's working directory, so the cell
# does not depend on one user's absolute directory layout.
file_path = 'Dataset/training_data.csv'
data = pd.read_csv(file_path)

# Drop the unnecessary column
data = data.drop(columns=['Unnamed: 133'])

# Check for missing values
missing_values = data.isnull().sum().sum()  # Total count of missing entries across all columns
print(f'Missing values: {missing_values}')

# Encode the target variable 'prognosis'
label_encoder = LabelEncoder()
data['prognosis'] = label_encoder.fit_transform(data['prognosis'])

# Prepare the feature matrix (X) and target vector (y)
# .values converts to plain NumPy arrays, which the hand-written tree code below indexes directly.
X = data.drop(columns=['prognosis']).values
y = data['prognosis'].values

print(f'Feature matrix shape: {X.shape}, Target vector shape: {y.shape}')

def gini_impurity(y):
    """Return the Gini impurity of a collection of labels.

    The impurity is 1 minus the sum of squared class proportions: 0.0 when
    every label is the same, approaching 1.0 as labels spread over many
    classes. An empty input yields 1.0 because no class term is subtracted.
    """
    total = len(y)
    _, counts = np.unique(y, return_counts=True)
    remaining = 1.0
    # Subtract each class's squared share of the samples in turn.
    for n_in_class in counts:
        share = n_in_class / total
        remaining -= share ** 2
    return remaining

def split_data(X, y, feature_index, threshold):
    """Partition rows by comparing one feature column against a threshold.

    Returns ``(X_left, X_right, y_left, y_right)`` where the left part holds
    the rows whose feature value is ``<= threshold`` and the right part holds
    the rows whose value is ``> threshold``.
    """
    column = X[:, feature_index]
    goes_left = column <= threshold
    goes_right = column > threshold
    X_left, y_left = X[goes_left], y[goes_left]
    X_right, y_right = X[goes_right], y[goes_right]
    return X_left, X_right, y_left, y_right

def best_split(X, y):
    """Search every feature and every observed value for the lowest-impurity split.

    Each distinct value of each column is tried as a threshold. Candidates
    that leave one side empty are skipped. The score of a candidate is the
    size-weighted average of the Gini impurity of its two sides; the first
    candidate with the strictly lowest score wins.

    Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
    candidate separates the rows into two non-empty groups.
    """
    lowest_impurity = float('inf')
    chosen_feature, chosen_threshold = None, None
    for column in range(X.shape[1]):
        for candidate in np.unique(X[:, column]):
            _, _, labels_left, labels_right = split_data(X, y, column, candidate)
            n_left, n_right = len(labels_left), len(labels_right)
            if n_left == 0 or n_right == 0:
                continue
            impurity_left = gini_impurity(labels_left)
            impurity_right = gini_impurity(labels_right)
            weighted = (n_left * impurity_left + n_right * impurity_right) / len(y)
            # Strict comparison: ties keep the earlier (feature, threshold) pair.
            if weighted < lowest_impurity:
                lowest_impurity = weighted
                chosen_feature, chosen_threshold = column, candidate
    return chosen_feature, chosen_threshold

class DecisionTree:
    """Classification tree grown greedily with Gini-impurity splits.

    Tree representation: an internal node is a tuple
    ``(feature_index, threshold, left_subtree, right_subtree)``; anything that
    is not a tuple is a leaf holding the predicted label. Rows whose feature
    value is ``<= threshold`` follow the left subtree, all others the right.
    """

    def __init__(self, max_depth=None):
        # max_depth=None means depth is unlimited (the depth check never matches).
        self.max_depth = max_depth
        self.tree = None  # Set by fit(); None until the model has been fitted

    def fit(self, X, y, depth=0):
        """Grow the tree from ``X`` / ``y`` and store it in ``self.tree``.

        Args:
            X: 2-D NumPy array of features, one row per sample.
            y: 1-D NumPy array of labels, one per row of ``X``.
            depth: Depth assigned to the root; kept for backward compatibility.

        Returns:
            The fitted tree structure (also stored as ``self.tree``).

        Raises:
            ValueError: If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset")
        # Recursion happens in _grow so self.tree is assigned exactly once,
        # including when the root itself is a leaf.
        self.tree = self._grow(X, y, depth)
        return self.tree

    def _grow(self, X, y, depth):
        """Recursively build and return the subtree for the given samples."""
        is_pure = np.all(y == y[0])
        # Stop on a single sample, on reaching max_depth, or when every label
        # already agrees (further splits could not change any prediction).
        if X.shape[0] <= 1 or depth == self.max_depth or is_pure:
            return self._most_common_label(y)

        feature_index, threshold = best_split(X, y)
        if feature_index is None:
            # No candidate separates the rows into two non-empty groups.
            return self._most_common_label(y)

        X_left, X_right, y_left, y_right = split_data(X, y, feature_index, threshold)
        if len(y_left) == 0 or len(y_right) == 0:
            return self._most_common_label(y)

        left_subtree = self._grow(X_left, y_left, depth + 1)
        right_subtree = self._grow(X_right, y_right, depth + 1)
        return (feature_index, threshold, left_subtree, right_subtree)

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of ``X``.

        Raises:
            RuntimeError: If called before ``fit`` (``self.tree`` is still None).
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict called before fit")
        return np.array([self._predict(inputs, self.tree) for inputs in X])

    def _predict(self, inputs, tree):
        """Walk ``tree`` for a single sample and return the leaf label reached."""
        node = tree
        while isinstance(node, tuple):
            feature_index, threshold, left_subtree, right_subtree = node
            node = left_subtree if inputs[feature_index] <= threshold else right_subtree
        return node

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``; ties go to the smallest label."""
        # np.unique returns labels in sorted order and argmax returns the first
        # maximum, so ties resolve to the smallest label. Unlike np.bincount,
        # this does not require non-negative integer labels.
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

# Split the dataset into training and testing sets
# 20% of the rows are held out; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the decision tree model
tree = DecisionTree(max_depth=10)  # Depth is capped at 10 levels
tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = tree.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)  # Fraction of held-out rows predicted correctly
print(f'Accuracy: {accuracy * 100:.2f}%')

# Perform cross-validation to get the average accuracy
def cross_val_score_custom(model, X, y, cv=5):
    fold_size = len(X) // cv
    scores = []
    for i in range(cv):
        X_val = X[i*fold_size:(i+1)*fold_size]
        y_val = y[i*fold_size:(i+1)*fold_size]
        X_train = np.concatenate([X[:i*fold_size], X[(i+1)*fold_size:]])
        y_train = np.concatenate([y[:i*fold_size], y[(i+1)*fold_size:]])
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        scores.append(accuracy_score(y_val, y_pred))
    return scores

# Evaluate the model using custom cross-validation
# A fresh, unfitted tree is passed in; it is refitted once per fold.
cv_scores = cross_val_score_custom(DecisionTree(max_depth=10), X, y, cv=5)
mean_cv_accuracy = np.mean(cv_scores)  # Average of the per-fold accuracies
print(f'Cross-validation accuracy: {mean_cv_accuracy * 100:.2f}%')



Missing values: 0
Feature matrix shape: (4920, 132), Target vector shape: (4920,)
Accuracy: 21.54%
Cross-validation accuracy: 28.25%
