In [408]:
%cd "/content/drive/MyDrive/AI_Workshops/Final Assesment/"

/content/drive/MyDrive/AI_Workshops/Final Assesment


In [409]:
import pandas as pd
import numpy as np

In [410]:
dataset = pd.read_csv('Datasets/parkinsons.data')
dataset = dataset.drop(columns=['subject#','total_UPDRS'])
# correlation_matrix = dataset.corr()

# correlation_with_target = correlation_matrix['motor_UPDRS'].sort_values(ascending=False)
# correlation_with_target
print("Dataset Shape:", dataset.shape)
dataset['Jitter'] = dataset[['Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP']].mean(axis=1)
dataset['Shimmer'] = dataset[['Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:APQ11', 'Shimmer:DDA']].mean(axis=1)

# Optional: You can also create a "total" feature by summing or averaging Jitter and Shimmer
dataset['Total_Jitter_Shim'] = dataset[['Jitter', 'Shimmer']].mean(axis=1)  # You could also use sum(axis=1)

# Drop the individual jitter and shimmer columns if you don't need them anymore
dataset = dataset.drop(columns=['Jitter(%)', 'Jitter(Abs)', 'Jitter:RAP', 'Jitter:PPQ5', 'Jitter:DDP',
                                'Shimmer', 'Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'Shimmer:APQ11', 'Shimmer:DDA'])
dataset.dtypes

max_value = dataset['motor_UPDRS'].max()
min_value = dataset['motor_UPDRS'].min()

print(max_value,min_value)

Dataset Shape: (5875, 20)
39.511 5.0377


In [411]:
class Node():
    def __init__(self, threshold=None, left=None, right=None, var_red=None, value=None, feature_index=None):
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        self.feature_index = feature_index
        self.value = value

class DecisionTreeRegressorCustom():
    def __init__(self, max_depth=2, min_samples_split=2):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, X, Y, current_depth=0):
        num_samples, num_features = X.shape
        best_split = {}

        if num_samples >= self.min_samples_split and current_depth <= self.max_depth:
            best_split = self.get_best_split(X, Y, num_samples, num_features)

            if best_split["var_red"] > 0:
                left_subtree = self.build_tree(best_split["dataset_left"][0], best_split["dataset_left"][1], current_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"][0], best_split["dataset_right"][1], current_depth + 1)

                return Node(feature_index=best_split["feature_index"], threshold=best_split["threshold"],
                            left=left_subtree, right=right_subtree, var_red=best_split["var_red"])

        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def variance_reduction(self, parent, l_child, r_child):
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction

    def get_best_split(self, X, Y, num_samples, num_features):
        best_split = {}
        max_info_gain = -float('inf')

        for feature_index in range(num_features):
            feature_values = X[:, feature_index]  # Use numpy slicing instead of pandas
            possible_threshold = np.unique(feature_values)

            for threshold in possible_threshold:
                dataset_left, dataset_right = self.split(X, Y, feature_index, threshold)

                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    left_y, right_y = dataset_left[1], dataset_right[1]

                    curr_mse_red = self.mean_squared_error(Y, left_y, right_y)

                    if curr_mse_red > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_mse_red
                        max_info_gain = curr_mse_red

        return best_split

    def split(self, X, Y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask

        X_left, X_right = X[left_mask], X[right_mask]
        Y_left, Y_right = Y[left_mask], Y[right_mask]

        return (X_left, Y_left), (X_right, Y_right)

    def calculate_leaf_value(self, Y):
        return np.mean(Y)

    def print_tree(self, tree=None, indent="  "):
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)
        else:
            print(f"X_{tree.feature_index} <= {tree.threshold} ?")
            print(f"{indent}left:")
            self.print_tree(tree.left, indent + indent)
            print(f"{indent}right:")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        X = X.values
        Y = Y.values
        self.root = self.build_tree(X, Y)


    def mean_squared_error(self, parent, l_child, r_child):
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)

        mse_parent = np.mean((parent - np.mean(parent))**2)
        mse_left = np.mean((l_child - np.mean(l_child))**2)
        mse_right = np.mean((r_child - np.mean(r_child))**2)

        weighted_mse = weight_l * mse_left + weight_r * mse_right
        return mse_parent - weighted_mse

    def predict(self, X):
      X = X.values if hasattr(X, 'values') else X  # Convert pandas DataFrame to numpy array
      return np.array([self._predict(x) for x in X])

    def _predict(self, x, node=None):
        if node is None:
            node = self.root
        if node.value is not None:  # Reached leaf node
            return node.value
        feature_val = x[node.feature_index]
        if feature_val <= node.threshold:
            return self._predict(x, node.left)
        else:
            return self._predict(x, node.right)
    def get_params(self, deep=True):
        return {
            "max_depth": self.max_depth,
            "min_samples_split": self.min_samples_split
        }
    def set_params(self, **params):
      """
      This method will allow you to set the hyperparameters of the custom decision tree.
      """
      for param, value in params.items():
          if param in self.__dict__:
              setattr(self, param, value)
      return self

In [412]:
from sklearn.model_selection import train_test_split
# Define features (X) and target (y)
X = dataset.drop('motor_UPDRS', axis=1)  # features
y = dataset['motor_UPDRS']  # target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the sizes of the datasets
print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"yTrain set size: {len(y_train)}")
print(f"yTest set size: {len(y_test)}")


Train set size: 4700
Test set size: 1175
yTrain set size: 4700
yTest set size: 1175


In [413]:
# Best parameters: {'max_depth': 7, 'min_samples_split': 2} <- Result From GridSearch
# max_depth = 7 , min_sample_split = 2 gets RMSE Train R²: 0.802, Train RMSE: 3.636 Test R²: 0.760, Test RMSE: 3.914
# max_depth = 10 , min_sample_split = 2 gets  Train R²: 0.918, Train RMSE: 2.338 Test R²: 0.857, Test RMSE: 3.019
# max_depth = 10 , min_sample_split = 4 gets  Train R²: 0.918, Train RMSE: 2.343 Test R²: 0.858, Test RMSE: 3.006

regressor_custom = DecisionTreeRegressorCustom(min_samples_split=2, max_depth=10)
regressor_custom.fit(X_train,y_train)
regressor_custom.print_tree()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


X_0 <= 65.0 ?
  left:
X_0 <= 63.0 ?
    left:
X_0 <= 62.0 ?
        left:
X_0 <= 56.0 ?
                left:
X_0 <= 36.0 ?
                                left:
X_2 <= 52.422 ?
                                                                left:
X_2 <= 24.422 ?
                                                                                                                                left:
X_2 <= 17.45 ?
                                                                                                                                                                                                                                                                left:
X_2 <= 11.48 ?
                                                                                                                                                                                                                                                                                                                      

In [414]:
# print(X_test.dtypes)

# y_pred = regressor_custom.predict(X_test)

# # Actual true values
# y_true = y_test

# # R-squared (R²) calculation
# ss_total = np.sum((y_true - np.mean(y_true)) ** 2)  # Total sum of squares
# ss_residual = np.sum((y_true - y_pred) ** 2)        # Residual sum of squares

# r2 = 1 - (ss_residual / ss_total)
# print(f"R-squared (R²): {r2:.4f}")

# mse = mean_squared_error(y_true, y_pred)
# rmse = np.sqrt(mse)
# print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
# print(f"Mean Squared Error (RMSE): {mse:.4f}")


# # USING Variance:
# # R-squared (R²): 0.4108
# # Root Mean Squared Error (RMSE): 6.1328
# # Mean Squared Error (RMSE): 37.6109

# # USING MSE:
# regressor_custom.evaluate(X_test, y_test)

# print("X data type:", type(X))  # Should be a numpy ndarray or pandas DataFrame
# print("X shape:", X.shape)
# print("X sample values:", X[:5])  # Print a few samples of X to check values

# print("X data type:", type(X_test))  # Should be a numpy ndarray or pandas DataFrame
# print("X sample values:", X[:5])  # Print a few samples of X to check values
# X_test['age'].unique()

y_train_pred = regressor_custom.predict(X_train)
y_test_pred = regressor_custom.predict(X_test)


from sklearn.metrics import r2_score, mean_squared_error

# For training set
train_r2 = r2_score(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

# For test set
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"Train R²: {train_r2:.3f}, Train RMSE: {train_rmse:.3f}")
print(f"Test R²: {test_r2:.3f}, Test RMSE: {test_rmse:.3f}")

#

Train R²: 0.918, Train RMSE: 2.343
Test R²: 0.858, Test RMSE: 3.006


In [421]:
# Comparasion with sklearn's Decision Tree Regressor Vs Mine
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the regressor
regressor = DecisionTreeRegressor(max_depth=3, min_samples_split=3)

# Train the model
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)

# Actual true values
y_true = y_test

# R-squared (R²) calculation
ss_total = np.sum((y_true - np.mean(y_true)) ** 2)  # Total sum of squares
ss_residual = np.sum((y_true - y_pred) ** 2)        # Residual sum of squares

r2 = 1 - (ss_residual / ss_total)
print(f"R-squared (R²): {r2:.4f}")

# Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define parameter ranges
max_depth_values = [3, 5, 7, 10]
min_samples_split_values = [2, 4, 6, 8]

best_r2 = -np.inf
best_params = None

# Loop through all parameter combinations
for max_depth in max_depth_values:
    for min_samples_split in min_samples_split_values:
        # Initialize and train the regressor
        regressor = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        regressor.fit(X_train, y_train)

        # Make predictions
        y_pred = regressor.predict(X_test)
        y_true = y_test

        # R-squared (R²) calculation
        ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
        ss_residual = np.sum((y_true - y_pred) ** 2)
        r2 = 1 - (ss_residual / ss_total)

        # Root Mean Squared Error (RMSE)
        mse = mean_squared_error(y_true, y_pred)
        rmse = np.sqrt(mse)

        print(f"max_depth={max_depth}, min_samples_split={min_samples_split} → R²: {r2:.4f}, RMSE: {rmse:.4f}")

        # Save best params
        if r2 > best_r2:
            best_r2 = r2
            best_params = (max_depth, min_samples_split)

print(f"\nBest parameters: max_depth={best_params[0]}, min_samples_split={best_params[1]}")


R-squared (R²): 0.3455
Root Mean Squared Error (RMSE): 6.4637
max_depth=3, min_samples_split=2 → R²: 0.3455, RMSE: 6.4637
max_depth=3, min_samples_split=4 → R²: 0.3455, RMSE: 6.4637
max_depth=3, min_samples_split=6 → R²: 0.3455, RMSE: 6.4637
max_depth=3, min_samples_split=8 → R²: 0.3455, RMSE: 6.4637
max_depth=5, min_samples_split=2 → R²: 0.4941, RMSE: 5.6824
max_depth=5, min_samples_split=4 → R²: 0.4942, RMSE: 5.6821
max_depth=5, min_samples_split=6 → R²: 0.4942, RMSE: 5.6821
max_depth=5, min_samples_split=8 → R²: 0.4942, RMSE: 5.6821
max_depth=7, min_samples_split=2 → R²: 0.6754, RMSE: 4.5515
max_depth=7, min_samples_split=4 → R²: 0.6756, RMSE: 4.5505
max_depth=7, min_samples_split=6 → R²: 0.6756, RMSE: 4.5505
max_depth=7, min_samples_split=8 → R²: 0.6756, RMSE: 4.5507
max_depth=10, min_samples_split=2 → R²: 0.8463, RMSE: 3.1325
max_depth=10, min_samples_split=4 → R²: 0.8436, RMSE: 3.1596
max_depth=10, min_samples_split=6 → R²: 0.8515, RMSE: 3.0790
max_depth=10, min_samples_split=8 →

In [416]:
# Cross Validation :
# from sklearn.model_selection import cross_val_score
# import numpy as np

# # Assuming you've already defined your custom regressor and have the dataset (X, y)
# regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)

# # Perform cross-validation with neg_mean_squared_error scoring
# scores = cross_val_score(regressor, X, y, cv=5, scoring='neg_mean_squared_error')

# # Convert negative MSE to RMSE
# rmse_cv = np.sqrt(-scores.mean())  # Since cross_val_score gives negative MSE

# print(f"Cross-validated RMSE: {rmse_cv:.4f}")


# # my resulted around 9.4711

In [417]:
# #Testing Performance With Random Forest
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# import numpy as np

# # Initialize the RandomForestRegressor
# rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# # Fit the model on the training data
# rf_regressor.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred_rf = rf_regressor.predict(X_test)

# # R-squared (R²) calculation
# ss_total_rf = np.sum((y_test - np.mean(y_test)) ** 2)  # Total sum of squares
# ss_residual_rf = np.sum((y_test - y_pred_rf) ** 2)        # Residual sum of squares

# r2_rf = 1 - (ss_residual_rf / ss_total_rf)
# print(f"Random Forest R-squared (R²): {r2_rf:.4f}")

# # RMSE Calculation
# mse_rf = mean_squared_error(y_test, y_pred_rf)
# rmse_rf = np.sqrt(mse_rf)
# print(f"Random Forest Root Mean Squared Error (RMSE): {rmse_rf:.4f}")


In [418]:
# Testing Performance With Linear Regression
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import train_test_split
# import numpy as np

# # Assuming your dataset is loaded and ready in 'dataset'
# X = dataset.drop('motor_UPDRS', axis=1)  # Features
# y = dataset['motor_UPDRS']  # Target

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Initialize the Linear Regression model
# lr_model = LinearRegression()

# # Fit the model
# lr_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = lr_model.predict(X_test)

# # R-squared (R²) calculation
# ss_total = np.sum((y_test - np.mean(y_test)) ** 2)  # Total sum of squares
# ss_residual = np.sum((y_test - y_pred) ** 2)        # Residual sum of squares

# r2 = 1 - (ss_residual / ss_total)
# print(f"Linear Regression R-squared (R²): {r2:.4f}")

# # RMSE Calculation
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# print(f"Linear Regression Root Mean Squared Error (RMSE): {rmse:.4f}")



R²: 0.9211258141301926
RMSE: 2.9563979055082585
Feature Importance:
          Feature  Importance
0             age    0.638018
1             sex    0.103324
17            DFA    0.078528
2       test_time    0.067506
15            HNR    0.037014
16           RPDE    0.031767
4     Jitter(Abs)    0.012472
13    Shimmer:DDA    0.009466
5      Jitter:RAP    0.007313
12  Shimmer:APQ11    0.003346
10   Shimmer:APQ3    0.002952
18            PPE    0.002299
8         Shimmer    0.001828
11   Shimmer:APQ5    0.001821
6     Jitter:PPQ5    0.001031
14            NHR    0.000830
3       Jitter(%)    0.000293
7      Jitter:DDP    0.000107
9     Shimmer(dB)    0.000085

In [419]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'max_depth': [2, 3, 4, 5, 6, 7],
#     'min_samples_split': [2, 3, 4, 5]
# }

# # Create the regressor object (you can use your custom regressor, make sure it follows sklearn's API)
# regressor_custom = DecisionTreeRegressorCustom()

# # Use GridSearchCV to find the best parameters
# grid_search = GridSearchCV(estimator=regressor_custom, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# # Get the best parameters and print them
# best_params = grid_search.best_params_
# print(f"Best parameters: {best_params}")


# # Results show:
# # Best parameters: {'max_depth': 7, 'min_samples_split': 2}
