# Лабораторная работа 3 (DecisionTree)

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from utils import regression_cross_validate, display_metrics_table, classification_cross_validate, display_metrics_classification_table

In [15]:
import warnings
# Suppress every warning for the rest of the session so cell outputs stay compact.
# NOTE(review): a blanket "ignore" also hides pandas/NumPy diagnostics
# (e.g. chained-assignment or empty-slice warnings) — consider narrowing the filter.
warnings.filterwarnings("ignore")

### Regression

#### 1. Обработка данных

In [16]:
# Read the salary dataset for the regression task and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/Salary_Data.csv')
df.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32,Male,Bachelor's,Software Engineer,5.0,90000
1,28,Female,Master's,Data Analyst,3.0,65000
2,45,Male,PhD,Senior Manager,15.0,150000
3,36,Female,Bachelor's,Sales Associate,7.0,60000
4,52,Male,Master's,Director,20.0,200000


In [17]:
# Separate the target (Salary) from the features; 'Job Title' is left out of the feature set.
X, y = df.drop(columns=['Salary', 'Job Title']), df['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Work on explicit copies: columns are overwritten below, and assigning into the
# frames returned by the split may otherwise trigger pandas' chained-assignment warning.
X_train, X_test = X_train.copy(), X_test.copy()

# One encoder per categorical column, fitted on the training part only and then
# re-used for the test part.
gender_le = LabelEncoder()
el_le = LabelEncoder()

X_train['Gender'] = gender_le.fit_transform(X_train['Gender'])
X_test['Gender'] = gender_le.transform(X_test['Gender'])

X_train['Education Level'] = el_le.fit_transform(X_train['Education Level'])
X_test['Education Level'] = el_le.transform(X_test['Education Level'])

#### 2. Построение бейзлайна 

Для оценки модели будем использовать метод кросс-валидации, который позволяет получить более надёжную оценку метрик, чем однократное разбиение на обучающую и тестовую выборки.

In [18]:
# Cross-validate the untuned tree on the training split (helper from utils, n_folds=5).
metrics = regression_cross_validate(
    DecisionTreeRegressor,
    X_train.to_numpy(),
    y_train.to_numpy(),
    n_folds=5,
    random_state=42,
)
display_metrics_table(*metrics)

# Refit on the whole training split and predict the held-out test split.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Test-set metrics
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

# Print the metrics
print(
    "\n=== Результаты на Тесте ===",
    f"Среднеквадратичная ошибка (MSE): {mse:.2f}",
    f"Средняя абсолютная ошибка (MAE): {mae:.2f}",
    f"Коэффициент детерминации (R^2): {r2:.2f}",
    sep="\n",
)

| Metric   |            Mean |        Std Dev |
|:---------|----------------:|---------------:|
| MAE      | 12139.1         | 1358.82        |
| MSE      |     3.99397e+08 |    7.96342e+07 |
| R2       |     0.824481    |    0.0501593   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 294647380.23
Средняя абсолютная ошибка (MAE): 11051.17
Коэффициент детерминации (R^2): 0.86


Как можно увидеть, значение метрики $R^2$ на тестовой выборке около 0.86; это означает, что около 86% дисперсии целевой переменной объясняется моделью.

Сформулируем несколько гипотез, которые могут помочь улучшить качество модели

1) Поменять Encoder категориальных признаков с `LabelEncoder` на `OneHotEncoder`
2) Отмасштабировать численные признаки
3) Добавить параметр глубины в дереве

In [19]:
categorical_features = ['Gender', 'Education Level']
num_features = ['Age', 'Years of Experience']

# One-hot encode the categorical columns; the encoder is fitted on the training part only.
onehot = OneHotEncoder(sparse_output=False, drop='first')
encoded_train_data = onehot.fit_transform(X_train[categorical_features])
encoded_test_data = onehot.transform(X_test[categorical_features])
onehot_columns = onehot.get_feature_names_out(categorical_features)

# Replace the categorical columns with their one-hot counterparts. Indices are reset
# so the remaining columns line up row by row with the freshly built encoded frame.
encoded_df = pd.DataFrame(encoded_train_data, columns=onehot_columns)
X_train_upd = pd.concat(
    [X_train.drop(columns=categorical_features).reset_index(drop=True), encoded_df],
    axis=1,
)

encoded_df = pd.DataFrame(encoded_test_data, columns=onehot_columns)
X_test_upd = pd.concat(
    [X_test.drop(columns=categorical_features).reset_index(drop=True), encoded_df],
    axis=1,
)

# Standardise the numeric columns with statistics estimated on the training part.
scaler = StandardScaler()
X_train_upd[num_features] = scaler.fit_transform(X_train[num_features])
X_test_upd[num_features] = scaler.transform(X_test[num_features])

In [20]:
# Cross-validate the depth-limited tree on the re-encoded, scaled features.
# random_state is passed exactly as in the baseline cell so both runs are reproducible
# and comparable.
metrics = regression_cross_validate(DecisionTreeRegressor, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, max_depth=5, random_state=42)
display_metrics_table(*metrics)

# Seed the final model as well: the baseline used random_state=42, and an unseeded
# tree can pick different splits between runs when candidates tie.
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train_upd, y_train)

y_pred = dt.predict(X_test_upd)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |            Mean |       Std Dev |
|:---------|----------------:|--------------:|
| MAE      | 12425           | 1148.51       |
| MSE      |     3.34118e+08 |    6.3985e+07 |
| R2       |     0.85484     |    0.0365489  |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 318726939.21
Средняя абсолютная ошибка (MAE): 11292.65
Коэффициент детерминации (R^2): 0.85


Можно увидеть, что на кросс-валидации значения MSE и $R^2$ улучшились, тогда как MAE немного выросла. Результаты же на тестовой выборке показывают небольшой спад по всем метрикам, что может наталкивать на мысль, что модель немного переобучилась.

#### 4. Реализация своего класса

In [21]:
class MyDecisionTreeRegressor:
    """Minimal regression tree that splits on the sum of squared deviations.

    Internal nodes are dicts with the keys 'feature', 'value', 'left' and 'right';
    leaves are plain numbers holding the mean target of the samples that reached
    them. A sample is routed to the left child when ``x[feature] <= value``.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` keeps splitting until every leaf is
        pure or no valid split is left. ``0`` yields a single leaf.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the inputs are empty or their shapes do not match.
        """
        # Coerce once so lists / pandas objects behave like the arrays the
        # recursive builder indexes with boolean masks.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one target per row of X")
        if y.size == 0:
            raise ValueError("cannot fit a tree on an empty dataset")

        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively grow a subtree; returns a node dict or a leaf value."""
        # `is not None` (rather than truthiness) so that max_depth=0 is honoured.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or np.unique(y).size <= 1:
            return np.mean(y)

        best_split = self._find_best_split(X, y)
        if best_split is None:
            # Every feature is constant on this subset: nothing left to split on.
            return np.mean(y)

        left_indices = X[:, best_split['feature']] <= best_split['value']
        right_indices = ~left_indices

        return {
            'feature': best_split['feature'],
            'value': best_split['value'],
            'left': self._build_tree(X[left_indices], y[left_indices], depth + 1),
            'right': self._build_tree(X[right_indices], y[right_indices], depth + 1),
        }

    def _find_best_split(self, X, y):
        """Return the ``{'feature', 'value'}`` split with the lowest score, or None."""
        best_split = None
        best_score = float('inf')

        for feature in range(X.shape[1]):
            column = X[:, feature]
            # Sorted unique values give a deterministic candidate order.
            for value in np.unique(column):
                left_indices = column <= value
                right_indices = ~left_indices

                # A split must put at least one sample on each side. The masks
                # always have len(y) entries, so test their contents, not len().
                if not left_indices.any() or not right_indices.any():
                    continue

                score = self._calculate_split_score(y[left_indices], y[right_indices])

                if score < best_score:
                    best_score = score
                    best_split = {'feature': feature, 'value': value}

        return best_split

    def _calculate_split_score(self, left_y, right_y):
        """Sum of squared deviations from the mean over both sides (lower is better)."""
        left_score = np.var(left_y) * len(left_y)
        right_score = np.var(right_y) * len(right_y)
        return left_score + right_score

    def predict(self, X):
        """Predict a target for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("MyDecisionTreeRegressor must be fitted before calling predict")
        return np.array([self._predict_sample(x, self.tree) for x in np.asarray(X)])

    def _predict_sample(self, x, tree):
        """Walk from ``tree`` down to a leaf for a single sample and return the leaf value."""
        node = tree
        while isinstance(node, dict):
            node = node['left'] if x[node['feature']] <= node['value'] else node['right']
        return node

In [22]:
# Cross-validate the hand-written tree on the label-encoded training features.
metrics = regression_cross_validate(
    MyDecisionTreeRegressor,
    X_train.to_numpy(),
    y_train.to_numpy(),
    n_folds=5,
)
display_metrics_table(*metrics)

# Refit on the whole training split and predict the held-out test split.
dt = MyDecisionTreeRegressor()
dt.fit(X_train.to_numpy(), y_train.to_numpy())
y_pred = dt.predict(X_test.to_numpy())

# Test-set metrics
mse, mae, r2 = (
    score(y_test, y_pred)
    for score in (mean_squared_error, mean_absolute_error, r2_score)
)

# Print the metrics
print(
    "\n=== Результаты на Тесте ===",
    f"Среднеквадратичная ошибка (MSE): {mse:.2f}",
    f"Средняя абсолютная ошибка (MAE): {mae:.2f}",
    f"Коэффициент детерминации (R^2): {r2:.2f}",
    sep="\n",
)

| Metric   |            Mean |        Std Dev |
|:---------|----------------:|---------------:|
| MAE      | 11985.7         | 1500.75        |
| MSE      |     3.94464e+08 |    8.99586e+07 |
| R2       |     0.826332    |    0.0542233   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 296376103.63
Средняя абсолютная ошибка (MAE): 11263.93
Коэффициент детерминации (R^2): 0.86


In [23]:
# Cross-validate the hand-written tree with a depth limit on the re-encoded, scaled features.
metrics = regression_cross_validate(MyDecisionTreeRegressor, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, max_depth=5)
display_metrics_table(*metrics)

dt = MyDecisionTreeRegressor(max_depth=5)
# Pass the target as a NumPy array, like every other call to the custom model:
# the tree indexes y with positional boolean masks.
dt.fit(X_train_upd.to_numpy(), y_train.to_numpy())

y_pred = dt.predict(X_test_upd.to_numpy())

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print("\n=== Результаты на Тесте ===")
print(f"Среднеквадратичная ошибка (MSE): {mse:.2f}")
print(f"Средняя абсолютная ошибка (MAE): {mae:.2f}")
print(f"Коэффициент детерминации (R^2): {r2:.2f}")

| Metric   |            Mean |        Std Dev |
|:---------|----------------:|---------------:|
| MAE      | 12695           | 1107.77        |
| MSE      |     3.48129e+08 |    6.80466e+07 |
| R2       |     0.849397    |    0.0353542   |

=== Результаты на Тесте ===
Среднеквадратичная ошибка (MSE): 327237577.51
Средняя абсолютная ошибка (MAE): 11505.42
Коэффициент детерминации (R^2): 0.85


В собственной реализации алгоритма решающего дерева наблюдается такая же тенденция, как и у модели Scikit-Learn: на кросс-валидации MSE и $R^2$ после улучшений стали лучше, но на тесте метрики немного ухудшились.

### Classification

#### 1. Обработка данных

In [24]:
# Read the student depression dataset for the classification task and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/Student_Depression_Dataset.csv')
df.head(n=5)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [26]:
# Separate the target (Depression) from the features; 'id' is left out of the
# feature set (presumably a row identifier — TODO confirm).
X, y = df.drop(columns=['Depression', 'id']), df['Depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Work on explicit copies: columns are overwritten below, and assigning into the
# frames returned by the split may otherwise trigger pandas' chained-assignment warning.
X_train, X_test = X_train.copy(), X_test.copy()

categorical_features = ['Gender', 'City', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
num_features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Work/Study Hours', 'Financial Stress', ]

# Categories that appear only in the test part are mapped to the sentinel code 99
# instead of raising an error.
le = OrdinalEncoder(handle_unknown='use_encoded_value',
                    unknown_value=99)

X_train[categorical_features] = le.fit_transform(X_train[categorical_features])
X_test[categorical_features] = le.transform(X_test[categorical_features])

# Fill missing values with the most frequent value of each column, learned on the
# training part. Rebuilding the frames resets their index to 0..n-1, so from here on
# X_* and y_* correspond by position only, not by index label.
imputer = SimpleImputer(strategy='most_frequent')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [27]:
# Cross-validate the untuned tree on the ordinal-encoded training features.
metrics = classification_cross_validate(DecisionTreeClassifier, X_train.to_numpy(), y_train.to_numpy(), n_folds=5)
display_metrics_classification_table(*metrics)

# Baseline decision tree with default hyper-parameters; seeded so the reported
# test metrics can be reproduced.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.768315 | 0.00331648 |
| Precision | 0.768839 | 0.00289642 |
| Recall    | 0.768315 | 0.00331648 |
| F1-score  | 0.7685   | 0.00312689 |

=== Результаты на Тесте ===
1. Accuracy: 76.10%
2. Precision: 76.20%
3. Recall: 76.10%
4. F1-score: 76.14%


In [28]:
# One-hot encode the categorical columns; the encoder is fitted on the training part only.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen during
# fit should encode as all zeros, i.e. the same as the dropped first category — confirm
# this is acceptable.
onehot = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded_train_data = onehot.fit_transform(X_train[categorical_features])
encoded_test_data = onehot.transform(X_test[categorical_features])
onehot_columns = onehot.get_feature_names_out(categorical_features)

# Replace the categorical columns with their one-hot counterparts. Indices are reset
# so the remaining columns line up row by row with the freshly built encoded frame.
encoded_df = pd.DataFrame(encoded_train_data, columns=onehot_columns)
X_train_upd = pd.concat(
    [X_train.drop(columns=categorical_features).reset_index(drop=True), encoded_df],
    axis=1,
)

encoded_df = pd.DataFrame(encoded_test_data, columns=onehot_columns)
X_test_upd = pd.concat(
    [X_test.drop(columns=categorical_features).reset_index(drop=True), encoded_df],
    axis=1,
)

# Standardise the numeric columns with statistics estimated on the training part.
scaler = StandardScaler()
X_train_upd[num_features] = scaler.fit_transform(X_train[num_features])
X_test_upd[num_features] = scaler.transform(X_test[num_features])

In [33]:
# Cross-validate the depth-limited tree on the one-hot encoded, scaled features.
metrics = classification_cross_validate(DecisionTreeClassifier, X_train_upd.to_numpy(), y_train.to_numpy(), n_folds=5, max_depth=5)
display_metrics_classification_table(*metrics)

# Depth-limited decision tree; seeded so the reported test metrics can be reproduced.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train_upd, y_train)

y_pred = dt.predict(X_test_upd)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.827575 | 0.00501815 |
| Precision | 0.827976 | 0.00433141 |
| Recall    | 0.827575 | 0.00501815 |
| F1-score  | 0.82762  | 0.00471135 |

=== Результаты на Тесте ===
1. Accuracy: 81.29%
2. Precision: 81.48%
3. Recall: 81.29%
4. F1-score: 81.35%


In [34]:
class MyDecisionTreeClassifier:
    """Minimal classification tree that splits on information gain (entropy).

    Internal nodes are dicts with the keys 'feature_index', 'threshold', 'left'
    and 'right'; leaves are class labels. A sample is routed to the left child
    when ``sample[feature_index] <= threshold``.

    Class labels must be non-negative integers, because class frequencies are
    counted with ``np.bincount``.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` keeps splitting until every leaf is
        pure or no valid split is left. ``0`` yields a single leaf.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) with non-negative integer labels

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the inputs are empty or their shapes do not match.
        """
        # Coerce once so lists / pandas objects behave like the arrays the
        # recursive builder indexes with boolean masks.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X")
        if y.size == 0:
            raise ValueError("cannot fit a tree on an empty dataset")

        self.tree = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively grow a subtree; returns a node dict or a leaf label."""
        unique_classes = np.unique(y)

        # Stopping conditions
        if len(unique_classes) == 1:
            return unique_classes[0]
        # `is not None` (rather than truthiness) so that max_depth=0 is honoured.
        if self.max_depth is not None and depth >= self.max_depth:
            return self._most_common_class(y)

        best_split = self._best_split(X, y)
        if not best_split:
            # Every feature is constant on this subset although the labels differ:
            # no split is possible, so fall back to the majority class.
            return self._most_common_class(y)

        left_indices = best_split['left_indices']
        right_indices = best_split['right_indices']
        return {
            'feature_index': best_split['feature_index'],
            'threshold': best_split['threshold'],
            'left': self._build_tree(X[left_indices], y[left_indices], depth + 1),
            'right': self._build_tree(X[right_indices], y[right_indices], depth + 1),
        }

    def _best_split(self, X, y):
        """Return the split with the highest information gain.

        The result is a dict with 'feature_index', 'threshold', 'left_indices' and
        'right_indices' (boolean masks), or an empty dict when no threshold
        separates the samples into two non-empty groups.
        """
        best_info_gain = -float('inf')
        best_split = {}
        # The parent entropy does not depend on the candidate split; compute it once.
        parent_entropy = self._entropy(y)

        for feature_index in range(X.shape[1]):
            feature_values = X[:, feature_index]

            for threshold in np.unique(feature_values):
                left_indices = feature_values <= threshold
                # Complement of the left mask, so rows whose comparison is False
                # (including NaN) go right — the same routing predict uses —
                # instead of being dropped from both children.
                right_indices = ~left_indices

                if not left_indices.any() or not right_indices.any():
                    continue

                info_gain = self._information_gain(y, left_indices, right_indices, parent_entropy)

                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_split = {
                        'feature_index': feature_index,
                        'threshold': threshold,
                        'left_indices': left_indices,
                        'right_indices': right_indices,
                    }

        return best_split

    def _information_gain(self, y, left_indices, right_indices, parent_entropy=None):
        """Entropy of ``y`` minus the size-weighted entropy of the two sides.

        ``parent_entropy`` may be passed in to avoid recomputing it for every
        candidate split; when omitted it is computed from ``y``.
        """
        left_y = y[left_indices]
        right_y = y[right_indices]

        if parent_entropy is None:
            parent_entropy = self._entropy(y)

        left_weight = len(left_y) / len(y)
        right_weight = len(right_y) / len(y)

        children_entropy = left_weight * self._entropy(left_y) + right_weight * self._entropy(right_y)
        return parent_entropy - children_entropy

    def _entropy(self, y):
        """Shannon entropy (base 2) of the label distribution in ``y``."""
        class_counts = np.bincount(y)
        # Only classes that are present contribute; dropping zero counts avoids
        # log2(0) without biasing the result with an epsilon.
        present = class_counts[class_counts > 0]
        probabilities = present / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _most_common_class(self, y):
        """Most frequent label in ``y`` (the smallest label wins ties)."""
        return np.bincount(y).argmax()

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("MyDecisionTreeClassifier must be fitted before calling predict")
        predictions = [self._predict_sample(sample, self.tree) for sample in np.asarray(X)]
        return np.array(predictions)

    def _predict_sample(self, sample, tree):
        """Walk from ``tree`` down to a leaf for a single sample and return the leaf label."""
        node = tree
        while isinstance(node, dict):
            go_left = sample[node['feature_index']] <= node['threshold']
            node = node['left'] if go_left else node['right']
        return node

In [36]:
# Cross-validate the hand-written tree on the ordinal-encoded training features.
metrics = classification_cross_validate(MyDecisionTreeClassifier, X_train.to_numpy(), y_train.to_numpy(), n_folds=5)
display_metrics_classification_table(*metrics)

dt = MyDecisionTreeClassifier()
# Pass the target as a NumPy array, as in the cross-validation call above:
# the tree indexes y with positional boolean masks.
dt.fit(X_train.to_numpy(), y_train.to_numpy())

y_pred = dt.predict(X_test.to_numpy())

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.769845 | 0.00535032 |
| Precision | 0.770035 | 0.00509978 |
| Recall    | 0.769845 | 0.00535032 |
| F1-score  | 0.769919 | 0.0052362  |

=== Результаты на Тесте ===
1. Accuracy: 76.40%
2. Precision: 76.43%
3. Recall: 76.40%
4. F1-score: 76.42%


In [37]:
# Cross-validate the hand-written tree with a depth limit.
# NOTE(review): unlike the sklearn "improved" run, this uses the ordinal-encoded X_train
# rather than the one-hot/scaled X_train_upd — presumably to keep the slow pure-Python
# tree tractable; confirm this is intentional.
metrics = classification_cross_validate(MyDecisionTreeClassifier, X_train.to_numpy(), y_train.to_numpy(), n_folds=5, max_depth=5)
display_metrics_classification_table(*metrics)

dt = MyDecisionTreeClassifier(max_depth=5)
# Pass the target as a NumPy array, as in the cross-validation call above:
# the tree indexes y with positional boolean masks.
dt.fit(X_train.to_numpy(), y_train.to_numpy())

y_pred = dt.predict(X_test.to_numpy())

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the results
print("\n=== Результаты на Тесте ===")
print(f"1. Accuracy: {accuracy:.2%}")
print(f"2. Precision: {precision:.2%}")
print(f"3. Recall: {recall:.2%}")
print(f"4. F1-score: {f1:.2%}")

| Metric    |     Mean |    Std Dev |
|:----------|---------:|-----------:|
| Accuracy  | 0.827766 | 0.0071993  |
| Precision | 0.828129 | 0.00696203 |
| Recall    | 0.827766 | 0.0071993  |
| F1-score  | 0.82789  | 0.00710598 |

=== Результаты на Тесте ===
1. Accuracy: 81.22%
2. Precision: 81.49%
3. Recall: 81.22%
4. F1-score: 81.29%


Результаты показывают, что собственная имплементация модели в среднем работает примерно на том же уровне качества, что и модель из `sklearn`.

### Заключение

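Внесённые изменения, включая one-hot-кодирование категориальных признаков, нормализацию данных и ограничение глубины в алгоритме решающих деревьев, улучшают метрики модели на кросс-валидации. На тестовой выборке качество классификации заметно выросло (accuracy примерно с 76% до 81%), а качество регрессии немного снизилось ($R^2$ с 0.86 до 0.85). Проведённые эксперименты демонстрируют, что как собственная реализация, так и применение моделей из sklearn дают схожие результаты.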

| Модель                    |      MSE  |        MAE |      $R^2$ |
|:--------------------------|----------:|-----------:|-----------:|
| Sklearn (до улучшения)    | 2.946e+08 | 11051.17   |  0.86      |
| Sklearn (после улучшения) | 3.187e+08 | 11292.65   |  0.85      |
| Собственная имплементация (до улучшения)   | 2.964e+08 | 11263.93   |  0.86     |
| Собственная имплементация (после улучшения)| 3.272e+08 | 11505.42   |  0.85     |

| Модель                    |  Accuracy |  Precision |     Recall |    F1-score |
|:--------------------------|----------:|-----------:|-----------:|-----------:|
| Sklearn (до улучшения)    |   76.10%  |   76.20%   |  76.10%    |  76.14%    |
| Sklearn (после улучшения) |   81.29%  |   81.48%   |  81.29%    |  81.35%    |
| Собственная имплементация (до улучшения)   |   76.40%  |   76.43%   |  76.40%    |  76.42%    |
| Собственная имплементация (после улучшения)|   81.22%  |   81.49%   |  81.22%    |  81.29%    |