In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline

import random
from pprint import pprint

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/red-wine-dataset/wineQualityInfo.txt
/kaggle/input/red-wine-dataset/wineQualityReds.csv


# Decision Tree Functions

In [2]:
import pandas as pd
import random


# 1. Train-Test-Split
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df


# 2. Distinguish categorical and continuous features
def determine_type_of_feature(df):
    
    feature_types = []
    n_unique_values_treshold = 15
    for feature in df.columns:
        if feature != "label":
            unique_values = df[feature].unique()
            example_value = unique_values[0]

            if (isinstance(example_value, str)) or (len(unique_values) <= n_unique_values_treshold):
                feature_types.append("categorical")
            else:
                feature_types.append("continuous")
    
    return feature_types


# 3. Accuracy
def calculate_accuracy(predictions, labels):
    predictions_correct = predictions == labels
    accuracy = predictions_correct.mean()
    
    return accuracy

In [3]:
import numpy as np
import pandas as pd
import random


# 1. Decision Tree helper functions 
# (see "decision tree algorithm flow chart.png")

# 1.1 Data pure?
def check_purity(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

    
# 1.2 Classify
def classify_data(data):
    
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification


# 1.3 Potential splits?
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):  # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits


# 1.4 Lowest Overall Entropy?
def calculate_entropy(data):
    
    label_column = data[:, -1]
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy


def calculate_overall_entropy(data_below, data_above):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below) 
                      + p_data_above * calculate_entropy(data_above))
    
    return overall_entropy


def determine_best_split(data, potential_splits):
    
    overall_entropy = 9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value


# 1.5 Split data
def split_data(data, split_column, split_value):
    
    split_column_values = data[:, split_column]

    type_of_feature = FEATURE_TYPES[split_column]
    if type_of_feature == "continuous":
        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values >  split_value]
    
    # feature is categorical   
    else:
        data_below = data[split_column_values == split_value]
        data_above = data[split_column_values != split_value]
    
    return data_below, data_above

In [4]:
def decision_tree_algorithm(df, counter=0, min_samples=2, max_depth=5, random_subspace=None):
    
    # data preparations
    if counter == 0:
        global COLUMN_HEADERS, FEATURE_TYPES
        COLUMN_HEADERS = df.columns
        FEATURE_TYPES = determine_type_of_feature(df)
        data = df.values
    else:
        data = df           
    
    
    # base cases
    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        
        return classification

    
    # recursive part
    else:    
        counter += 1

        # helper functions 
        potential_splits = get_potential_splits(data, random_subspace)
        split_column, split_value = determine_best_split(data, potential_splits)
        data_below, data_above = split_data(data, split_column, split_value)
        
        # check for empty data
        if len(data_below) == 0 or len(data_above) == 0:
            classification = classify_data(data)
            return classification
        
        # determine question
        feature_name = COLUMN_HEADERS[split_column]
        type_of_feature = FEATURE_TYPES[split_column]
        if type_of_feature == "continuous":
            question = "{} <= {}".format(feature_name, split_value)
            
        # feature is categorical
        else:
            question = "{} = {}".format(feature_name, split_value)
        
        # instantiate sub-tree
        sub_tree = {question: []}
        
        # find answers (recursion)
        yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth, random_subspace)
        no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth, random_subspace)
        
        # If the answers are the same, then there is no point in asking the qestion.
        # This could happen when the data is classified even though it is not pure
        # yet (min_samples or max_depth base case).
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
        
        return sub_tree

In [5]:
def predict_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    # ask question
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
    
    # feature is categorical
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return predict_example(example, residual_tree)

    
# 3.2 All examples of the test data
def decision_tree_predictions(test_df, tree):
    predictions = test_df.apply(predict_example, args=(tree,), axis=1)
    return predictions

# Import Data

In [6]:
full_data = pd.read_csv('/kaggle/input/red-wine-dataset/wineQualityReds.csv')
full_data['label'] = full_data['quality']
full_data = full_data.drop('quality', axis=1)

column_names = []
for column in full_data.columns:
    name = column.replace(" ", "_")
    column_names.append(name)
full_data.columns = column_names

full_data.head()

Unnamed: 0,Unnamed:_0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,label
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
def transform_label(value):
    if value <= 5:
        return "bad"
    else:
        return "good"

full_data["label"] = full_data.label.apply(transform_label)

In [8]:
random.seed(0)
train_data, test_data = train_test_split(full_data, test_size=0.2)

# Bootstrap

In [9]:
def bootstrap(train_data, n_bootstrap):
    index = np.random.randint(low=0, high=len(train_data), size=n_bootstrap)
    boot_data = train_data.iloc[index]
    
    return boot_data

In [10]:
bootstrap(train_data, n_bootstrap=100)

Unnamed: 0,Unnamed:_0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,label
399,400,8.7,0.765,0.22,2.3,0.064,9.0,42.0,0.99630,3.10,0.55,9.4,bad
1519,1520,6.6,0.700,0.08,2.6,0.106,14.0,27.0,0.99665,3.44,0.58,10.2,bad
1388,1389,6.6,0.640,0.31,6.1,0.083,7.0,49.0,0.99718,3.35,0.68,10.3,bad
646,647,7.3,0.670,0.05,3.6,0.107,6.0,20.0,0.99720,3.40,0.63,10.1,bad
464,465,11.5,0.315,0.54,2.1,0.084,5.0,15.0,0.99870,2.98,0.70,9.2,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,149,6.9,0.490,0.10,2.3,0.074,12.0,30.0,0.99590,3.42,0.58,10.2,good
415,416,8.6,0.725,0.24,6.6,0.117,31.0,134.0,1.00140,3.32,1.07,9.3,bad
1479,1480,8.2,0.280,0.60,3.0,0.104,10.0,22.0,0.99828,3.39,0.68,10.6,bad
1062,1063,8.0,0.380,0.44,1.9,0.098,6.0,15.0,0.99560,3.30,0.64,11.4,good


# Random Subspace

In [11]:
def get_potential_splits(data, random_subspace):
    
    potential_splits = {}
    _, n_columns = data.shape
    
    col_index = list(range(n_columns-1))
    if random_subspace and random_subspace <= len(col_index):
        col_index = random.sample(population=col_index, k=random_subspace)
    
    for column_index in col_index:  # excluding the last column which is the label
        values = data[:, column_index]
        unique_values = np.unique(values)
        
        potential_splits[column_index] = unique_values
    
    return potential_splits

In [12]:
get_potential_splits(train_data.values, random_subspace=3)

{8: array([0.9900700000000001, 0.99064, 0.9908, 0.9912, 0.9915, 0.99154,
        0.9915700000000001, 0.9916, 0.9916200000000001, 0.9917, 0.99182,
        0.9921, 0.9922, 0.99236, 0.9924, 0.9924200000000001,
        0.9925200000000001, 0.99256, 0.9925799999999999, 0.99264, 0.9927,
        0.9928, 0.9928600000000001, 0.9929399999999999, 0.9931399999999999,
        0.99316, 0.99318, 0.9932200000000001, 0.9932799999999999, 0.9933,
        0.99331, 0.9933200000000001, 0.99334, 0.9934, 0.99344, 0.99346,
        0.9934799999999999, 0.9935200000000001, 0.99354, 0.99356,
        0.9935799999999999, 0.9936, 0.9936200000000001, 0.99364, 0.9937,
        0.99371, 0.99374, 0.9937600000000001, 0.9937799999999999, 0.99379,
        0.9938, 0.99384, 0.99385, 0.9938600000000001, 0.99387,
        0.9938799999999999, 0.99392, 0.9939399999999999, 0.99395,
        0.9939600000000001, 0.99397, 0.9940000000000001,
        0.9940200000000001, 0.99408, 0.9941, 0.9941399999999999, 0.99416,
        0.9941700000000

# Random Forest

In [13]:
def  random_forest_algorithm(train_data, n_trees, n_bootstrap, n_features, max_depth):
    forest = []
    for i in range(n_trees):
        boot_data = bootstrap(train_data, n_bootstrap)
        tree = decision_tree_algorithm(boot_data, max_depth=max_depth, random_subspace=n_features)
        forest.append(tree)
        
    return forest

In [14]:
forest = random_forest_algorithm(train_data, n_trees=4, n_bootstrap=800, n_features=4, max_depth=3)

In [15]:
len(forest)

4

In [16]:
pprint(forest[0])

{'volatile.acidity <= 0.36': [{'alcohol <= 10.7': [{'density <= 0.9965': ['bad',
                                                                          'good']},
                                                   'good']},
                              {'sulphates <= 0.53': ['bad',
                                                     {'Unnamed:_0 <= 820': ['bad',
                                                                            'good']}]}]}


In [17]:
pprint(forest[1])

{'alcohol <= 10.5': [{'sulphates <= 0.62': ['bad',
                                            {'volatile.acidity <= 0.4': ['good',
                                                                         'bad']}]},
                     {'citric.acid <= 0.23': [{'volatile.acidity <= 0.98': ['good',
                                                                            'bad']},
                                              'good']}]}


In [18]:
pprint(forest[2])

{'chlorides <= 0.069': [{'pH <= 3.33': ['good',
                                        {'Unnamed:_0 <= 686': ['bad',
                                                               'good']}]},
                        {'volatile.acidity <= 0.36': ['good',
                                                      {'alcohol <= 10.1': ['bad',
                                                                           'good']}]}]}


In [19]:
pprint(forest[3])

{'alcohol <= 10.6': [{'sulphates <= 0.52': [{'alcohol <= 10.5': ['bad',
                                                                 'good']},
                                            {'fixed.acidity <= 10.0': ['bad',
                                                                       'good']}]},
                     {'volatile.acidity <= 0.865': ['good',
                                                    {'alcohol <= 10.7': ['good',
                                                                         'bad']}]}]}


# Prediction

In [20]:
def random_forest_prediction(test_data, forest):
    pred_data = {}
    for i in range(len(forest)):
        col = "tree...{}".format(i)
        predictions = decision_tree_predictions(test_data, tree=forest[i])
        pred_data[col] = predictions
    
    pred_data = pd.DataFrame(pred_data)
    
    return pred_data.mode(axis=1)[0]

In [21]:
predictions = random_forest_prediction(test_data, forest)

accuracy = calculate_accuracy(predictions, test_data.label)
accuracy

0.709375

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=4, criterion='entropy', max_depth=3, max_samples=800, random_state=21)
rf_model.fit(train_data.drop('label', axis=1), train_data['label'])

y_pred = rf_model.predict(test_data.drop('label', axis=1))
accuracy = calculate_accuracy(y_pred, test_data.label)
accuracy

0.740625