## Winters, Alexander (V00970263)

# Problem 6. Cross-Regression

### Sources:

https://www.saedsayad.com/decision_tree_reg.htm

https://python-course.eu/machine-learning/regression-trees-in-python.php

https://en.wikipedia.org/wiki/Decision_tree_learning

https://www.youtube.com/watch?v=UhY5vPfQIrA

## 6.1 Regress

In [1]:
import numpy as np
np.random.seed(1337)

In [2]:
import pandas as pd
# Plotting support
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
elections_df = pd.read_csv('elections_clean.csv')
elections_df.columns

Index(['votes', 'unemployment', 'med_hh_inc', 'per_capita_inc',
       'poverty_all_ages', 'deep_pov_all', 'deep_pov_children', 'population',
       'total_area', 'pop_density', 'total_male', 'total_female',
       'voter_turnout', 'democrat', 'county', 'state', 'education', 'religion',
       'age_young', 'age_adult', 'age_old', 'ethnic_male', 'ethnic_female'],
      dtype='object')

In [4]:
# Get the label vector
label_vector = elections_df.pop('deep_pov_all')

# Take only the categorial features
categorial_features = ['education', 'religion', 'ethnic_male', 'ethnic_female', 'per_capita_inc']

# We only want the categorial features and our label vector
elections_df = elections_df[categorial_features]
elections_df['deep_pov_all'] = label_vector

In [5]:
def get_feature_variance(elections_df, feature_name, attribute):
    feature_values = np.unique(elections_df[feature_name])
    feature_variance = 0
    
    for value in feature_values:
        subset = elections_df[elections_df[feature_name] == value].reset_index()
        weight = (len(subset) / len(elections_df)) * subset[attribute].var(ddof=0)
        #print(value, weight, subset[attribute].var(ddof=0))
        feature_variance += weight
    
    return feature_variance

In [6]:
def min_variance(elections_df, attribute, categorial_features):
    min_feature_variance = {}
    
    for feature in categorial_features:
        feature_variance = get_feature_variance(elections_df, feature, attribute)
        min_feature_variance[feature] = feature_variance
        
    return min(min_feature_variance, key=min_feature_variance.get)

In [7]:
# Similar to my Problem 2 solution

def ID3_Regression(elections_df, attribute, categorial_features, parent=None):
    # Check if label vector is pure and return it to go back up the tree
    if len(np.unique(elections_df[attribute])) <= 1:
        return np.unique(elections_df[attribute])[0]
    
    # Check if we reached the end of the branch which returns the parent and goes up the tree
    if len(categorial_features) == 0:
        return parent
    
    # Check if split is dirty and return the original mean
    if len(elections_df) == 0:
        return elections_df[attribute].mean()
    
    best_feature = min_variance(elections_df, attribute, categorial_features)
    best_feature_values, value_counts = np.unique(elections_df[best_feature], return_counts=True)
    
    # Make the parent node the most common element
    # Gets rid of values equal to 'None'
    parent = elections_df[attribute].mode()[0]
    
    tree = {best_feature:{}}
    # Remove the best feature from our feature list
    feats = [i for i in categorial_features if i != best_feature]

    for value in best_feature_values:
        subset = elections_df[elections_df[best_feature] == value].reset_index(drop=True)
        tree[best_feature][value] = ID3_Regression(subset, attribute, feats, parent)

    return tree

In [8]:
DTree = ID3_Regression(elections_df, 'deep_pov_all', categorial_features)

In [9]:
import pprint
pprint.pprint(DTree)

{'per_capita_inc': {-2.781910127975709: 0.1020299145,
                    -2.27314560374485: 0.2354295280999999,
                    -2.2457870150581853: 0.126516912,
                    -2.2405257480030576: 0.1905687895,
                    -2.169323267190328: 0.1842655382,
                    -2.164763502409217: 0.2110502738,
                    -2.163535873429688: 0.1453140578,
                    -2.137404913722553: 0.1322330483,
                    -2.1305652665508865: 0.2289709416999999,
                    -2.126882379612297: 0.127209344,
                    -2.122147239262682: 0.204189661,
                    -2.0928595193224706: 0.1780971723,
                    -2.0881243789728554: 0.165010254,
                    -2.06690393518384: 0.1850398116,
                    -2.03726546410662: 0.2172346002999999,
                    -2.0246384231743133: 0.1033566433999999,
                    -1.999559716878204: 0.128,
                    -1.981496033322265: 0.2331310246,
            

In [10]:
# Shuffle data for 70/30 split into training and validation sets
shuffled_dataset = elections_df.sample(frac=1).reset_index(drop=True)
split = int(elections_df.shape[0] * 0.7)

X_train = shuffled_dataset.iloc[:split].reset_index(drop=True)
X_test = shuffled_dataset.iloc[split:].reset_index(drop=True)

In [11]:
def predict(DTree, validation_sample):
    # Check each feature in the sample but ignore missing features
    for feature in list(validation_sample.keys()):
        if feature not in list(DTree.keys()):
            continue
        try:
            result = DTree[feature][validation_sample[feature]]
            # Check if node is leaf and if not go further down tree
            if(type(result) == float): 
                return result
            else:
                return predict(result, validation_sample)

        except:
            return elections_df['deep_pov_all'].mean()
        

In [12]:
def validate_rsme(DTree, validation_set):
    y_actual = validation_set.pop('deep_pov_all')
    validation_set['y_predicted'] = y_actual
    y_predicted = np.array([])
    
    for index, row in validation_set.iterrows():
        result = predict(DTree, validation_set.iloc[index])
        y_predicted = np.append(y_predicted, result)
    
    mse = np.square(np.subtract(y_actual, y_predicted)).mean()
    rmse = np.sqrt(mse)
    return rmse 

In [13]:
train_rmse = validate_rsme(DTree, X_train)
test_rmse = validate_rsme(DTree, X_test)
print("The Training RMSE:", train_rmse)
print("The Test RMSE:", test_rmse)

The Training RMSE: 0.034066377539023154
The Test RMSE: 0.03210919634077819


## 6.2 Cross-Validate

### Sources:

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html#sphx-glr-auto-examples-tree-plot-tree-regression-py

https://github.com/Soft-Squad/kNN-kMeans-Softmax/blob/main/src/knn-kmeans.py - my CSC421 work

In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import random

In [15]:
elections_df = pd.get_dummies(elections_df, categorial_features[:4])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(elections_df.drop('deep_pov_all', axis=1).values, elections_df['deep_pov_all'].values, train_size=0.7)
RegressionDTree = DecisionTreeRegressor(criterion='squared_error')

In [17]:
def cross_validation(X_train, X_test, y_train, y_test):
    folds = 5
    
    X_train_folds = np.array([])
    y_train_folds = np.array([])
    
    X_train_folds = np.array_split(X_train, folds)
    y_train_folds = np.array_split(y_train, folds)
    
    rsme_values = {}
    
    for i in range(folds):
        X_test = X_train_folds[i]
        y_test = y_train_folds[i]
        X_train = X_train_folds
        y_train = y_train_folds
        
        tmp = np.delete(np.array(X_train, dtype='object'), i, axis=0)
        X_train = np.concatenate((tmp), axis=0)
        y_train = np.delete(np.array(y_train, dtype='object'), i, axis=0)
        y_train = np.concatenate((y_train), axis=0)

        RegressionDTree.fit(X_train, y_train)
        prediction = RegressionDTree.predict(X_test)

        mse = np.square(np.subtract(y_test, prediction)).mean()
        rsme = np.sqrt(mse)
        rsme_values[i] = rsme
        
    return rsme_values
    

In [18]:
cv_results = cross_validation(X_train, X_test, y_train, y_test)

print("5-Fold RMSE:")
for key, value in cv_results.items():
    print("Fold " + str(key + 1) + ": " + str(value))

min_rmse = min(cv_results, key=cv_results.get)

print("\nThe best Regression-Tree is: Fold " + str(min_rmse + 1) + " with RMSE: " + str(cv_results[min_rmse]))

5-Fold RMSE:
Fold 1: 0.02975602812647828
Fold 2: 0.03061717632312898
Fold 3: 0.03149920748484207
Fold 4: 0.03170431343744039
Fold 5: 0.03330438591867778

The best Regression-Tree is: Fold 1 with RMSE: 0.02975602812647828
