# <font color="#49699E" size=40>Supervised Learning with Tree-Based Models</font>

# LEARNING OBJECTIVES
# LEARNING MATERIALS


# INTRODUCTION


## Imports

In [None]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import graphviz

from dcss.plotting import plot_knn_decision_boundaries
from dcss.plotting import custom_seaborn

# Call the plotting helper imported from dcss.plotting. Presumably this sets
# the seaborn/matplotlib styling for the notebook -- the helper's body is not
# visible here, so confirm against the dcss package.
custom_seaborn()

## Preparing the Data


In [None]:
# Read the combined CSV into the frame that the first train/test split draws from.
forml = pd.read_csv(
    filepath_or_buffer="../data/vdem_internet_freedom_combined/vdem_fh_combined.csv",
)

## The Train-Test Split and Cross-Validation


In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the five `v2x_*` index columns, kept as a 2-D DataFrame.
X = forml[[
    'v2x_polyarchy',
    'v2x_libdem',
    'v2x_partipdem',
    'v2x_delibdem',
    'v2x_egaldem',
]]

# Target: select with single brackets so y is a 1-D Series. A one-column
# DataFrame (double brackets) is a column vector, which scikit-learn
# estimators flag with a DataConversionWarning when fitted.
y = forml['Total Score']

# Seeded split so the held-out rows are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

# RULES-BASED LEARNING WITH TREES
## Decision Trees


In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source
from sklearn.preprocessing import LabelEncoder

# Load the dem_indices table used by the classification examples.
dem_indices = pd.read_csv("../data/vdem_internet_freedom_combined/dem_indices.csv")

# Each predictor column is paired with the readable label passed to the tree
# exports as a feature name. The dict's insertion order fixes both the column
# order of X and the order of interpretable_names, keeping the two aligned.
feature_labels = {
    "v2smgovdom_osp": "Domestic Misinformation",  # Government dissemination of false information domestic
    "v2smgovfilprc_osp": "Internet Filtering",  # Government internet filtering in practice
    "v2smgovsmcenprc_osp": "Social Media Censorship",  # Government social media censorship in practice
    "v2smonper_osp": "Online Media Diversity",  # Diversity of online media perspectives (0 = gov't only, 4 = any perspective)
    "v2smarrest_osp": "Arrests for Political Content",  # Arrests for political content disseminated online
}

X = dem_indices[list(feature_labels)]
interpretable_names = list(feature_labels.values())

# The two outcome classes; the fitted encoder's `classes_` attribute supplies
# the class names used when the trees are exported.
regime_types = ["Autocracy", "Democracy"]
le = LabelEncoder().fit(regime_types)
labels = le.transform(regime_types)

# Binary target: a regime code of 1 or lower becomes class 0, anything else class 1.
# NOTE(review): a missing v2x_regime compares False and therefore lands in
# class 1 -- confirm the column has no missing values.
is_low_regime_code = dem_indices["v2x_regime"] <= 1
y = np.where(is_low_regime_code, 0, 1).copy()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# Re-split using the classification features and binary target (seeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

# Five seeded random resamples of the training data, each holding out 30%.
shuffsplit = ShuffleSplit(random_state=42, test_size=0.3, n_splits=5)

# Unconstrained decision tree, scored on each resample.
dtclass = DecisionTreeClassifier(random_state=0)
dt_scores = cross_val_score(estimator=dtclass, X=X_train, y=y_train, cv=shuffsplit)

# Per-split scores on one line, their mean on the next.
print(dt_scores, f"Mean: {dt_scores.mean()}", sep="\n")

In [None]:
# dem_indices.v2x_regime

In [None]:
from sklearn import preprocessing

# Fit the unconstrained tree on the whole training split; fit() hands back
# the estimator it was called on, so dt_fitted and dtclass are the same object.
dt_fitted = dtclass.fit(X_train, y_train)

# Write the fitted tree to a Graphviz file, labelled with the readable
# feature names and the encoder's class names.
export_graphviz(
    decision_tree=dt_fitted,
    out_file='../graphical_models/classified_1.gv',
    class_names=le.classes_,
    feature_names=interpretable_names,
    rounded=True,
    filled=False,
)

### What About Overfitting?


In [None]:
# Same classifier as before, but capped at three levels to limit overfitting.
dtclass_pruned = DecisionTreeClassifier(max_depth=3, random_state=0)

# Keep the pruned tree's scores under their own name so that the unpruned
# tree's `dt_scores` are not overwritten and the two can still be compared
# (and so re-running cells out of order cannot silently swap the values).
dt_pruned_scores = cross_val_score(dtclass_pruned, X_train, y_train, cv=shuffsplit)
print(dt_pruned_scores)
print(f"Mean: {dt_pruned_scores.mean()}")

In [None]:
# Fit the depth-limited tree on the whole training split (in place).
dtclass_pruned.fit(X_train, y_train)

# Write the fitted pruned tree to a Graphviz file with readable labels.
export_graphviz(
    decision_tree=dtclass_pruned,
    out_file='../graphical_models/pruned.gv',
    class_names=le.classes_,
    feature_names=interpretable_names,
    rounded=True,
    filled=False,
)

In [None]:
# Score the fitted pruned tree on the held-out test split; as the cell's last
# expression the value is displayed as the cell output.
dtclass_pruned.score(X=X_test, y=y_test)

# ENSEMBLE LEARNING


In [None]:
from sklearn.ensemble import BaggingClassifier

# Ensemble of 100 decision trees, each trained on a bootstrap resample;
# seeded so the resampling is repeatable.
bag_of_trees = BaggingClassifier(
    DecisionTreeClassifier(),
    random_state=0,
    bootstrap=True,
    n_estimators=100,
)

# Score the ensemble on the same shuffle-split resamples as the single trees.
bt_scores = cross_val_score(estimator=bag_of_trees, X=X_train, y=y_train, cv=shuffsplit)

# Per-split scores on one line, their mean on the next.
print(bt_scores, f"Mean: {bt_scores.mean()}", sep="\n")

## Random Forests


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 100 trees with max_features set to 2.
rforest = RandomForestClassifier(
    random_state=0,
    max_features=2,
    n_estimators=100,
)

# Score the forest on the same shuffle-split resamples as the other models.
rforest_scores = cross_val_score(estimator=rforest, X=X_train, y=y_train, cv=shuffsplit)

# Per-split scores on one line, their mean on the next.
print(rforest_scores, f"Mean: {rforest_scores.mean()}", sep="\n")

In [None]:
# Fit the forest on the whole training split (in place).
rforest.fit(X_train, y_train)

# Export one member tree of the fitted forest (the one at index 6) to a
# Graphviz file with readable labels.
export_graphviz(
    decision_tree=rforest.estimators_[6],
    out_file='../graphical_models/rf_classified.gv',
    class_names=le.classes_,
    feature_names=interpretable_names,
    rounded=True,
    filled=False,
)

## Gradient Boosted Machines


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded gradient-boosting classifier with 100 boosting stages.
gboost = GradientBoostingClassifier(random_state=0, n_estimators=100)

# Score it on the same shuffle-split resamples as the other models.
gboost_scores = cross_val_score(estimator=gboost, X=X_train, y=y_train, cv=shuffsplit)

# Per-split scores on one line, their mean on the next.
print(gboost_scores, f"Mean: {gboost_scores.mean()}", sep="\n")

In [None]:
# The bagging and gradient-boosting estimators have only been passed to
# cross_val_score so far, so fit them on the full training split before
# scoring. The two decision trees and the random forest were already fitted
# in earlier cells. Fitting here as explicit statements (rather than inside
# the list literal) keeps the side effect visible.
bag_of_trees.fit(X_train, y_train)
gboost.fit(X_train, y_train)

# Same objects, same order as before; fit() returns the estimator itself.
model_list = [dtclass, dtclass_pruned, bag_of_trees, rforest, gboost]

# Display labels, in the same order as model_list, so each printed score can
# be attributed to its model.
model_names = [
    "Decision tree",
    "Pruned decision tree",
    "Bagged trees",
    "Random forest",
    "Gradient boosting",
]

# Held-out test score for every model.
for name, model in zip(model_names, model_list):
    print(f"{name}: {model.score(X_test, y_test)}")


# Evaluation Beyond Accuracy
## Balancing False Positives and False Negatives in Classification Models
## Improving Binary Classification with Curves
### Precision-Recall Curves
### Beyond Binary Classifiers 


# CONCLUSION
## Key Points 
