In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source

import warnings
warnings.simplefilter(action='ignore', category=Warning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

![Decision tree diagram](https://datascience.foundation/img/pdf_images/understanding_decision_trees_with_python_decision_tree.png)

Image source: datascience.foundation

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 10px; background: #CD5C5C;"><b style="color:white;">Decision Tree</b></h1></center>


"Explaining Decision Trees for Machine Learning" by z_ai

"In the Machine Learning world, Decision Trees are a kind of non-parametric model that can be used for both classification and regression."

"This means that Decision trees are flexible models that don’t increase their number of parameters as we add more features (if we build them correctly), and they can either output a categorical prediction or a numerical prediction."

"They are constructed using two kinds of elements: nodes and branches. At each node, one of the features of our data is evaluated in order to split the observations in the training process or to make a specific data point follow a certain path when making a prediction."

https://towardsdatascience.com/decision-trees-explained-3ec41632ceb6

In [None]:
# Load the district metadata table and preview its first rows.
districts_csv = '../input/learnplatform-covid19-impact-on-digital-learning/districts_info.csv'
df = pd.read_csv(districts_csv, encoding='utf8')
df.head()

# Handling Missing Values

In [None]:
#Code by Parul Pandey  https://www.kaggle.com/parulpandey/a-guide-to-handling-missing-values-in-python


from sklearn.impute import SimpleImputer
# Work on a copy so the raw frame `df` is left untouched.
df_most_frequent = df.copy()
# The strategy used here is 'most_frequent' (each column's most common value),
# not the mean -- the variable name `mean_imputer` is historical.
mean_imputer = SimpleImputer(strategy='most_frequent')# strategy can also be mean or median 
# Write the imputed array back into the existing frame so its columns and index are kept.
# NOTE(review): how dtypes are handled by this full-slice assignment depends on the pandas version -- confirm.
df_most_frequent.iloc[:,:] = mean_imputer.fit_transform(df_most_frequent)

In [None]:
# Number of missing values left in each column after imputation.
df_most_frequent.isna().sum()

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fill gaps in float columns with the mean of the same column in the raw frame.
# Series.fillna returns a new Series, so the result must be assigned back;
# calling it without the assignment leaves the column unchanged.
for c in df_most_frequent.columns:
    if pd.api.types.is_float_dtype(df_most_frequent[c]):
        df_most_frequent[c] = df_most_frequent[c].fillna(df[c].mean())

# Anything still missing (i.e. in non-float columns) gets the sentinel -999.
df_most_frequent = df_most_frequent.fillna(-999)

# Replace every object-dtype column by integer codes, one encoder per column.
# The values are passed as a plain list, exactly as before.
for f in df_most_frequent.columns:
    if df_most_frequent[f].dtype == 'object':
        lbl = LabelEncoder()
        df_most_frequent[f] = lbl.fit_transform(list(df_most_frequent[f].values))

print('Labelling done.')

In [None]:
# Preview the first five rows of the imputed, label-encoded frame.
df_most_frequent.head(n=5)

# Heatmap with Brazil's colors

In [None]:
import seaborn as sbn

# Pairwise correlations between the encoded columns, drawn as an annotated
# heatmap on an explicit 15x15 inch figure using the green/yellow 'summer' map.
correlation = df_most_frequent.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sbn.heatmap(correlation, annot=True, cmap=plt.cm.summer, ax=ax);

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source


from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split


from scipy.stats import skew
# Use matplotlib's ggplot style sheet for the figures drawn after this point.
plt.style.use(style='ggplot')

In [None]:
# Target: the encoded 'pct_black/hispanic' column.
y = df_most_frequent['pct_black/hispanic']
# Features: every column except the target and 'locale'; any remaining gaps
# are filled with the sentinel 999999.
x = df_most_frequent.drop(columns=['locale', 'pct_black/hispanic']).fillna(999999)

In [None]:
# Depth-limited classification tree. random_state is fixed because scikit-learn
# permutes the features at every split, so ties between equally good splits
# would otherwise be broken differently from run to run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
# Fit the tree on the feature frame and target built above.
dt.fit(X=x, y=y)

In [None]:
# One importance score per feature, indexed by feature name.
dt_feat = pd.DataFrame({'feat_importance': dt.feature_importances_}, index=x.columns)
# Sort ascending and keep the last (largest) eight rows for the bar chart.
top_feat = dt_feat.sort_values(by='feat_importance').tail(8)
top_feat.plot.barh(figsize=(14, 6), color='green')
plt.show()

In [None]:
from IPython.display import SVG, display

# Graphviz location of a local Windows install. It is added to PATH only when
# the directory exists and is not already listed: rerunning the cell no longer
# grows PATH, and on POSIX systems the 'C:' prefix is no longer split on ':'
# into a stray relative PATH entry.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
if os.path.isdir(graphviz_bin) and graphviz_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + graphviz_bin

# Export the fitted tree as DOT text and render it inline as SVG.
graph = Source(export_graphviz(dt, out_file=None, feature_names=list(x.columns), filled=True))
display(SVG(graph.pipe(format='svg')))

In [None]:
# Target for the second tree: the encoded 'pct_free/reduced' column.
y = df_most_frequent['pct_free/reduced']
# Features: every column except the target and 'pp_total_raw'; any remaining
# gaps are filled with the sentinel 999999.
x = df_most_frequent.drop(columns=['pp_total_raw', 'pct_free/reduced']).fillna(999999)

<h1><span class="label label-default" style="background-color:#CD5C5C;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:white; padding:10px">A Second Decision Tree</span></h1><br>

In [None]:
# Depth-limited classification tree for the second target. random_state is
# fixed because scikit-learn permutes the features at every split, so ties
# between equally good splits would otherwise vary from run to run.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
# Fit the second tree on the current feature frame and target.
dt.fit(X=x, y=y)

In [None]:
# One importance score per feature of the second tree, indexed by feature name.
dt_feat = pd.DataFrame({'feat_importance': dt.feature_importances_}, index=x.columns)
# Sort ascending and keep the last (largest) eight rows for the bar chart.
top_feat = dt_feat.sort_values(by='feat_importance').tail(8)
top_feat.plot.barh(figsize=(14, 6), color='red')
plt.show()

In [None]:
from IPython.display import SVG, display

# Graphviz location of a local Windows install. It is added to PATH only when
# the directory exists and is not already listed: rerunning the cell no longer
# grows PATH, and on POSIX systems the 'C:' prefix is no longer split on ':'
# into a stray relative PATH entry.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
if os.path.isdir(graphviz_bin) and graphviz_bin not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + graphviz_bin

# Export the fitted tree as DOT text and render it inline as SVG.
graph = Source(export_graphviz(dt, out_file=None, feature_names=list(x.columns), filled=True))
display(SVG(graph.pipe(format='svg')))

In this figure we can observe three kinds of nodes:

"The Root Node: This is the node that starts the graph. In a normal decision tree it evaluates the variable that best splits the data."

"Intermediate nodes: These are nodes where variables are evaluated but which are not the final nodes where predictions are made."

"Leaf nodes: These are the final nodes of the tree, where the predictions of a category or a numerical value are made."

<h1><span class="label label-default" style="background-color:#CD5C5C;border-radius:100px 100px; font-weight: bold; font-family:Garamond; font-size:20px; color:white; padding:10px">Making Predictions</span></h1><br>

"All we have to do is start at the root node, look at the value of the feature that it evaluates, and depending on that value go to the left or right children node."

"This process is repeated until we reach a leaf node. When this happens, depending on whether we are facing a classification or a regression problem two things can happen:"

"If we are facing a classification problem, the predicted category would be the mode of the categories on that leaf node"

"For a regression tree, the prediction we make at the end is the mean of the values for the target variable at such leaf node. If a leaf node had 4 samples with values 20, 18, 22, and 24, then the predicted value at that node would be 21, the mean of the 4 training examples that end there."

Note: the two trees in our digital learning case above are built with `DecisionTreeClassifier`, so their leaves predict the most frequent class (the classification case), not a mean.

https://towardsdatascience.com/decision-trees-explained-3ec41632ceb6