# Decision Trees

**Importing libraries**

In [158]:
from IPython.display import display

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

from sklearn.datasets import load_wine

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

**Identifying Outliers**

- can use statistical tests to see if there is statistical significance between your observation and the mean.
- can look at the other data in the column and make estimates based on that
- look at other columns that share common values and make infrence off that
- vizualization
- even using simple methods like linear regression to fill in the missing values of a dataset

**Handling Outliers**

- remove them
- transform them
- A common method is to apply a log transform where you take the logarithm of an outlier to reduce its effect on analysis.

**Missing Values**

- was the value missing because it was not recored
- or becasue it didn't make sense for it to be there

### Outliers and missing data with planets dataset

In [8]:
df = sns.load_dataset('planets')
display(df.head())
display(df.info())

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
method            1035 non-null object
number            1035 non-null int64
orbital_period    992 non-null float64
mass              513 non-null float64
distance          808 non-null float64
year              1035 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


None

In [9]:
print(df.isnull().sum())

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64


In [10]:
dropped_rows = df.dropna(axis=0, how='any', inplace=True)
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 0 to 784
Data columns (total 6 columns):
method            498 non-null object
number            498 non-null int64
orbital_period    498 non-null float64
mass              498 non-null float64
distance          498 non-null float64
year              498 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 27.2+ KB


None

**Graphing remaining planets values**

In [42]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(x=df['orbital_period'], y=df['distance'], mode='markers'), row=1, col=1)
fig.add_trace(go.Scatter(x=df['orbital_period'], y=df['mass'], mode='markers'), row=1, col=2)

fig.update_xaxes(title_text="orbital_period", row=1, col=1)
fig.update_xaxes(title_text="orbital_period", row=1, col=2)
fig.update_yaxes(title_text="distance", row=1, col=1)
fig.update_yaxes(title_text="mass", row=1, col=2)

fig.update_layout(width=600)

fig.show()

In [50]:
fig = make_subplots(rows=1, cols=3)

fig.add_trace(go.Box(y=df['orbital_period']), row=1, col=1)
fig.add_trace(go.Box(y=df['distance']), row=1, col=2)
fig.add_trace(go.Box(y=df['mass']), row=1, col=3)

fig.update_yaxes(title_text="orbital_period", row=1, col=1)
fig.update_yaxes(title_text="distance", row=1, col=2)
fig.update_yaxes(title_text="mass", row=1, col=3)

fig.show()

### Pipeline

The scikit-learn pipeline is used to apply a list of preprocessing steps and the final estimator, all in one step. Another important advantage of using a pipeline is that the same steps can be applied to data in the same fold of a cross-validation.

**Implement a Pipeline with penguin dataset**

features:
- species
- bill_length_mm
- bill_depth_mm
- flipper_length_mm
- body_mass_g

target:
- sex

**Loading in dataset and basic analysis**

In [139]:
df = sns.load_dataset('penguins')
df.dropna(inplace=True)
display(df.head())

# Our baseline is a 50/50 guess
print(df['sex'].value_counts(normalize=True))

# Graph basic relations
fig = px.scatter_matrix(df,
    dimensions=['flipper_length_mm', 'body_mass_g', 'sex'],
    color='sex', height=800)
fig.show()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


Male      0.504505
Female    0.495495
Name: sex, dtype: float64


### Creating a pipline with penguin dataset

**Feature Matrix**

- encoding categorical values

In [99]:
# Set-up the one-hot encoder method
categorical_features = ['species']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

# Set up our preprocessor/column transformer
preprocesser = ColumnTransformer(transformers= [('cat', categorical_transformer, categorical_features)])

# full prediction pipline
clf = Pipeline(steps=[('preprocesser', preprocesser), ('classifier', LogisticRegression())])

**Feature Matrix and Target Array**

In [102]:
features = ['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = df[features]

# Encode the 'sex' column
le = preprocessing.LabelEncoder()
df['sex_encode'] = le.fit_transform(df['sex'])

# Target array
y = df['sex_encode']

**Applying pipline fitting model**

The logistic regression model does worse than our baseline 50/50 guess. When obseving the graph above this makes sense. This pipline was an example of how to construct a pipline less about beating our baseline score

In [103]:
# Apply the pipeline

# Separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model with our logistic regression classifier
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.429


## Decision Tree Basics

- can be used with numerial data where we minimize the varience
- categorical data where our splits are made with the Gini impurity value

### Decision Tree with Penguin dataset

**initiating the pipline**

In [154]:
# Set-up the one-hot encoder method
categorical_features = ['species']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])

# Set up our preprocessor/column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Add the classifier to the preprocessing pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', DecisionTreeClassifier())])

**following same steps of the pipeline as before but with Decisiontree this time**

- with the encoded species catogory the model is under 50% 
- without the encdoed species catogory the model scores: 0.869

In [156]:
penguins = sns.load_dataset("penguins")
penguins.dropna(inplace=True)

# Select features
features = ['species', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
X = penguins[features]

# Encode the 'sex' column
le = preprocessing.LabelEncoder()
penguins['sex_encode'] = le.fit_transform(penguins['sex'])

# Set target array
y = penguins['sex_encode']

# Apply the pipeline

# Separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Fit the model with our logistic regression classifier
pipeline.fit(X_train, y_train)
print("model score: %.3f" % pipeline.score(X_test, y_test))

model score: 0.429


### Decision trees and the wine dataset

- Three classes with samples per class of: [59,71,48]
- Samples total - 178
- Dimensionality - 13
- Features - real, positive

In [254]:
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = pd.Series(data.target)

display(df.shape)
df.head()

(178, 14)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [262]:
# Separate into features and target
X = df.drop('target', axis=1)
y = df['target']

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Instantiate the classifier
classifier=DecisionTreeClassifier(max_depth=10, criterion='entropy')

# Train the model using the training sets
classifier.fit(X_train,y_train)

# Find the model score
print("Decision tree model score: %.3f" % classifier.score(X_test, y_test))

Decision tree model score: 0.978


**looking at feature importance**

In [216]:
importances = pd.DataFrame(classifier.feature_importances_, X.columns, columns=['value'])
importances.reset_index(inplace=True)
importances = importances.sort_values(by='value', ascending=True)

fig = px.bar(y=importances['index'], x=importances['value'], width=600)
fig.show()

