# Lecture 2
---
This is an example Jupyter notebook for some of the topics covered in lecture 2, part of the _"Data science tools and Machine Learning"_ track. See also the `data-science-tools.ipynb` notebook for a good overview of the available methods in `numpy`, `pandas`, and `matplotlib`.

## 1. Data handling
---

In [None]:
#Turn off some warnings which we can ignore for this example
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Standard import(s)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

In [None]:
# Load the iris dataset that ships with scikit-learn.
# The returned object is used below both dictionary-style (`iris.keys()`,
# `iris['target_names']`) and attribute-style (`iris.data`, `iris.target`).
iris = datasets.load_iris()

In [None]:
# List the keys (i.e. the available fields) of the loaded dataset object
iris.keys()

In [None]:
# Print the free-text description stored in the dataset's `DESCR` field.
# `print` is used (rather than echoing the value) so that line breaks are rendered.
print(iris.DESCR)

In [None]:
# Show the size of the dataset as a pair: (shape of feature matrix, shape of target vector).
# The dataset has 150 observations and 4 features, with one target entry per observation.
iris.data.shape, iris.target.shape

In [None]:
# Print the names of the feature columns and of the target classes, one line each
for template, names in (
    ("Feature names: {}", iris.feature_names),
    ("Target names:  {}", iris.target_names),
):
    print(template.format(names))

In [None]:
# Wrap the feature matrix in a pandas.DataFrame with one named column per feature,
# then preview the first rows
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

In [None]:
# Attach the class label to every row in two forms:
#   'type'   -- the class name, looked up from the integer code
#   'target' -- the integer class code itself
df = df.assign(**{
    'type': iris.target_names[iris.target],
    'target': iris.target,
})
df.head()

In [None]:
# Real-world data is often incomplete and/or corrupted, so some cleaning may be
# needed before analysis.

# -- Step 1: drop repeated rows and rows containing null values.
#    Only remove duplicates *if* you have reason to suspect that repeated,
#    identical entries are pathological (they might not be).
df = df.drop_duplicates()
df = df.dropna()

# -- Step 2: drop rows with 'not-a-number' in any of the feature columns
nan = df[iris.feature_names].isna().any(axis=1)
df = df.loc[~nan]

print("Shape after cleaning: {}".format(df.shape))
df.head()

In [None]:
# Round trip through a CSV file: write the DataFrame out, then read it back in
csv_file = "iris.csv"
df.to_csv(csv_file)

# ...

# The first CSV column holds the row index written by `to_csv`, so restore it as the index
df2 = pd.read_csv(csv_file, index_col=0)
df2.head()

In [None]:
# Get summary statistics (count, mean, spread, quantiles) for the DataFrame's columns
df.describe()

In [None]:
# Access a column in two equivalent ways.
# A notebook cell only echoes its *last* expression, so the first variant is
# printed explicitly; otherwise its result would be silently discarded.

# -- Attribute access (only works when the column name is a valid identifier
#    that does not clash with an existing DataFrame attribute)
print(df.type)

# -- Key access (works for any column name)
df['type']

In [None]:
# Show the first 10 rows of a chosen subset of columns
df.loc[:, ['sepal width (cm)', 'petal width (cm)', 'type']].head(10)

In [None]:
# List a subset of rows by row number (position): a plain slice on the
# DataFrame selects rows 10 up to, but not including, 15
df[10:15]

In [None]:
# ... or, equivalently, with the explicit position-based indexer `iloc`
df.iloc[10:15]

In [None]:
# Sometimes, row number and index differ (e.g. when shuffling).
# Sampling a fraction of 1 without replacement returns every row exactly once in
# random order; `random_state` fixes that order. Each row keeps its original index label.
# NOTE: `df` is rebound to the shuffled frame, so re-running this cell on its own
# shuffles the already-shuffled rows again and changes the order seen below.
df = df.sample(frac=1, replace=False, random_state=1234)
df.head(15)

In [None]:
# This illustrates the difference in accessing rows by row number (position):
# `iloc` returns whatever rows currently sit at positions 10 to 14 after the shuffle
df.iloc[10:15]

In [None]:
# ... or by *index*: `loc` selects by index label rather than by position,
# so the result differs from the `iloc` slice above once the rows are shuffled
df.loc[10:15]

In [None]:
# Select the measurements with target > 0, take their 'type' column,
# and list the distinct types that occur among them
(
    df
    .query('target  > 0')
    .loc[:, 'type']
    .unique()
)

In [None]:
# Count the number of rows in each class: group by 'type' and count the
# entries in every remaining column
df.groupby('type').count()

In [None]:
# Sort the rows by sepal length, highest values first (ascending=False), then show
# only the relevant columns (sepal width and petal length) for the first 2 rows
(
    df
    .sort_values(by=['sepal length (cm)'], ascending=False)
    .loc[:, ['sepal width (cm)', 'petal length (cm)']]
    .head(2)
)

In [None]:
# Group the data by type, take the mean sepal length of each group,
# and sort the groups by descending mean
(
    df
    .groupby('type')['sepal length (cm)']
    .mean()
    .sort_values(ascending=False)
)

In [None]:
# Keep only the entries whose sepal length exceeds 6.0, then count them per type
mask = df['sepal length (cm)'].gt(6.0)
df.loc[mask].groupby('type').count()

In [None]:
# Convert a pandas.Series (a single DataFrame column) to a numpy.ndarray.
# `.to_numpy()` is used instead of `.values` because it always returns a NumPy
# array, whereas `.values` may return a pandas extension array for some dtypes.
array = df['type'].to_numpy()
array

In [None]:
# Convert the numpy.array to a plain python list (one list element per array entry)
array.tolist()

## 2.  Visualisation
---

In [None]:
# Feature to plot
feat = 'petal width (cm)'

# Bin range: 15 equal-width bins spanning [0, 3] (i.e. 16 bin edges).
# The same edges are used for every class so that the histograms are comparable.
bins = np.linspace(0, 3, 15 + 1, endpoint=True)

# Create figure and axis objects.
fig, ax = plt.subplots()

# Overlay one semi-transparent histogram per class
for t in iris.target_names:
    # Boolean mask selecting the rows of the current type
    mask = df['type'] == t

    # Select rows and column in a single `.loc` step (no chained indexing)
    ax.hist(df.loc[mask, feat], bins=bins, alpha=0.5, label=t)

# Decorations
ax.legend()
ax.set_xlabel(feat)
ax.set_ylabel("Number of entries")

# `plt.show()` renders the figure in the notebook; `fig.show()` only works with a
# GUI backend and emits a warning under the inline backend.
plt.show()

In [None]:
# Features to plot
featx = 'sepal length (cm)'
featy = 'sepal width (cm)'

# Create figure and axis objects.
fig, ax = plt.subplots()

# Draw one semi-transparent point cloud per class
for t in iris['target_names']:
    # Boolean mask selecting the rows of the current type
    mask = df['type'] == t

    # Select rows and column in a single `.loc` step (no chained indexing)
    ax.scatter(df.loc[mask, featx], df.loc[mask, featy], label=t, alpha=0.5)

# Draw legend and axis labels
ax.legend()
ax.set_xlabel(featx)
ax.set_ylabel(featy)

# `plt.show()` renders the figure in the notebook; `fig.show()` only works with a
# GUI backend and emits a warning under the inline backend.
plt.show()

In [None]:
# Pairwise scatter plots of the features, coloured by class.
# The integer 'target' column is dropped first: it encodes the same information
# as 'type' (used for the colours) and would otherwise be drawn as an extra,
# meaningless variable in the grid.
# The trailing ';' suppresses the textual repr of the returned grid object.
sns.pairplot(df.drop(columns='target'), hue='type');

## 4. Decision Trees
---

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics # Import scikit-learn metrics module for accuracy calculation
from sklearn import preprocessing # Import preprocessing for String-Int conversion

In [None]:
# Only focus on petal length and width: the feature columns from index 2 onwards
petal_columns = slice(2, None)

X = iris.data[:, petal_columns]
Y = iris.target
feature_names = iris.feature_names[petal_columns]

print("given:", feature_names,
      "\npredict whether:", iris.target_names)

In [None]:
# use matplotlib as you did on previous labs
import matplotlib.pyplot as plt

# One matplotlib format string (colour + marker) per class, indexed by target code
color_map = ["yo", "bs", "g^"]

# Column 0 of X is petal length (x axis), column 1 is petal width (y axis)
petal_length, petal_width = X[:, 0], X[:, 1]

for target_index, target_name in enumerate(iris.target_names):
    # Rows whose label equals the current class
    selected = Y == target_index
    plt.plot(petal_length[selected],
             petal_width[selected],
             color_map[target_index],
             label=target_name)

plt.xlabel("petal length")
plt.ylabel("petal width")
plt.legend()
plt.show()

#### Split of the Dataset

* Splitting the dataset into a training set and a test set is fundamental. Usually the training set consists of 70% of the data and the test set of 30% (the `test_size` option of the `train_test_split` function sets the fraction of the data held out for testing).
* After the split, the training set (`x_train`, `y_train`) is used to train the algorithm.
* After the training, the model predicts the outcome for the unseen test features (`x_test`), and these predictions are compared with the true labels (`y_test`).
* The accuracy_score function is then used to give an estimate of the accuracy of the prediction.

In [None]:
# Split the dataset into a training set (70%) and a test set (30%).
# `random_state` fixes the shuffle so that the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=1
)

# Create the Decision Tree classifier object.
# `random_state` is fixed as well: scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise be broken
# differently from run to run, changing the tree and the reported accuracy.
dt = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1)

# Train the Decision Tree classifier on the training set
dt = dt.fit(x_train, y_train)

# Predict the response for the test dataset and compare with the true labels
y_pred = dt.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz DOT file, with filled, rounded node boxes
# and the original feature names as split labels
export_graphviz(
    dt,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names,
    filled=True,
    rounded=True,
)


In [None]:
# Draw the fitted decision tree on a large canvas and save the figure as a PNG file
fig = plt.figure(figsize=(25, 20))
_ = plot_tree(
    dt,
    feature_names=iris.feature_names,
    class_names=list(iris.target_names),
    filled=True,
    rounded=True,
)
fig.savefig("decision_tree.png")

Feature importance from the DT is calculated during the training (`feature_importances_`) and encodes how successful each feature is at splitting the samples.

In [None]:
# Indices that order the features from least to most important
isort = np.argsort(dt.feature_importances_)

# Importance values in that order, labelled with the matching feature names
forest_importances = pd.Series(
    data=dt.feature_importances_[isort],
    index=np.array(iris.feature_names)[isort],
)

# Bar chart of the sorted importances
fig, ax = plt.subplots()
forest_importances.plot(kind="bar", ax=ax)
ax.set(title="Feature importances", ylabel="Feature importance")
fig.tight_layout()


# Ensemble methods

## Boosting: GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [None]:
# Here we use the gradient boosting classifier: an ensemble of shallow trees
# that are fitted one after another.
n_estimators = 20

# `random_state` seeds the individual trees and the feature permutation at each
# split, so that the fitted model and the printed accuracy are reproducible.
bdt = GradientBoostingClassifier(max_depth=3, n_estimators=n_estimators, random_state=1)
bdt.fit(x_train, y_train)

# Evaluate on the held-out test set
y_pred = bdt.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

## Bagging: Random forest

In [None]:
# Fit a random forest classifier (20 trees, each at most 4 levels deep).
# A random forest draws bootstrap samples and random feature subsets, so
# `random_state` is fixed to make the fitted model and the printed accuracy
# reproducible across runs.
rf = RandomForestClassifier(n_estimators=20, max_depth=4, random_state=1)
rf.fit(x_train, y_train)

# Evaluate on the held-out test set
y_pred = rf.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))