# Intro to Machine Learning

In [61]:
import requests as rq
import pandas as pd
import csv

Data = rq.get("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")

with open('iris.data','w') as f:
     f.write(Data.text)

with open('iris.data') as f:
    reader = csv.reader(f)
    iris_data = [x for x in reader]   
            
iris_DataFrame = pd.DataFrame(iris_data)
iris_DataFrame.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Class']
iris_DataFrame

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


# First Machine Learning Model

In [62]:
from sklearn.tree import DecisionTreeRegressor

iris_DataFrame2 = iris_DataFrame.dropna(axis=0)

y = iris_DataFrame2.Class #take the output as the classes
X = iris_DataFrame2.loc[:,['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']] #take all the other columns as features

#####make y an int column 
for i in range(len(y)):
    if y[i] == "Iris-setosa":
        y[i] = 1
    elif y[i] == "Iris-versicolor":
        y[i] = 2
    else:
        y[i] = 3

        
iris_model = DecisionTreeRegressor(random_state=1)
iris_model.fit(X,y) #fit the model

#predictions based on the first 100 elements
iris_model.predict(X.head(100))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

# Model Validation

In [63]:
from sklearn.model_selection import train_test_split #divide the dataset
from sklearn.tree import DecisionTreeRegressor #the model
from sklearn.metrics import mean_absolute_error #he average of all the absolute errors

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1) # devide the dataset in train and validation

iris_model = DecisionTreeRegressor(random_state = 1) #specify the model
iris_model.fit(train_X, train_y) #fit the model with the training data

val_predictions = iris_model.predict(val_X) #make predictions with the val_X data

val_mae = mean_absolute_error(val_y, val_predictions) #the error of the predictions

val_mae

0.02631578947368421

# Underfitting and Overfitting

In [64]:
#The more leaves we allow the model to make, the more we move from the underfitting area in the above graph 
#to the overfitting area.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

import numpy as np

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

#will try each value of leaf and return the mean_abs_error of each one
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y): 
    model = DecisionTreeRegressor (max_leaf_nodes = max_leaf_nodes, random_state = 1)
    model.fit(train_X, train_y)
    
    val_predictions = model.predict(val_X)
    val_mae = mean_absolute_error(val_y, val_predictions)
    
    return val_mae

#leafs that i propose
leaf_nodes = [5, 25, 50, 100, 250, 500]
my_mae = np.zeros(len(leaf_nodes))
i = 0

#a loop to use each leaf we propose
for i in range(len(leaf_nodes)):
    my_mae[i] = get_mae(leaf_nodes[i], train_X, val_X, train_y, val_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(leaf_nodes[i], my_mae[i]))

best_tree_size = 100 # we need to find the one in the middle  
    
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)


Max leaf nodes: 5  		 Mean Absolute Error:  0
Max leaf nodes: 25  		 Mean Absolute Error:  0
Max leaf nodes: 50  		 Mean Absolute Error:  0
Max leaf nodes: 100  		 Mean Absolute Error:  0
Max leaf nodes: 250  		 Mean Absolute Error:  0
Max leaf nodes: 500  		 Mean Absolute Error:  0


DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)

# Random Forest

In [65]:
#The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree. 
#It generally has much better predictive accuracy than a single decision tree and it works well with default parameters.

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)

melb_preds = forest_model.predict(val_X)
print(mean_absolute_error(val_y, melb_preds))

0.02736842105263158
