# Choosing the right estimator/algorithm for our problems

##### Sklearn uses estimator as another term for machine learning model or algorithm

In [1]:
what_we_are_covering = ['0. An end-to-end Scikit workflow.',
    '1. Getting the Data ready.',
    '2. Choose the right estimator/algorithm for our problems.',
    '3. Fit the model/algorithm and use it to make predictions on our Data.',
    '4. Evaluate a model.',
    '5. Improve a model.',
    '6. Save and load a trained model.',
    '7. Putting it all together']

In [2]:
# Standard imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### * Classification - Predictating whether a sample is one way or another.

### * Regression - Predictating a number 

# Scikit-Learn ML Map

  <img src="ml_map.png">

### 2.1 Picking a machine learning model for a regression problem

In [3]:
# Import Boston housing dataset 

from sklearn.datasets import load_boston

boston = load_boston()
boston;

In [4]:
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_df['target'] = pd.Series(boston['target'])
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


#### How many samples?

In [5]:
len(boston_df)

506

### Try the RidgeRegression model

In [6]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# set up random seed

np.random.seed(42)

In [7]:
# Create the data

X = boston_df.drop('target', axis=1)
y = boston_df['target']


X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2)

# Instantiate Ridge model

model = Ridge()
model.fit(X_train, y_train)


Ridge()

### Check the score on test data

In [8]:
model.score(X_test, y_test)

0.6581117489573478

##### How do we improve this score?

##### What if Ridge wasn't working?

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Set up a random seed

np.random.seed(42)

# Create the data

X = boston_df.drop('target', axis=1)
y = boston_df['target']


# Split the data 

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)

# Instantiate Random Forest Regressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Evaluate the Random Forest Regressor

rf.score(X_test, y_test)

0.7795471407858077

In [10]:
# Check the Ridge model again

model.score(X_test, y_test)

0.6821401869811682

In [11]:
rf.score(X_test, y_test)

0.7795471407858077

## 2.2 Choosing an estimator for a Classification problem

In [12]:
# Choose the Data

heart_disease = pd.read_csv('C:/Users/gthom/sample_project/Data/heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


#### From the start of the Decision tree:
##### Start -> >50 samples ?

In [15]:
len(heart_disease)

303

##### Predicting a category?

###### Yes, 
##### Labeled data?
###### Yes
##### <100k ?
###### Yes

### Linear SVC.

##### Linear Support Vector Classification

In [26]:
from sklearn.svm import LinearSVR

# set up random seed
np.random.seed(42)

# Create the data

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split the data

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2)

# Instatiate LinearSVC

clf = LinearSVR()
clf.fit(X_train, y_train)

# Evaluate the LinearSVC

clf.score(X_test, y_test)



0.19570692255217614

In [27]:
from sklearn.ensemble import RandomForestClassifier

# set up random seed
np.random.seed(42)

# Create the data

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split the data

X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.2)

# Instatiate RandomForest

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Evaluate the RandomForest Classifier

clf.score(X_test, y_test)

0.7892561983471075

***

### Tip:
* If you have structured data, use ensemble methods
* if you have unstructured data, use deep learning or transfer learning

***