In [None]:
# Project 2: Supervised Learning
## Building a Student Intervention System

### Install

This project requires **Python 2.7** and the following Python libraries installed:

- [NumPy](http://www.numpy.org/)
- [Pandas](http://pandas.pydata.org)
- [scikit-learn](http://scikit-learn.org/stable/)

You will also need to have software installed to run and execute an [iPython Notebook](http://ipython.org/notebook.html)

Udacity recommends our students install [Anaconda](https://www.continuum.io/downloads), a pre-packaged Python distribution that contains all of the necessary libraries and software for this project. 

### Code

Template code is provided in the notebook `student_intervention.ipynb` notebook file. While some code has already been implemented to get you started, you will need to implement additional functionality when requested to successfully complete the project.

### Run

In a terminal or command window, navigate to the top-level project directory `student_intervention/` (that contains this README) and run one of the following commands:

```ipython notebook student_intervention.ipynb```  
```jupyter notebook student_intervention.ipynb```

This will open the iPython Notebook software and project file in your browser.

## Data

The dataset used in this project is included as `student-data.csv`. This dataset has the following attributes:

- `school` : student's school (binary: "GP" or "MS")
- `sex` : student's sex (binary: "F" - female or "M" - male)
- `age` : student's age (numeric: from 15 to 22)
- `address` : student's home address type (binary: "U" - urban or "R" - rural)
- `famsize` : family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
- `Pstatus` : parent's cohabitation status (binary: "T" - living together or "A" - apart)
- `Medu` : mother's education (numeric: 0 - none,  1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education)
- `Fedu` : father's education (numeric: 0 - none,  1 - primary education (4th grade), 2 - 5th to 9th grade, 3 - secondary education or 4 - higher education)
- `Mjob` : mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
- `Fjob` : father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
- `reason` : reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
- `guardian` : student's guardian (nominal: "mother", "father" or "other")
- `traveltime` : home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
- `studytime` : weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
- `failures` : number of past class failures (numeric: n if 1<=n<3, else 4)
- `schoolsup` : extra educational support (binary: yes or no)
- `famsup` : family educational support (binary: yes or no)
- `paid` : extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
- `activities` : extra-curricular activities (binary: yes or no)
- `nursery` : attended nursery school (binary: yes or no)
- `higher` : wants to take higher education (binary: yes or no)
- `internet` : Internet access at home (binary: yes or no)
- `romantic` : with a romantic relationship (binary: yes or no)
- `famrel` : quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
- `freetime` : free time after school (numeric: from 1 - very low to 5 - very high)
- `goout` : going out with friends (numeric: from 1 - very low to 5 - very high)
- `Dalc` : workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
- `Walc` : weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
- `health` : current health status (numeric: from 1 - very bad to 5 - very good)
- `absences` : number of school absences (numeric: from 0 to 93)
- `passed` : did the student pass the final exam (binary: yes or no)


In [2]:
import os
os.getcwd()

'E:\\Study\\Data Analytics\\_Courses\\Udacity-MLND\\machine-learning-master-finished\\projects\\student_intervention'

In [3]:
# Import libraries
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score

# Read student data
student_data = pd.read_csv("student-data.csv")
print ("Student data read successfully!")

Student data read successfully!


In [14]:
# TODO: Calculate number of students
n_students = len(student_data)
n_students = student_data.shape[0]

# TODO: Calculate number of features
n_features = len(student_data.columns)-1
n_features = student_data.shape[1]-1

# TODO: Calculate passing students
n_passed = len(student_data.loc[student_data['passed'] == 'yes'])
n_passed = len(student_data[['passed']].query('passed=="yes"'))
n_passed = len(student_data[student_data.passed=='yes'])

# TODO: Calculate failing students
n_failed = len(student_data.loc[student_data['passed'] == 'no'])
n_failed = len(student_data[['passed']].query('passed=="no"'))
n_failed = len(student_data[student_data.passed=='no'])

# TODO: Calculate graduation rate
grad_rate = 0.0
grad_rate = np.divide(float(n_passed),n_students)*100


print n_passed
print n_students
print np.divide(n_passed,n_students)

# Print the results
print ("Total number of students: {}".format(n_students))
print ("Number of features: {}".format(n_features))
print ("Number of students who passed: {}".format(n_passed))
print ("Number of students who failed: {}".format(n_failed))
print ("Graduation rate of the class: {:.2f}%".format(grad_rate))

265
395
0
Total number of students: 395
Number of features: 30
Number of students who passed: 265
Number of students who failed: 130
Graduation rate of the class: 67.09%


In [4]:
# Extract feature columns
feature_cols = list(student_data.columns[:-1])

# Extract target column 'passed'
target_col = student_data.columns[-1] 

# Show the list of columns
print ("Feature columns:\n{}".format(feature_cols))
print ("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X_all = student_data[feature_cols]
y_all = student_data[target_col]

# Show the feature information by printing the first five rows
print ("\nFeature values:")
print (X_all.head())

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed

Feature values:
  school sex  age address famsize Pstatus  Medu  Fedu     Mjob      Fjob  \
0     GP   F   18       U     GT3       A     4     4  at_home   teacher   
1     GP   F   17       U     GT3       T     1     1  at_home     other   
2     GP   F   15       U     LE3       T     1     1  at_home     other   
3     GP   F   15       U     GT3       T     4     2   health  services   
4     GP   F   16       U     GT3       T     3     3    other     other   

    ...    higher internet  romantic  famrel  freetime goout Dalc Walc health  \
0   ...       yes       no        no       4         3     4    1    1      3   
1   ...       

In [5]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.iteritems():
        
        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_all = preprocess_features(X_all)
print ("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))

Processed feature columns (48 total features):
['school_GP', 'school_MS', 'sex_F', 'sex_M', 'age', 'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A', 'Pstatus_T', 'Medu', 'Fedu', 'Mjob_at_home', 'Mjob_health', 'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health', 'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course', 'reason_home', 'reason_other', 'reason_reputation', 'guardian_father', 'guardian_mother', 'guardian_other', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']


In [6]:
# TODO: Import any additional functionality you may need here
from sklearn import cross_validation

# TODO: Set the number of training points
num_train = 300

# Set the number of testing points
num_test = X_all.shape[0] - num_train

# TODO: Shuffle and split the dataset into the number of training and testing points above
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_all, y_all, test_size=num_test, random_state=0)

#X_train = None
#X_test = None
#y_train = None
#y_test = None

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 300 samples.
Testing set has 95 samples.


In [24]:
print (X_train.head())

     school_GP  school_MS  sex_F  sex_M  age  address_R  address_U  \
63         1.0        0.0    1.0    0.0   16        0.0        1.0   
245        1.0        0.0    0.0    1.0   16        0.0        1.0   
154        1.0        0.0    1.0    0.0   17        0.0        1.0   
311        1.0        0.0    1.0    0.0   19        0.0        1.0   
81         1.0        0.0    0.0    1.0   15        0.0        1.0   

     famsize_GT3  famsize_LE3  Pstatus_A    ...     higher  internet  \
63           1.0          0.0        0.0    ...          1         1   
245          1.0          0.0        0.0    ...          1         1   
154          1.0          0.0        0.0    ...          1         0   
311          1.0          0.0        0.0    ...          0         1   
81           1.0          0.0        0.0    ...          1         1   

     romantic  famrel  freetime  goout  Dalc  Walc  health  absences  
63          0       3         4      4     2     4       4         2  
245 

In [25]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print ("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print ("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    print ("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print ("F1 score for test set: {:.4f}.".format(predict_labels(clf, X_test, y_test)))

In [8]:
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    
    # Print the results
    print ("Trained model in {:.4f} seconds".format(end - start))

    
def predict_labels(clf, features, target):
    ''' Makes predictions using a fit classifier based on F1 score. '''
    
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    
    # Print and return results
    print ("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target.values, y_pred, pos_label='yes')


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    #print ("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    #print ("F1 score for training set: {:.4f}.".format(predict_labels(clf, X_train, y_train)))
    print ("{} F1 score for test set: {:.4f}.".format(clf.__class__.__name__,predict_labels(clf, X_test, y_test)))

In [9]:
X_train_100, X_test_100, y_train_100, y_test_100 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-100, random_state=0)
#X_train_100 = None
#y_train_100 = None
print ("Training set has {} samples.".format(X_train_100.shape[0]))
print ("Testing set has {} samples.".format(X_test_100.shape[0]))

X_train_200, X_test_200, y_train_200, y_test_200 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-200, random_state=0)
#X_train_200 = None
#y_train_200 = None
print ("Training set has {} samples.".format(X_train_200.shape[0]))
print ("Testing set has {} samples.".format(X_test_200.shape[0]))

X_train_300, X_test_300, y_train_300, y_test_300 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-300, random_state=0)
#X_train_300 = None
#y_train_300 = None
print ("Training set has {} samples.".format(X_train_300.shape[0]))
print ("Testing set has {} samples.".format(X_test_300.shape[0]))

Training set has 100 samples.
Testing set has 295 samples.
Training set has 200 samples.
Testing set has 195 samples.
Training set has 300 samples.
Testing set has 95 samples.


In [107]:
X_train_100 = X_train[0:100]
y_train_100 = y_train[0:100]
print ("Training set has {} samples.".format(X_train_100.shape[0]))
print ("Testing set has {} samples.".format(y_train_100.shape[0]))
X_train_200 = X_train[0:200]
y_train_200 = y_train[0:200]
print ("Training set has {} samples.".format(X_train_200.shape[0]))
print ("Testing set has {} samples.".format(y_train_200.shape[0]))
X_train_300 = X_train
y_train_300 = y_train
print ("Training set has {} samples.".format(X_train_300.shape[0]))
print ("Testing set has {} samples.".format(y_train_300.shape[0]))

Training set has 100 samples.
Testing set has 100 samples.
Training set has 200 samples.
Testing set has 200 samples.
Training set has 300 samples.
Testing set has 300 samples.


In [108]:
# TODO: Import the three supervised learning models from sklearn
from sklearn import tree
from sklearn import svm
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.neural_network import MLPClassifier

# TODO: Initialize the three models

clf_A = tree.DecisionTreeClassifier()
clf_B = linear_model.LogisticRegression()
clf_C = svm.SVC()
clf_D = naive_bayes.GaussianNB() 
clf_E = neighbors.KNeighborsClassifier()
clf_F = linear_model.SGDClassifier()
clf_G1 = BaggingClassifier(KNeighborsClassifier())
clf_G2 = BaggingClassifier(linear_model.SGDClassifier())
clf_H = RandomForestClassifier()
clf_I = ExtraTreesClassifier()
clf_J = GradientBoostingClassifier()
clf_K = AdaBoostClassifier()
#clf_L = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

# TODO: Set up the training set sizes
#X_train_100, X_test_100, y_train_100, y_test_100 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-100, random_state=0)
#X_train_100 = None
#y_train_100 = None

#X_train_200, X_test_200, y_train_200, y_test_200 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-200, random_state=0)
#X_train_200 = None
#y_train_200 = None

#X_train_300, X_test_300, y_train_300, y_test_300 = cross_validation.train_test_split(X_all, y_all, test_size=X_all.shape[0]-300, random_state=0)
#X_train_300 = None
#y_train_300 = None

# TODO: Execute the 'train_predict' function for each classifier and each training set size
# train_predict(clf, X_train, y_train, X_test, y_test)
train_predict(clf_A, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_B, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_C, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_D, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_E, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_F, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_G1, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_G2, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_H, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_I, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_J, X_train_300, y_train_300, X_test, y_test)

train_predict(clf_K, X_train_300, y_train_300, X_test, y_test)


Training a DecisionTreeClassifier using a training set size of 300. . .
Trained model in 0.0030 seconds
Made predictions in 0.0010 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7119.
Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0050 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8381.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7910.
Training a SVC using a training set size of 300. . .
Trained model in 0.0110 seconds
Made predictions in 0.0090 seconds.
F1 score for training set: 0.8692.
Made predictions in 0.0030 seconds.
F1 score for test set: 0.7586.
Training a GaussianNB using a training set size of 300. . .
Trained model in 0.0020 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8088.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7500.
Training a KNeighborsClassifier using a training set size of 300. .

In [25]:
# TODO: Import the three supervised learning models from sklearn
from sklearn import tree
from sklearn import svm
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.neural_network import MLPClassifier

# TODO: Initialize the three models

clf_A = tree.DecisionTreeClassifier(criterion='entropy',random_state=0)
clf_B = linear_model.LogisticRegression(C=1e5)
clf_C = svm.SVC(kernel='rbf')
clf_D = naive_bayes.GaussianNB() 
clf_E = neighbors.KNeighborsClassifier()
clf_F = linear_model.SGDClassifier()
clf_G1 = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
clf_G2 = BaggingClassifier(linear_model.SGDClassifier(), max_samples=0.5, max_features=0.5)
clf_H = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0)
clf_I = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=1, random_state=0)
clf_J = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf_K = AdaBoostClassifier(n_estimators=100)
#clf_L = MLPClassifier(algorithm='l-bfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

# TODO: Set up the training set sizes
X_train_100 = X_train[0:100]
y_train_100 = y_train[0:100]

X_train_200 = X_train[0:200]
y_train_200 = y_train[0:200]

X_train_300 = X_train
y_train_300 = y_train

# TODO: Execute the 'train_predict' function for each classifier and each training set size
# train_predict(clf, X_train, y_train, X_test, y_test)
train_predict(clf_A, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_A, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_A, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_B, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_B, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_B, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_C, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_C, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_C, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_D, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_D, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_D, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_E, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_E, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_E, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_F, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_F, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_F, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_G1, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_G1, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_G1, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_G2, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_G2, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_G2, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_H, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_H, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_H, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_I, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_I, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_I, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_J, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_J, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_J, X_train_300, y_train_300, X_test_300, y_test_300)

train_predict(clf_K, X_train_100, y_train_100, X_test_100, y_test_100)
train_predict(clf_K, X_train_200, y_train_200, X_test_200, y_test_200)
train_predict(clf_K, X_train_300, y_train_300, X_test_300, y_test_300)

#train_predict(clf_L, X_train_100, y_train_100, X_test_100, y_test_100)
#train_predict(clf_L, X_train_200, y_train_200, X_test_200, y_test_200)
#train_predict(clf_L, X_train_300, y_train_300, X_test_300, y_test_300)

Trained model in 0.0040 seconds
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7692.
Trained model in 0.0040 seconds
Made predictions in 0.0010 seconds.
F1 score for test set: 0.6718.
Trained model in 0.0020 seconds
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7273.
Trained model in 0.0080 seconds
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7647.
Trained model in 0.0080 seconds
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7549.
Trained model in 0.0160 seconds
Made predictions in 0.0010 seconds.
F1 score for test set: 0.6176.
Trained model in 0.0100 seconds
Made predictions in 0.0030 seconds.
F1 score for test set: 0.7586.
Trained model in 0.0060 seconds
Made predictions in 0.0030 seconds.
F1 score for test set: 0.7781.
Trained model in 0.0010 seconds
Made predictions in 0.0020 seconds.
F1 score for test set: 0.7824.
Trained model in 0.0020 seconds
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7500.
Trained mo

In [26]:
#tuning decisiontree
clf_A = tree.DecisionTreeClassifier(criterion='gini',max_leaf_nodes=4, max_depth=1, random_state=0)
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a DecisionTreeClassifier using a training set size of 300. . .
Trained model in 0.0030 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8295.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7852.


In [27]:
clf_A = tree.DecisionTreeClassifier(criterion='entropy',max_leaf_nodes=3, max_depth=2, random_state=0)
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a DecisionTreeClassifier using a training set size of 300. . .
Trained model in 0.0020 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8270.
Made predictions in 0.0000 seconds.
F1 score for test set: 0.7971.


In [28]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
parameters = {'criterion': ['gini','entropy']}

# TODO: Initialize the classifier
clf = tree.DecisionTreeClassifier(random_state=0)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'criterion' is {} for the optimal model.".format(clf.get_params()['criterion']))

Made predictions in 0.0010 seconds.
Tuned model has a training F1 score of 1.0000.
Made predictions in 0.0000 seconds.
Tuned model has a testing F1 score of 0.7692.
Parameter 'criterion' is entropy for the optimal model.


In [29]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
parameters = {'max_depth': np.arange(1, 21)}

# TODO: Initialize the classifier
clf = tree.DecisionTreeClassifier(criterion='entropy',random_state=0)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")
#f1_score(target.values, y_pred, pos_label='yes')
#Create the F1 scoring function using make_scorer and store it in f1_scorer.
#Set the pos_label parameter to the correct value!
#clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

#print ("Parameter 'max_depth' is {} for the optimal model.".format(clf.get_params()['max_depth']))

Made predictions in 0.0000 seconds.
Tuned model has a training F1 score of 0.8326.
Made predictions in 0.0000 seconds.
Tuned model has a testing F1 score of 0.7941.


In [31]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
parameters = {'max_leaf_nodes': np.arange(2, 21)}

# TODO: Initialize the classifier
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=2, random_state=0)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'max_leaf_nodes' is {} for the optimal model.".format(clf.get_params()['max_leaf_nodes']))

Made predictions in 0.0000 seconds.
Tuned model has a training F1 score of 0.8270.
Made predictions in 0.0000 seconds.
Tuned model has a testing F1 score of 0.7971.
Parameter 'max_leaf_nodes' is 3 for the optimal model.


In [109]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
param_range= np.arange(1, 21)
parameters = [{'criterion': ['gini'], 'max_depth': param_range, 'max_leaf_nodes': param_range[1:]},
              {'criterion': ['entropy'], 'max_depth': param_range, 'max_leaf_nodes': param_range[1:]},
             ]

# TODO: Initialize the classifier
clf = tree.DecisionTreeClassifier(random_state=0)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'criterion' is {} for the optimal model.".format(clf.get_params()['criterion']))
print (grid_obj.best_params_)

Made predictions in 0.0000 seconds.
Tuned model has a training F1 score of 0.8295.
Made predictions in 0.0000 seconds.
Tuned model has a testing F1 score of 0.7852.
Parameter 'criterion' is gini for the optimal model.
{'max_leaf_nodes': 4, 'criterion': 'gini', 'max_depth': 1}


In [41]:
#tuning SVM
clf_A = svm.SVC(kernel='rbf', C=1, gamma=0.1, decision_function_shape='ovo')
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a SVC using a training set size of 300. . .
Trained model in 0.0150 seconds
Made predictions in 0.0090 seconds.
F1 score for training set: 0.9717.
Made predictions in 0.0030 seconds.
F1 score for test set: 0.7919.


In [39]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
param_range= [1e-2,1e-1,1,1e1,1e2]
degree_range= [2,3,4]
parameters = [{'kernel': ['linear'],'C':param_range, 'decision_function_shape':['ovo','ovr','None']},
              {'kernel': ['rbf'],'C':param_range,'gamma':param_range, 'decision_function_shape':['ovo','ovr','None']},
              {'kernel': ['sigmoid'],'C':param_range,'gamma':param_range, 'decision_function_shape':['ovo','ovr','None']},
              {'kernel': ['poly'],'degree':degree_range,'gamma':param_range, 'decision_function_shape':['ovo','ovr','None']}
             ]

# TODO: Initialize the classifier
clf = svm.SVC(random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'kernel' is {} for the optimal model.".format(clf.get_params()['kernel']))
print (grid_obj.best_params_)

Made predictions in 0.0100 seconds.
Tuned model has a training F1 score of 0.9717.
Made predictions in 0.0030 seconds.
Tuned model has a testing F1 score of 0.7919.
Parameter 'kernel' is rbf for the optimal model.
{'kernel': 'rbf', 'C': 1, 'decision_function_shape': 'ovo', 'gamma': 0.1}


In [111]:
#tuning LogisticRegression
clf_A = linear_model.LogisticRegression(penalty='l2', C=0.1, tol=1e-5, solver='liblinear')
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a LogisticRegression using a training set size of 300. . .
Trained model in 0.0050 seconds
Made predictions in 0.0000 seconds.
F1 score for training set: 0.8326.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7857.


In [110]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
param_range= [1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5]
parameters = [{'penalty': ['l1'],'C':param_range, 'tol':param_range},
              {'penalty': ['l2'],'C':param_range, 'tol':param_range},
             ]

# TODO: Initialize the classifier
clf = linear_model.LogisticRegression(random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'penalty' is {} for the optimal model.".format(clf.get_params()['penalty']))
print (grid_obj.best_params_)

Made predictions in 0.0120 seconds.
Tuned model has a training F1 score of 0.8267.
Made predictions in 0.0140 seconds.
Tuned model has a testing F1 score of 0.7917.
Parameter 'penalty' is l1 for the optimal model.
{'penalty': 'l1', 'C': 0.1, 'tol': 1e-05}


In [118]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
classifier = linear_model.LogisticRegression(penalty='l2', C=0.1, tol=1e-5, solver='liblinear')
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute Precision-Recall and plot curve
precision = dict()
recall = dict()
average_precision = dict()
precision, recall, _ = precision_recall_curve(y_test,y_score, pos_label="yes")
average_precision = average_precision_score(y_test, y_score)


# Compute micro-average ROC curve and ROC area
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(),y_score.ravel())
average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")

# Plot Precision-Recall curve
plt.clf()
plt.plot(recall[0], precision[0], label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall example: AUC={0:0.2f}'.format(average_precision[0]))
plt.legend(loc="lower left")
plt.show()


ValueError: Data is not binary and pos_label is not specified

In [58]:
#tuning SGDClassifier
clf_A = linear_model.SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=100)
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a SGDClassifier using a training set size of 300. . .
Trained model in 0.0220 seconds
Made predictions in 0.0010 seconds.
F1 score for training set: 0.8194.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7826.


In [57]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
param_range= [1e-5,1e-4,1e-3,1e-2,1e-1,1,1e1,1e2,1e3,1e4,1e5]
parameters = [{'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 'penalty': ['l1','l2','elasticnet'],'alpha':param_range, 'n_iter':param_range[5:8]}
             ]

# TODO: Initialize the classifier
clf = linear_model.SGDClassifier(random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'loss' is {} for the optimal model.".format(clf.get_params()['loss']))
print (grid_obj.best_params_)

Made predictions in 0.0000 seconds.
Tuned model has a training F1 score of 0.8269.
Made predictions in 0.0010 seconds.
Tuned model has a testing F1 score of 0.7838.
Parameter 'loss' is hinge for the optimal model.
{'penalty': 'l1', 'alpha': 0.01, 'n_iter': 100.0, 'loss': 'hinge'}


In [61]:
#tuning KNN
clf_A = neighbors.KNeighborsClassifier(n_neighbors=8, weights='distance', algorithm='brute')
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a KNeighborsClassifier using a training set size of 300. . .
Trained model in 0.0010 seconds
Made predictions in 0.0050 seconds.
F1 score for training set: 1.0000.
Made predictions in 0.0020 seconds.
F1 score for test set: 0.7482.


In [60]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
neighbors_range= [1,2,3,4,5,6,7,8,9,10]
parameters = [{'n_neighbors':neighbors_range, 'weights': ['uniform','distance'],'algorithm':['auto','ball_tree','kd_tree','brute']}
             ]

# TODO: Initialize the classifier
clf = neighbors.KNeighborsClassifier()

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_neighbors' is {} for the optimal model.".format(clf.get_params()['n_neighbors']))
print (grid_obj.best_params_)

Made predictions in 0.0050 seconds.
Tuned model has a training F1 score of 1.0000.
Made predictions in 0.0030 seconds.
Tuned model has a testing F1 score of 0.7482.
Parameter 'n_neighbors' is 8 for the optimal model.
{'n_neighbors': 8, 'weights': 'distance', 'algorithm': 'brute'}


In [74]:
#tuning BaggingClassifier
clf_A = BaggingClassifier(tree.DecisionTreeClassifier(), n_estimators=10, max_samples=0.5, max_features=0.5)
train_predict(clf_A, X_train, y_train, X_test, y_test)

Training a BaggingClassifier using a training set size of 300. . .
Trained model in 0.0480 seconds
Made predictions in 0.0030 seconds.
F1 score for training set: 0.9440.
Made predictions in 0.0010 seconds.
F1 score for test set: 0.7597.


In [96]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
param_range= [0.5,1.0]
parameters = [{'n_estimators': estimators_range,'max_samples':param_range,'max_features':param_range,'bootstrap':['True','False']}
             ]

# TODO: Initialize the classifier
clf = BaggingClassifier(tree.DecisionTreeClassifier(),random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=2)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.0070 seconds.
Tuned model has a training F1 score of 0.9785.
Made predictions in 0.0050 seconds.
Tuned model has a testing F1 score of 0.8148.
Parameter 'n_estimators' is 50 for the optimal model.
{'max_features': 1.0, 'max_samples': 0.5, 'bootstrap': 'True', 'n_estimators': 50}


In [78]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
param_range= [0.5, 1.0]
parameters = [{'n_estimators': estimators_range,'max_samples':param_range,'max_features':param_range,'bootstrap':['True','False']}
             ]

# TODO: Initialize the classifier
clf = BaggingClassifier(neighbors.KNeighborsClassifier(),random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.1550 seconds.
Tuned model has a training F1 score of 0.8391.
Made predictions in 0.0590 seconds.
Tuned model has a testing F1 score of 0.7867.
Parameter 'n_estimators' is 50 for the optimal model.
{'max_features': 0.5, 'max_samples': 0.5, 'bootstrap': 'True', 'n_estimators': 50}


In [82]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
param_range= [0.5, 1.0]
parameters = [{'n_estimators': estimators_range,'max_samples':param_range,'max_features':param_range,'bootstrap':['True','False']}
             ]

# TODO: Initialize the classifier
clf = BaggingClassifier(linear_model.SGDClassifier(loss='hinge', penalty='l1', alpha=0.01, n_iter=100),random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.0230 seconds.
Tuned model has a training F1 score of 0.8209.
Made predictions in 0.0070 seconds.
Tuned model has a testing F1 score of 0.7867.
Parameter 'n_estimators' is 50 for the optimal model.
{'max_features': 0.5, 'max_samples': 0.5, 'bootstrap': 'True', 'n_estimators': 50}


In [83]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
param_range= [0.5, 1.0]
parameters = [{'n_estimators': estimators_range,'max_samples':param_range,'max_features':param_range,'bootstrap':['True','False']}
             ]

# TODO: Initialize the classifier
clf = BaggingClassifier(linear_model.LogisticRegression(penalty='l1', C=0.1, tol=1e-5, solver='liblinear'),random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.0010 seconds.
Tuned model has a training F1 score of 0.8313.
Made predictions in 0.0010 seconds.
Tuned model has a testing F1 score of 0.7945.
Parameter 'n_estimators' is 5 for the optimal model.
{'max_features': 1.0, 'max_samples': 1.0, 'bootstrap': 'True', 'n_estimators': 5}


In [84]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
param_range= [0.5, 1.0]
parameters = [{'n_estimators': estimators_range,'max_samples':param_range,'max_features':param_range,'bootstrap':['True','False']}
             ]

# TODO: Initialize the classifier
clf = BaggingClassifier(svm.SVC(kernel='rbf', C=1, gamma=0.1, decision_function_shape='ovo'),random_state=1)

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=-1)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.0450 seconds.
Tuned model has a training F1 score of 0.9035.
Made predictions in 0.0150 seconds.
Tuned model has a testing F1 score of 0.7919.
Parameter 'n_estimators' is 10 for the optimal model.
{'max_features': 0.5, 'max_samples': 1.0, 'bootstrap': 'True', 'n_estimators': 10}


In [92]:
# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV

# TODO: Create the parameters list you wish to tune
estimators_range= [5,10,50]
parameters = [{'n_estimators': estimators_range,'criterion':['gini','entropy'],'max_depth':np.arange(1, 11),'max_leaf_nodes':np.arange(2, 11)}
             ]

# TODO: Initialize the classifier
clf = RandomForestClassifier()

# TODO: Make an f1 scoring function using 'make_scorer' 
from sklearn.metrics import f1_score
f1_scorer = make_scorer(f1_score, pos_label="yes")

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(estimator=clf, param_grid=parameters, scoring=f1_scorer, cv=10, n_jobs=2)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train, y_train)

# Get the estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print ("Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf, X_train, y_train)))
print ("Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf, X_test, y_test)))

print ("Parameter 'n_estimators' is {} for the optimal model.".format(clf.get_params()['n_estimators']))
print (grid_obj.best_params_)

Made predictions in 0.0010 seconds.
Tuned model has a training F1 score of 0.8340.
Made predictions in 0.0010 seconds.
Tuned model has a testing F1 score of 0.7785.
Parameter 'n_estimators' is 10 for the optimal model.
{'n_estimators': 10, 'max_leaf_nodes': 6, 'criterion': 'entropy', 'max_depth': 4}
