In [1]:
import itertools

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
def performance_metric(y_true, y_predict):
    """ Scores predicted values against the true values.

        The metric is the one computed by `r2_score`; its result is
        returned unchanged. """
    return r2_score(y_true, y_predict)

In [3]:
from sklearn.cross_validation import ShuffleSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
def fit_model(X, y):
    """ Grid-searches the 'max_depth' of a decision tree regressor on the
        input data [X, y] and returns the best estimator found.

        Candidates are scored with `performance_metric` over ten shuffled
        splits, each holding out 20% of the rows. """

    n_samples = X.shape[0]

    # Ten random train/validation partitions with a fixed seed
    splitter = ShuffleSplit(n_samples, n_iter=10, test_size=0.20, random_state=0)

    # Depths 1 through 99 are searched.
    # NOTE(review): the exercise text asked for a range from 1 to 10 --
    # confirm that the wider range is intended.
    depth_grid = {'max_depth': range(1, 100)}

    search = GridSearchCV(
        DecisionTreeRegressor(),
        depth_grid,
        cv=splitter,
        scoring=make_scorer(performance_metric)
    )

    # Fitting runs the whole search; the winning model is exposed afterwards
    return search.fit(X, y).best_estimator_

In [4]:
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to display. Rows are drawn along the axis labelled
        'True label', columns along the axis labelled 'Predicted label'.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool
        When True every row is divided by its sum before printing and
        plotting, and cell annotations are shown with two decimals.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to `plt.imshow`.

    The figure is drawn on the current matplotlib axes; nothing is returned.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so that the heat map, the colour bar and the
    # cell annotations all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A row that sums to zero would divide by zero; keep that row at 0.
        row_totals[row_totals == 0] = 1.0
        cm = cm / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized rates are rounded for display; raw values are shown as-is.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [5]:
def calculate_confusion_matrix(y_test, y_pred):
    """Return the result of `confusion_matrix(y_test, y_pred)`."""
    matrix = confusion_matrix(y_test, y_pred)
    return matrix

In [12]:
from sklearn.ensemble import RandomForestClassifier

def implement_rfc(X_train,y_train,X_test,number_of_estimators=10,max_depth=None,
                  minimum_samples_split=2,minimum_samples_leaf=1,random_number=42):
    """
    Fit a Random Forest Classifier on the training data and
    return its predictions for X_test.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to `fit`.
    X_test : features passed to `predict`.
    number_of_estimators : forwarded as `n_estimators`.
    max_depth : forwarded as `max_depth` (None keeps the library default).
    minimum_samples_split : forwarded as `min_samples_split`.
    minimum_samples_leaf : forwarded as `min_samples_leaf`.
    random_number : forwarded as `random_state`.

    Returns
    -------
    The value returned by `clf.predict(X_test)`.
    """
    # max_depth used to be accepted but silently ignored; it is now forwarded.
    clf = RandomForestClassifier(n_estimators=number_of_estimators,
                                 max_depth=max_depth,
                                 min_samples_split=minimum_samples_split,
                                 min_samples_leaf=minimum_samples_leaf,
                                 random_state=random_number)
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

In [32]:
# Find the number of estimators (trees) that gives the lowest test error for the Random Forest Classifier
#reference - https://matthew-nm.github.io/pages/projects/gender04_content.html

def calculate_optimum_estimator_rfc(X_train,y_train,X_test,y_test,interval=5,
                                    max_estimators=800,random_number=42):
    """
    Sweep the number of trees of a Random Forest Classifier and return the
    count with the lowest error rate on the test data.

    Forest sizes 1, 1 + interval, 1 + 2*interval, ... below `max_estimators`
    are each fitted on the training data; the error rate is the fraction of
    test predictions that differ from `y_test`. The error curve is plotted on
    the current matplotlib figure and the best result is printed.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels used to measure the error.
    interval : step between successive forest sizes.
    max_estimators : exclusive upper bound of the sweep (previously fixed
        at 800).
    random_number : forwarded as `random_state` to every forest so that
        repeated runs give the same curve.

    Returns
    -------
    The forest size with the lowest error rate (the smallest such size on ties).

    Raises
    ------
    ValueError
        If `interval` and `max_estimators` produce no forest sizes to test.
    """
    nvals = list(range(1, max_estimators, interval))
    if not nvals:
        raise ValueError('No forest sizes to test: check interval and max_estimators.')

    error_rate = []
    for n_trees in nvals:
        rfc = RandomForestClassifier(n_estimators=n_trees, random_state=random_number)
        rfc.fit(X_train, y_train)
        y_pred_i = rfc.predict(X_test)
        # Fraction of misclassified test samples
        error_rate.append(np.mean(y_pred_i != y_test))

    plt.plot(nvals, error_rate, color='blue', linestyle='dashed', marker='o',
             markerfacecolor='red', markersize=10)
    plt.title('Error Rate vs. Number of Predictors')
    plt.xlabel('Number of Predictors')
    plt.ylabel('Error Rate')

    # Determine location of best performance (first minimum wins on ties)
    nloc = error_rate.index(min(error_rate))
    print('Lowest error of %s occurs at n=%s.' % (error_rate[nloc], nvals[nloc]))
    return nvals[nloc]

In [28]:
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
def implement_gbr(X_train,y_train,X_test,y_test,estimators=500,maximum_depth=4,
                  minimum_samples_split=2,learning_rate=0.01,loss='ls'):
    """
    Fit a Gradient Boosting Regressor, report its test MSE and plot
    diagnostics.

    The left panel shows the training and test loss after each boosting
    iteration; the right panel shows feature importances relative to the
    most important feature, sorted in ascending order.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.
    estimators : forwarded as `n_estimators`.
    maximum_depth : forwarded as `max_depth`.
    minimum_samples_split : forwarded as `min_samples_split`.
    learning_rate : forwarded as `learning_rate`.
    loss : forwarded as `loss`.

    Returns nothing; the MSE is printed and the figure is shown.
    """
    params = {'n_estimators': estimators, 'max_depth': maximum_depth, 'min_samples_split': minimum_samples_split,
              'learning_rate': learning_rate, 'loss': loss}
    clf_gbr = ensemble.GradientBoostingRegressor(**params)
    clf_gbr.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf_gbr.predict(X_test))
    # Parenthesised form runs on both Python 2 and Python 3
    print("MSE: %.4f" % mse)

    # Test-set loss after each boosting stage
    test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(clf_gbr.staged_predict(X_test)):
        test_score[i] = clf_gbr.loss_(y_test, y_pred)

    iterations = np.arange(params['n_estimators']) + 1

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Deviance')
    plt.plot(iterations, clf_gbr.train_score_, 'b-',
             label='Training Set Deviance')
    plt.plot(iterations, test_score, 'r-',
             label='Test Set Deviance')
    plt.legend(loc='upper right')
    plt.xlabel('Boosting Iterations')
    plt.ylabel('Deviance')

    feature_importance = clf_gbr.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    # Label every bar so the sorted bars can be traced back to a feature:
    # use the column names when X_train exposes `.columns` (e.g. a pandas
    # DataFrame), otherwise fall back to the feature index.
    columns = getattr(X_train, 'columns', None)
    if columns is not None:
        bar_labels = np.asarray(columns)[sorted_idx]
    else:
        bar_labels = sorted_idx
    plt.yticks(pos, bar_labels)
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

In [31]:
from sklearn.metrics import make_scorer,fbeta_score, accuracy_score
from sklearn import grid_search
from sklearn import linear_model
def implement_lr(X_train,y_train,X_test,y_test):
    """
    Compare a default Logistic Regression with a grid-searched one.

    A grid search over `C`, `solver` and `class_weight` is run on the
    training data, scored with F-beta (beta = 0.5). The default-parameter
    model is also fitted directly on the training data as a baseline.
    Accuracy and F-beta of both models on the test data are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels used for the report.

    Returns nothing; the scores are printed.
    """
    beta = 0.5

    clf_lr = linear_model.LogisticRegression(random_state=42)
    parameters = { 'C': [1, 10, 100, 1000,10000],'solver': ['newton-cg','lbfgs','liblinear','sag'],
              'class_weight':['balanced',None]}
    scorer = make_scorer(fbeta_score, beta=beta)
    grid_obj = grid_search.GridSearchCV(clf_lr, parameters,scoring=scorer)
    grid_fit = grid_obj.fit(X_train,y_train)
    best_clf = grid_fit.best_estimator_

    # Baseline: the estimator created above, not an unrelated global `clf`
    # (the previous code referenced an undefined name here).
    predictions = clf_lr.fit(X_train, y_train).predict(X_test)
    best_predictions = best_clf.predict(X_test)

    # Report the before-and-after scores
    # (parenthesised print runs on both Python 2 and Python 3)
    print("Unoptimized model\n------")
    print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = beta)))
    print("\nOptimized Model\n------")
    print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = beta)))

    #reference - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    
    
