In [5]:
#import matplotlib
#import numpy as np
#import pandas as pd
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline
#sns.set()

## Function Definition

In [6]:
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot the training and cross-validation learning curves of an estimator.

    A new matplotlib figure is created and, for every training-set size,
    the mean score is drawn as a line with a shaded band of +/- one
    standard deviation across the cross-validation folds. The figure is
    left open so the caller decides when to call ``plt.show()``.

    This function relies on ``np`` (numpy), ``plt`` (matplotlib.pyplot) and
    ``learning_curve`` being available in the notebook namespace.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y-values plotted. When None the axis
        limits chosen by matplotlib are kept.

    cv : integer, cross-validation generator, optional
        Forwarded unchanged to ``learning_curve``. If an integer is passed,
        it is the number of folds; None selects scikit-learn's default
        splitting strategy. Specific cross-validation objects can be
        passed as well.

    train_sizes : array-like, optional
        Forwarded to ``learning_curve``: the training-set sizes (fractions
        of the available data, or absolute counts) at which the scores are
        evaluated. Defaults to five evenly spaced fractions from 0.1 to 1.0.

    Returns
    -------
    None
    """

    plt.figure()

    # Pass the caller's ``cv`` through. It used to be hard-coded to None,
    # which silently ignored the requested cross-validation scheme.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes)

    # Scores have one row per training size and one column per fold, so
    # aggregating over axis=1 summarises the folds for each size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    # Use a real boolean: the "on"/"off" strings are not reliably
    # interpreted across matplotlib versions.
    plt.grid(True)
    if ylim is not None:
        plt.ylim(ylim)
    plt.title(title)

In [7]:
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score

def print_result(estimator, n, X1, y1, X2, y2):
    """
    Print cross-validated training scores and a hold-out test report.

    The estimator is first scored on (X1, y1) with n-fold stratified
    cross-validation, once with the estimator's default scorer and once with
    "roc_auc". It is then fitted on all of (X1, y1) and evaluated on
    (X2, y2): confusion matrix, classification report and ROC AUC.

    This function relies on ``np``, ``cross_val_score``, ``StratifiedKFold``,
    ``confusion_matrix``, ``classification_report`` and ``roc_auc_score``
    being available in the notebook namespace.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"
        Classifier to evaluate. NOTE: ``estimator.fit`` is called on the
        object itself, so the passed estimator is fitted as a side effect.

    n : integer
        Number of stratified folds used for the cross-validation.

    X1, y1 : array-like
        Training features and labels.

    X2, y2 : array-like
        Test features and labels.

    Returns
    -------
    None
        All results are written to standard output.
    """

    # Legacy ``sklearn.cross_validation`` signature: labels first, then
    # n_folds. The same fold definition is shared by both scoring runs.
    folds = StratifiedKFold(y1, n_folds=n)
    acc_scrs = cross_val_score(estimator, X1, y1, cv=folds, n_jobs=-1)
    auc_scrs = cross_val_score(estimator, X1, y1, cv=folds, n_jobs=-1, scoring="roc_auc")

    print("Accuracy on Train: %0.5f (+/- %0.5f)" % (np.mean(acc_scrs), np.std(acc_scrs)))
    print(" ROC_AUC on Train: %0.5f (+/- %0.5f)" % (np.mean(auc_scrs), np.std(auc_scrs)))

    estimator_fitted = estimator.fit(X1, y1)
    y_pred = estimator_fitted.predict(X2)

    # ROC AUC needs a continuous ranking score, not thresholded labels;
    # otherwise the test AUC is not comparable with the cross-validated
    # "roc_auc" above. Fall back to the hard predictions only when the
    # estimator exposes neither scoring method.
    if hasattr(estimator_fitted, "decision_function"):
        y_scr = estimator_fitted.decision_function(X2)
    elif hasattr(estimator_fitted, "predict_proba"):
        # Column 1 holds the positive-class probability (binary problem assumed).
        y_scr = estimator_fitted.predict_proba(X2)[:, 1]
    else:
        y_scr = y_pred

    # Single-argument print(...) calls behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print("")
    print("Confusion Matrix on Test:")
    print(confusion_matrix(y2, y_pred))
    print("")
    print("Classification Report on Test:")
    print(classification_report(y2, y_pred))
    print("ROC_AUC on Test: %0.5f" % (roc_auc_score(y2, y_scr)))

## Data Investigation

In [8]:
#df_trn = pd.read_csv("train.csv")
#df_tst = pd.read_csv("test.csv")

#trn_X = df_trn.loc[:,"coli":"colj"].drop_duplicates()
#trn_ix = trn_X.index
#df_trn = df_trn.ix[trn_ix]

#(r, c)= df_trn.shape
#index = pd.Index(range(r))
#df_trn= df_trn.set_index(index)

#cs_l = []
#for i in range(r):
#    cs_l.append("Train")

#cs_a = np.asarray(cs_l)
#cs_s = pd.DataFrame(cs_a, columns=["Case"], dtype="object")
#df_trn = pd.concat([cs_s, df_trn], axis=1)

#y = df_trn["TARGET"].values
#df_trn.drop("TARGET", axis=1, inplace=True)

#(r, c) = df_tst.shape
#cs_l = []
#for i in range(r):
#    cs_l.append("Test")

#cs_a = np.asarray(cs_l)
#cs_s = pd.DataFrame(cs_a, columns=["Case"], dtype="object")
#df_tst = pd.concat([cs_s, df_tst], axis=1)

#df = pd.concat([df_trn, df_tst], axis=0) # top/bottom

#(r, c)= df.shape
#index = pd.Index(range(r))
#df = df.set_index(index)

In [9]:
#df.info()
#df.head(n=3)
#df.describe()
#df.axes
#df.dtypes
#df.empty
#df.ndim
#(r,c) = df.shape
#df.values[i,j]

## Duplicate and NaN check

In [10]:
#if(len(df) == len(df.drop_duplicates())):
#    print "There are no duplicated rows"
#else:
#    print "There are duplicated rows !!"

In [11]:
#df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
#df.isnull().sum().sum()

## Converting the "object" to "integer" dtype  

##### Converting the columns with "object" to "category" dtype.

In [12]:
#objcols = df.select_dtypes(["object"]).columns
#for i in range(len(objcols)):
#    df[objcols[i]] = df[objcols[i]].astype("category")

##### Converting the columns with "category" to "integer" dtype.

In [13]:
#catcols = df.select_dtypes(["category"]).columns
#df[catcols] = df[catcols].apply(lambda x: x.cat.codes)

## Removing the highly correlated columns

In [14]:
#corr_mat = df.corr()

#col_names = df.columns
#ncol = len(col_names)
#null_cols = []

#for i in range(ncol-1):
#    for j in range(i+1, ncol):
#        if (abs(corr_mat.ix[i,j]) > 0.70):
#            null_cols.append(col_names[j])

#unique_cols = np.unique(null_cols)
#df.drop(unique_cols, axis=1, inplace=True)

## Visualization

In [15]:
#sns.pairplot(df, kind="scatter", diag_kind='hist')
#plt.show()

## kind: 'scatter', 'reg'
## diag_kind: 'hist', 'kde'

In [16]:
#plt.figure(figsize=(16, 12))
#sns.corrplot(df.ix[:,1:])
#plt.show()

## Preprocessing

In [17]:
#from sklearn import preprocessing

#df_X = df.loc[:, 'col1':'col2']
#ar_Xs = preprocessing.scale(df_X.values)
#df_Xs = pd.DataFrame(ar_Xs, columns=list(df_X.columns))

#dfs = df
#dfs.ix[:, 'col1':'col2'] = df_Xs

#dfs_trn = dfs[dfs["Case"]=="Train"]
#dfs_tst = dfs[dfs["Case"]=="Test"]

#ar_trn_Xs = dfs_trn.loc[:, 'col1':'col2'].values
#ar_tst_Xs = dfs_tst.loc[:, 'col1':'col2'].values

#df_trn_Xs = pd.DataFrame(ar_trn_Xs, columns=list(df_X.columns))
#df_tst_Xs = pd.DataFrame(ar_tst_Xs, columns=list(df_X.columns))

## Feature Selection: SelectFpr

In [18]:
#from sklearn.feature_selection import SelectFpr

#slct = SelectFpr(alpha = 1e-3)
#slct_trn = slct.fit_transform(df_trn_Xs, y)

#print (slct_trn.shape)

#cols2stay = slct.get_support(indices = True)
#columns = df_trn_Xs.columns

#cols2rm = []
#for i in range(len(columns)):
#    if (i not in cols2stay):
#        cols2rm.append(columns[i])

#X = df_trn_Xs.drop(cols2rm, axis=1).values
#X_val = df_tst_Xs.drop(cols2rm, axis=1).values

## Feature Selection: SelectFromModel

In [19]:
#from sklearn.feature_selection import SelectFromModel

#sfm = SelectFromModel(clf, threshold=0.25)
#slct_trn = sfm.fit_transform(df_trn_Xs, y)

#print (slct_trn.shape)

#cols2stay = sfm.get_support(indices = True)
#columns = df_trn_Xs.columns

#cols2rm = []
#for i in range(len(columns)):
#    if (i not in cols2stay):
#        cols2rm.append(columns[i])

#X = df_trn_Xs.drop(cols2rm, axis=1).values
#X_val = df_tst_Xs.drop(cols2rm, axis=1).values

## Splitting data to the training and test datasets

In [20]:
#from sklearn.cross_validation import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.xx)

## 1.  Simple ML  Procedure

In [21]:
#from sklearn.svm import LinearSVC                   # Support Vector Machines without kernels based on liblinear
#from sklearn.linear_model import LogisticRegression # Regularized Logistic Regression based on liblinear
#from sklearn.linear_model import SGDClassifier      # Regularized linear models (SVM or logistic regression) using
                                                     # a Stochastic Gradient Descent algorithm written in Cython

#from sklearn.neighbors import KNeighborsClassifier  # k-Nearest Neighbors classifier based on the ball tree
                                                     # datastructure for low dimensional data and brute force
                                                     # search for high dimensional data.

#from sklearn.naive_bayes import GaussianNB          # Gaussian Naive Bayes model. This is an unsophisticated
                                                     # model which can be trained very quickly. It is often used
                                                     # to obtain baseline results before moving to a more
                                                     # sophisticated classifier.

#from sklearn.tree import DecisionTreeClassifier     # A classifier based on a series of binary decisions. This is
                                                     # another very fast classifier, which can be very powerful.
#clf = LinearSVC()

##### 1.1. Plotting the learning curve

In [22]:
#plot_learning_curve(clf,"accuracy vs. training set size", X_train, 
#                                                          y_train,
#                                                           cv = 5,
#                                                  train_sizes = np.linspace(0.1,1.0,9))
#plt.show()

##### 1.2. Printing the results

In [23]:
#print_result(clf, 5, X_train, y_train, X_test, y_test)

## 2. More Complex ML Procedure

##### 2.1. Making a pipeline 

In [24]:
#from sklearn.pipeline import Pipeline

# With a pipeline, we can chain a sequence of functions to build a classifier, as below:
# clf_pipeline = Pipeline([('name 1', function 1), ('name 2', function 2), ('clf_nm', clf)])

##### 2.2. Making GridSearch 

In [25]:
#from sklearn.grid_search import GridSearchCV
#from sklearn.cross_validation import StratifiedKFold

#Cs = (np.linspace(1, 100, num=4)).tolist()
#gammas = np.linspace(0.0001, 0.001, num = 3).tolist()

#clf_param_grid = [{'clf__C': Cs, 'clf__kernel': ['linear']},
#                  {'clf__C': Cs, 'clf__kernel': ['rbf'], 'clf__gamma': gammas}]

#clf_grid = GridSearchCV(clf_pipeline,
#           param_grid = clf_param_grid,
#                refit = True,
#               n_jobs = -1,
#              scoring = 'accuracy',
#                   cv = StratifiedKFold(y_train, n_folds=5))

##### 2.3. Training the algorithm

In [26]:
#clf_label_detector = clf_grid.fit(X_train, y_train)

In [27]:
#for i in range(len(clf_label_detector.grid_scores_)):
#    print clf_label_detector.grid_scores_[i]

##### 2.4. Applying to the Test data

In [28]:
#print confusion_matrix(y_test, clf_label_detector.predict(X_test))

In [29]:
#print classification_report(y_test, clf_label_detector.predict(X_test)).splitlines()[0]
#print classification_report(y_test, clf_label_detector.predict(X_test)).splitlines()[-1]

##### 2.5. Saving the classifier

In [30]:
#import pickle

#with open("ClasifierName.pkl", "wb") as fout:
#    pickle.dump(clf_final_detector, fout)

##### 2.6. Loading the classifier

In [31]:
#ClasifierName_loaded = pickle.load(open("ClasifierName.pkl", "rb"))