In [1]:
import sys, os
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import cross_validation
from scipy.stats import randint as sp_randint
from sklearn import svm
from time import time
from operator import itemgetter
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split



In [2]:
trainingSetDir = "/home/kevin/LOTTERHOS_LAB/TTT_RecombinationGenomeScans/ml_project/feature_vecs"

In [3]:
classifierPickleFileName = "clfAcrossGenomeSMOTE1000.p"
statsToUse = "all"
classList = []
trainingData = []
labelToClassName = {}
headerH = {}

for trainingSetFileName in os.listdir(trainingSetDir):
    classList.append(trainingSetFileName.split(".fvec")[0])
    trainingSetFile = open(trainingSetDir + "/" + trainingSetFileName)
    currTrainingData = trainingSetFile.readlines()
    trainingSetFile.close()

    trainingData += currTrainingData[1:]#append all training data from the current set (minus the header)

    currLabelH = {}
    for example in currTrainingData[1:]:
        currLabelH[example.split("\t")[0]] = 1
    assert len(currLabelH) == 1
    labelToClassName[currLabelH.keys()[0]] = trainingSetFileName.split(".fvec")[0]
    
    header = currTrainingData[0].strip().split("\t")
    headerH[currTrainingData[0].strip()] = 1
    assert header[0] == "classLabel"
    statIndices = []
    if "all" in statsToUse:
        statIndices = range(1, len(header))
    else:
        for i in range(1, len(header)):
            if header[i] in statsToUse or header[i].split("_win")[0] in statsToUse:
                statIndices.append(i)
assert len(headerH) == 1

sys.stderr.write("using these features: %s (indices: %s)\n" %(str(statsToUse), str(statIndices)))
XH = {}
for i in range(len(trainingData)):
    trainingData[i] = trainingData[i].strip().split("\t")
    currVector = []
    if not "nan" in trainingData[i]:
        for j in statIndices:
            currVector.append(float(trainingData[i][j]))
        assert len(currVector) == len(statIndices)
        if not XH.has_key(trainingData[i][0]):
            XH[trainingData[i][0]] = []
        XH[trainingData[i][0]].append(currVector)

using these features: all (indices: [1, 2, 3, 4, 5, 6, 7, 8, 9])


In [7]:
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
#balance the training set
# minClassSize = min([len(XH[classLabel]) for classLabel in  XH.keys()])
X = []
y = []
for classLabel in sorted(XH.keys()):
    random.shuffle(XH[classLabel])
    for i in range(1000):
        try:
            currVector = XH[classLabel][i]
        except IndexError:
            break
        X.append(currVector)
        y.append(classLabel)
        
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sys.stderr.write("Training set size after split: %s\n" %(len(y_train)))
sys.stderr.write("Testing set size: %s\n" %len(y_test))

# SMOTE:
X_resampled, y_resampled = SMOTE().fit_sample(X_train, y_train)
sys.stderr.write("training set size after balancing: %s\n" %(len(y_resampled)))

Training set size after split: 4079
Testing set size: 1360
training set size after balancing: 5257


In [17]:
for classLabel in XH.keys():
    print(classLabel)

MT=neut_R=invers
MT=neut_R=BS
MT=sweep_R=neutral
MT=neut_R=lowRC
MT=QTN_R=neutral
MT=delet_R=BS
MT=neut_R=neutral


In [5]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [8]:
sys.stderr.write("Checking accuracy when distinguishing among all %s classes\n" %(len(XH.keys())))

maxMaxFeatures = len(X[0])
param_grid_forest = {"max_depth": [3, 10, None],
              "max_features": [1, 3, int(maxMaxFeatures**0.5), maxMaxFeatures],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

clf, mlType, paramGrid = ExtraTreesClassifier(n_estimators=100), "extraTreesClassifier", param_grid_forest

heatmap = []
sys.stderr.write("Training %s\n" %(mlType))
grid_search = GridSearchCV(clf,param_grid=param_grid_forest,cv=10,n_jobs=-1)
start = time()
grid_search.fit(X_resampled, y_resampled)
sys.stderr.write("GridSearchCV took %.2f seconds for %d candidate parameter settings.\n"
      % (time() - start, len(grid_search.grid_scores_)))
print "Results for %s" %(mlType)
report(grid_search.grid_scores_)
joblib.dump((X_test, y_test, grid_search), classifierPickleFileName)

Checking accuracy when distinguishing among all 7 classes
Training extraTreesClassifier
GridSearchCV took 662.44 seconds for 432 candidate parameter settings.


Results for extraTreesClassifier
Model with rank: 1
Mean validation score: 0.751 (std: 0.019)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 3, 'criterion': 'entropy', 'max_features': 3, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.748 (std: 0.015)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 3, 'criterion': 'gini', 'max_features': 3, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.747 (std: 0.020)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 9, 'max_depth': None}



['clfAcrossGenomeSMOTE1000.p']