In [1]:
import sys, os, random
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import model_selection, preprocessing
from scipy.stats import randint as sp_randint
from sklearn import svm
from time import time
from operator import itemgetter
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [2]:
trainingSetDir = "/media/kevin/TOSHIBA_EXT/TTT_RecombinationGenomeScans/ml_project/feature_vecs_all_SK-A"
classifierPickleFileName = "all_sk-a_stats.p"
statsToUse = "all"
print(os.listdir(trainingSetDir))

['MT-delet_R-BS.fvec', 'MT-lgQTN_R-lgQTNlink.fvec', 'MT-neut_R-BS.fvec', 'MT-neut_R-FarSS.fvec', 'MT-neut_R-invers.fvec', 'MT-neut_R-lgQTNlink.fvec', 'MT-neut_R-lowRC.fvec', 'MT-neut_R-NearSS.fvec', 'MT-neut_R-neutral.fvec', 'MT-neut_R-smQTNlink.fvec', 'MT-smQTN_R-smQTNlink.fvec', 'MT-sweep_R-NearSS.fvec']


In [3]:
classList = []
trainingData = []
labelToClassName = {}
headerH = {}

for trainingSetFileName in os.listdir(trainingSetDir):
    classList.append(trainingSetFileName.split(".fvec")[0])
    trainingSetFile = open(trainingSetDir + "/" + trainingSetFileName)
    currTrainingData = trainingSetFile.readlines()
    trainingSetFile.close()

    trainingData += currTrainingData[1:]#append all training data from the current set (minus the header)

    currLabelH = {}
    for example in currTrainingData[1:]:
        currLabelH[example.split("\t")[0]] = 1
    assert len(currLabelH) == 1, "Length: %d  label: %s" %(len(currLabelH), currLabelH)
    labelToClassName[list(currLabelH.keys())[0]] = trainingSetFileName.split(".fvec")[0]
    
    header = currTrainingData[0].strip().split("\t")
    headerH[currTrainingData[0].strip()] = 1
    assert header[0] == "classLabel"
    statIndices = []
    if "all" in statsToUse:
        statIndices = range(1, len(header))
    else:
        for i in range(1, len(header)):
            if header[i] in statsToUse or header[i].split("_win")[0] in statsToUse:
                statIndices.append(i)
assert len(headerH) == 1

sys.stderr.write("using these features: %s (indices: %s)\n" %(str(statsToUse), str(statIndices)))
XH = {}
for i in range(len(trainingData)):
    trainingData[i] = trainingData[i].strip().split("\t")
    currVector = []
    if not "nan" in trainingData[i]:
        for j in statIndices:
            try:
                currVector.append(float(trainingData[i][j]))
            except:
                print("Invalid data at coordinates: " + str(i) + "," + str(j))
                surr = trainingData[i-1 : i + 5]
                for site in surr: print(site)
                print(trainingData[i])
                print(header[j])
        assert len(currVector) == len(statIndices), \
        "length of current vector: %s doesn't match length of stat indices: %s" %(len(currVector), len(statIndices))
        if trainingData[i][0] not in XH:
            XH[trainingData[i][0]] = []
        XH[trainingData[i][0]].append(currVector)

using these features: all (indices: range(1, 23))


In [4]:
X = []
y = []
for classLabel in sorted(XH.keys()):
    print('{:24} : {:>6}'.format(classLabel, str(len(XH[classLabel]))))
    random.shuffle(XH[classLabel])
    for i in range(1000):
        try:
            currVector = XH[classLabel][i]
        except IndexError:
            break
        X.append(currVector)
        y.append(classLabel)
        
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sys.stderr.write("Training set size after split: %s\n" %(len(y_train)))
sys.stderr.write("Testing set size: %s\n" %len(y_test))


MT=delet_R=BS            :    379
MT=lgQTN_R=lgQTNlink     :     86
MT=neut_R=BS             :  31197
MT=neut_R=FarSS          :    585
MT=neut_R=NearSS         :    983
MT=neut_R=invers         :  11920
MT=neut_R=lgQTNlink      :    502
MT=neut_R=lowRC          :   8644
MT=neut_R=neutral        : 312608
MT=neut_R=smQTNlink      :   3511
MT=smQTN_R=smQTNlink     :    639
MT=sweep_R=NearSS        :     53


Training set size after split: 6170
Testing set size: 2057


In [5]:
print(XH.keys())
labels = XH.keys()

dict_keys(['MT=delet_R=BS', 'MT=lgQTN_R=lgQTNlink', 'MT=neut_R=BS', 'MT=neut_R=FarSS', 'MT=neut_R=invers', 'MT=neut_R=lgQTNlink', 'MT=neut_R=lowRC', 'MT=neut_R=NearSS', 'MT=neut_R=neutral', 'MT=neut_R=smQTNlink', 'MT=smQTN_R=smQTNlink', 'MT=sweep_R=NearSS'])


In [6]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    scores = pd.DataFrame(grid_scores)
    top_scores = scores.sort_values(by = ['mean_test_score'], ascending = False).iloc[:n_top]
    
    for i in range(len(top_scores)):
        score = top_scores.iloc[i]
        print("Model with rank: {0}".format(i + 1))
        print("Mean test score: {0:.3f} (std: {1:.3f})".format(
              score.mean_test_score,
              score.std_test_score))
        print("Parameters: {0}".format(score.params))
        print("")

In [7]:
sys.stderr.write("Checking accuracy when distinguishing among all %s classes\n" %(len(XH.keys())))

maxMaxFeatures = len(X[0])
param_grid_forest = {"max_depth": [3, 10, None],
              "max_features": [1, 3, int(maxMaxFeatures**0.5), maxMaxFeatures],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

clf, mlType, paramGrid = ExtraTreesClassifier(n_estimators=100), "extraTreesClassifier", param_grid_forest

heatmap = []
sys.stderr.write("Training %s\n" %(mlType))
grid_search = GridSearchCV(clf, param_grid=param_grid_forest, cv=10,n_jobs=-1, return_train_score = False)
start = time()
grid_search.fit(X_train, y_train)
sys.stderr.write("GridSearchCV took %.2f seconds for %d candidate parameter settings.\n"
      % (time() - start, len(grid_search.cv_results_)))
print("Results for %s" %(mlType))
report(grid_search.cv_results_)
joblib.dump((X_test, y_test, grid_search, list(XH.keys())), classifierPickleFileName)

Checking accuracy when distinguishing among all 12 classes
Training extraTreesClassifier
GridSearchCV took 1096.20 seconds for 24 candidate parameter settings.


Results for extraTreesClassifier
Model with rank: 1
Mean test score: 0.552 (std: 0.016)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 22, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 2
Mean test score: 0.552 (std: 0.012)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 22, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 3
Mean test score: 0.551 (std: 0.011)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 22, 'min_samples_leaf': 3, 'min_samples_split': 2}



['all_sk-a_stats.p']

In [48]:
report(grid_search.cv_results_)

Model with rank: 1
Mean test score: 0.559 (std: 0.018)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}

Model with rank: 2
Mean test score: 0.557 (std: 0.018)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 3
Mean test score: 0.557 (std: 0.015)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 20, 'min_samples_leaf': 3, 'min_samples_split': 3}

