In [2]:
# Compare Algorithms
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
#from mlxtend.plotting import plot_confusion_matrix
from sklearn.utils import shuffle
import warnings
from playsound import playsound
warnings.filterwarnings("ignore", category=FutureWarning)

# Set seed value
np.random.seed(42)

# Read data: a space-separated table with one 'usersid' column, the SNP
# feature columns, and a 'Class' label column.
data_file = 'encoded_snps_final.csv'
data = pd.read_csv(data_file, sep = ' ')
data = data.drop('usersid', axis=1)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): missing values are filled with the *string* "0" (kept as in the
# original) -- confirm whether a numeric 0 was intended.
data = data.replace(np.nan, "0")
# Every column except the label is a feature. Selecting by name instead of
# slicing off the last column keeps 'Class' out of X even if the column order
# of the CSV changes.
snp = data.columns.drop('Class').values
print(snp)
print(snp.shape)
X = data[snp]#.values
Y = data['Class']#.values
print("DONE Reading Data")
playsound('beep-07.mp3')

['rs1000000' 'rs10000023' 'rs1000007' ... 'rs9999853' 'rs9999944'
 'rs999995']
(190655,)
DONE Reading Data


In [116]:
# Training with Random Forest Feature Selection

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
import sys

#X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3)

#scaler = StandardScaler()
#scaler.fit(X_train.fillna(0))

# Minimum random-forest importance a SNP needs in order to be kept.
IMPORTANCE_THRESHOLD = 0.0009

# NOTE(review): the selector is fitted on ALL rows of X/Y. X_selected is later
# cross-validated, so the held-out folds have already influenced which
# features were chosen -- the resulting scores are likely optimistic.
# random_state is fixed so the selected set does not depend on how many random
# draws earlier cell executions consumed from the global NumPy generator.
sel_ = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold=IMPORTANCE_THRESHOLD)
sel_.fit(X, Y)
#print(sel_.get_support())

np.set_printoptions(threshold=sys.maxsize)
f_importance = sel_.estimator_.feature_importances_

# Use the selector's own boolean mask for both the names and the importances,
# so the two columns written to the CSV are aligned by construction instead of
# relying on a second, hand-written copy of the threshold test.
support_mask = sel_.get_support()
selected_feat = X.columns[support_mask]
selected_f_importance = f_importance[support_mask].tolist()

print(len(selected_f_importance))
print(len(selected_feat))

print('total features: {}'.format((X.shape[1])))
print('selected features: {}'.format(len(selected_feat)))


f_df = pd.DataFrame({"feature" : selected_feat, "importance" : selected_f_importance})
f_df.to_csv("features_with_importance3.csv", index=False)
print("done")
#X.to_csv('features_snps.csv', columns=selected_feat)

# Reduced feature matrix containing only the selected columns.
X_selected = sel_.transform(X)



318
318
total features: 190655
selected features: 318
done


In [121]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import pickle
import sys

np.set_printoptions(threshold=sys.maxsize)

# Per-fold metrics; one value is appended for every CV split.
accuracies = []
recalls = []
precisions = []

# KFold yields *positional* indices. Indexing a pandas Series with them via
# Y[...] is label-based and only works for a default 0..n-1 index, so the
# labels are converted to a plain array once and indexed positionally.
y_all = np.asarray(Y)

# KFold Cross Validation approach
# NOTE(review): X_selected comes from a selector fitted on all rows, so these
# folds are not fully independent of the feature-selection step.
kf = KFold(n_splits=10,shuffle=False)

# Iterate over each train-test split
for train_index, test_index in kf.split(X_selected):

    # Split train-test
    X_train, X_test = X_selected[train_index], X_selected[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    #model = SVC(kernel='linear', C=1, tol=0.002)
    #model = KNeighborsClassifier(n_neighbors=5)
    #model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=12)
    #model = LogisticRegression(penalty='l2')
    # random_state is fixed so fold scores are repeatable across re-runs.
    model = RandomForestClassifier(n_estimators=1000, max_depth=5, max_features=100,
                                   random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions already computed instead of calling model.score,
    # which would run predict on X_test a second time.
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average="macro"))
    recalls.append(recall_score(y_test, y_pred, average="macro"))

# Unweighted mean of each metric over the folds.
print("Acc:")
print(sum(accuracies)/len(accuracies))
print("Precision:")
print(sum(precisions)/len(precisions))
print("Recall:")
print(sum(recalls)/len(recalls))
print("DONE Training")


# Save to file in the current working directory
# NOTE(review): `model` is the estimator fitted on the LAST fold's training
# split only, not a model refit on all rows -- confirm this is what should be
# persisted. The "pickles" directory must already exist.
with open("pickles/pickle_model15.pkl", 'wb') as file:
    pickle.dump(model, file)
    print("DONE Saving the model")

playsound('beep-07.mp3')

Acc:
0.8833333333333334
Precision:
0.8855555555555557
Recall:
0.8949999999999999
DONE Training
DONE Saving the model


In [None]:
# Training without Feature Selection

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#model = KNeighborsClassifier(n_neighbors=5)
#model = SVC(kernel='linear', C=1, tol=0.002)
#model = RandomForestClassifier(n_estimators=1000, max_depth=5, max_features=100)
#model = LogisticRegression(penalty='l2')
# random_state is fixed so the reported scores are repeatable across re-runs.
model = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=12,
                               random_state=42)
# 10-fold cross-validation on the full (unselected) feature matrix, collecting
# four macro-averaged metrics per fold.
scores = cross_validate(model, X, Y, cv=10, scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'))

# Mean of each metric over the folds. The F1 mean is deliberately NOT stored
# in a variable called `f1_score`: that name is the sklearn.metrics function
# imported elsewhere in this notebook, and rebinding it to a float would break
# any later call to it in the same kernel session.
f1_macro = scores['test_f1_macro'].mean()
acc = scores['test_accuracy'].mean()
recall = scores['test_recall_macro'].mean()
precision = scores['test_precision_macro'].mean()

print("F1:")
print(f1_macro)
print()
print("accuracy:")
print(acc)
print()
print("recall:")
print(recall)
print()
print("precision:")
print(precision)
print()
playsound('beep-07.mp3')

In [None]:
# Report Best Hyper-parameters 

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

start = datetime.now()

# Split the dataset in two parts: 75% development set, 25% evaluation set,
# stratified on the class label.
# random_state pins the split; without it the partition depends on how many
# random draws earlier cell executions consumed from the global generator.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=42)

# Determine best hyperparameters based on accuracy
grid_param = {
    'n_estimators': [50, 100, 300, 500, 800, 1000],
    'max_depth': [3, 5, 10, 15],
    'max_features': [50, 100, 200, 300, 400]
}

# The estimator gets an explicit random_state: with n_jobs=-1 the fits may run
# in worker processes, which do not inherit the seed set through
# np.random.seed in this kernel, so results would otherwise vary between runs.
gd_sr = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=10,
                     n_jobs=-1)

gd_sr.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(gd_sr.best_params_)
print()

print("Mean cross-validated score of the best_estimator")
print(gd_sr.best_score_)
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# Evaluate the search object's predictions on the held-out 25%.
y_true, y_pred = y_test, gd_sr.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print('Run-time', datetime.now() - start) 
playsound('beep-07.mp3')

In [None]:
import numbers
def plot_grid_search_validation_curve(grid, param_to_vary,
                                      title='Validation Curve', ylim=None,
                                      xlim=None, log=None):
    """Plot train and cross-validation scores along one grid-search parameter.

    All other searched parameters are held at their values from
    ``grid.best_params_``; the figure is written to
    ``<param_to_vary>_v2.png`` in the current working directory.

    Parameters
    ----------
    grid : object
        Must expose ``cv_results_`` (including the ``mean_train_score`` /
        ``std_train_score`` columns), ``param_grid`` (a single mapping from
        parameter name to its list of values) and ``best_params_``.
    param_to_vary : str
        Name of the searched parameter to put on the x axis.
    title : str
        Figure title.
    ylim, xlim : sequence or None
        Axis limits, unpacked into ``plt.ylim`` / ``plt.xlim``. When ``ylim``
        is None the y axis is set to (0.0, 1.1); when ``xlim`` is None the
        x limits are left untouched.
    log : truthy or None
        If truthy, draw the curves with ``plt.semilogx`` instead of
        ``plt.plot``.
    """
    results = pd.DataFrame(grid.cv_results_)

    # Pull the four score columns first so a missing train-score column is
    # reported before any further processing.
    score_columns = ('mean_train_score', 'mean_test_score',
                     'std_train_score', 'std_test_score')
    raw_scores = [results[column] for column in score_columns]

    # One axis per searched parameter, in the column order of cv_results_.
    prefix = 'param_'
    param_cols = [col for col in results.columns if col.startswith(prefix)]
    param_ranges = [grid.param_grid[col[len(prefix):]] for col in param_cols]
    grid_shape = [len(values) for values in param_ranges]

    # Fold each flat score column into an N-d array with one axis per parameter.
    score_grids = [np.array(series).reshape(*grid_shape) for series in raw_scores]

    vary_axis = param_cols.index('param_{}'.format(param_to_vary))

    # Build an index that keeps the full varied axis and pins every other axis
    # to the position of that parameter's best value.
    selector = []
    for axis, name in enumerate(grid.best_params_):
        if axis == vary_axis:
            selector.append(slice(None))
        else:
            best_value = grid.best_params_[name]
            candidates = param_ranges[axis]
            if isinstance(candidates, np.ndarray):
                candidates = candidates.tolist()
            selector.append(candidates.index(best_value))
    selector = tuple(selector)

    train_mean, valid_mean, train_std, valid_std = (
        scores[selector] for scores in score_grids)

    plt.clf()

    plt.title(title)
    plt.xlabel(param_to_vary)
    plt.ylabel('Score')

    plt.ylim(*((0.0, 1.1) if ylim is None else ylim))
    if xlim is not None:
        plt.xlim(*xlim)

    line_width = 2
    draw_line = plt.semilogx if log else plt.plot

    # Non-numeric parameter values are plotted by their string form.
    x_values = param_ranges[vary_axis]
    if not isinstance(x_values[0], numbers.Number):
        x_values = [str(value) for value in x_values]

    curves = (
        (train_mean, train_std, 'Training score', 'r'),
        (valid_mean, valid_std, 'Cross-validation score', 'b'),
    )
    for mean, std, label, colour in curves:
        draw_line(x_values, mean, label=label, color=colour, lw=line_width)
        # Shaded band of one standard deviation either side of the mean.
        plt.fill_between(x_values, mean - std, mean + std, alpha=0.1,
                         color=colour, lw=line_width)

    plt.legend(loc='lower right')

    #plt.show()
    plt.savefig(param_to_vary+'_v2.png', dpi=300)

# Determine best hyperparameters based on accuracy
grid_param = {
    'criterion': ['entropy', 'gini'],
    'splitter': ['best', 'random'],
    'max_depth': [4,6,8,12]
}
# Split the dataset in two parts
# random_state pins the split so it does not depend on how many random draws
# earlier cell executions consumed from the global generator.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=42)
# The estimator gets an explicit random_state: the grid includes
# splitter='random', and with n_jobs=-1 the fits may run in worker processes
# that do not inherit the seed set through np.random.seed in this kernel.
# return_train_score=True is required by the plotting helper, which reads the
# mean/std train-score columns of cv_results_.
gd_sr = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=10,
                     n_jobs=-1,
                    return_train_score=True)
gd_sr.fit(X_train, y_train)
df_cv_results = pd.DataFrame(gd_sr.cv_results_)
print(df_cv_results)
# One validation curve per searched parameter; each call saves its own PNG.
plot_grid_search_validation_curve(gd_sr, 'criterion')
plot_grid_search_validation_curve(gd_sr, 'splitter')
plot_grid_search_validation_curve(gd_sr, 'max_depth')