In [12]:
import os
import struct
import random
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import pydot
import pydotplus
import graphviz
import scipy
import scipy.stats as sts
from scipy.stats import norm
import numpy.random as nrnd
from sklearn import metrics
from keras import regularizers
from keras.layers import Dense, Activation, Input, Dropout, BatchNormalization
from keras.models import Sequential
from keras_tqdm import TQDMNotebookCallback as ktqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from statsmodels.distributions.empirical_distribution import ECDF
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix

## Reproducibility

In [2]:
# Single seed shared by every random number generator seeded in this cell.
SEED = 42

# NOTE(review): PYTHONHASHSEED is read by the interpreter at start-up, so
# setting it here only reaches child processes. To fix hashing for this
# kernel, export the variable before launching Jupyter.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(SEED)
random.seed(SEED)
# TensorFlow 1.x exposes `tf.set_random_seed`; TensorFlow 2.x moved it to
# `tf.random.set_seed`. Support both so the cell runs on either version.
if hasattr(tf, 'set_random_seed'):
    tf.set_random_seed(SEED)
else:
    tf.random.set_seed(SEED)

## Reading csv files

In [3]:
# Root folder of the dataset. It defaults to the original location and can be
# overridden with the EPIGENOMIC_DATA_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get("EPIGENOMIC_DATA_DIR", "/home/tore/venv/homes.di.unimi.it")
# Base name shared by the three CSV files.
DATASET = "GM12878"

path_classes = os.path.join(DATA_DIR, "classes", DATASET + ".csv")
path_epigenomic = os.path.join(DATA_DIR, "epigenomic-data", DATASET + ".csv")
path_sequences = os.path.join(DATA_DIR, "sequences", DATASET + ".csv")

# `names=` supplies the column label for the classes file, so every row of
# that file is read as data and stored in the "Titles" column.
classes = pd.read_csv(path_classes, names=["Titles"])
# The epigenomic file is read with its own header row.
epigenomic = pd.read_csv(path_epigenomic, sep=",")

In [4]:
# Put the class labels next to the epigenomic columns (index-aligned join),
# then discard the "Unnamed: 0" column that came in with the epigenomic CSV.
data = classes.join(epigenomic).drop(columns="Unnamed: 0")

In [5]:
# Keep only the rows labelled "A-P" or "I-P"; every other class is discarded.
data = data[data["Titles"].isin(["A-P", "I-P"])]

In [6]:
# One frame per class, followed by the number of rows in each.
ap = data.loc[data["Titles"] == "A-P"]
ip = data.loc[data["Titles"] == "I-P"]
ap.shape[0], ip.shape[0]

(10816, 73891)

## Creating Training and Testing Sets

    Since I-Ps are roughly seven times more numerous than A-Ps, a fairer way to use the data is needed.
    Ideas:
        > Under-sampling
        > Keeping training ratio 7:1
        
    Use numpy to convert each row of the frame into an np.array
    
    The dataframe has shape (10816, 102), so, concerning the structure of the network:
        > 101 input neurons, because column 0 holds the labels
        > 1 output neuron
        
### Under-sampling attempt

In [59]:
# Under-sample the I-P class: shuffle its rows and keep as many as there are
# A-P rows. A fixed random_state makes the selected sample identical every
# time this cell is run, instead of depending on how much of the global NumPy
# random stream has already been consumed.
ip_under = shuffle(ip, random_state=42)[:len(ap)]

In [60]:
# NumPy array versions of the A-P rows and of the under-sampled I-P rows.
ap_arr = ap.values
ip_arr = ip_under.values

    A-Ps and I-Ps should now be concatenated in order to obtain the full training and testing set

In [63]:
# Split each class frame into features (every column except "Titles") and
# labels (the "Titles" column, kept 2-D with shape (n_rows, 1)).
#
# The label column is dropped BEFORE converting to NumPy. Converting the whole
# frame first mixes the string labels with the numeric columns and yields an
# object-dtype array; dropping first lets the features keep a numeric dtype
# when the remaining columns are numeric.
# NOTE(review): assumes all non-"Titles" columns are numeric - confirm.
# Otherwise the result is an object array, as before.
x_ip = ip_under.drop(columns="Titles").values
y_ip = ip_under[["Titles"]].values
x_ap = ap.drop(columns="Titles").values
y_ap = ap[["Titles"]].values

In [66]:
# Stack the A-P rows on top of the under-sampled I-P rows, for features and
# labels alike, so that row i of x_set still matches row i of y_set.
x_set = np.concatenate([x_ap, x_ip], axis=0)
y_set = np.concatenate([y_ap, y_ip], axis=0)

In [67]:
# Hold out 30% of the rows for testing. Rows are shuffled before splitting,
# with a fixed random_state so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_set, y_set, shuffle=True, random_state=42, test_size=0.3
)

In [70]:
# Show the shapes of the four arrays produced by the split.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((15142, 101), (6490, 101), (15142, 1), (6490, 1))

### 7:1 Ratio attempt